From 32ee2c7dffde3210e2c3b183f5f2fefd3a49af23 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 15 Oct 2012 17:04:29 -0400 Subject: [PATCH 01/59] Refactored the compression interface per sample in ReduceReadsa The CompressionStash is now responsible for keeping track of all intervals that must be kept uncompressed by all samples. In general this is a list generated by a tumor sample that will enforce all normal samples to abide. - Updated ReduceReads integration tests - Sliding Window is now using the CompressionStash (single sample). DEV-104 #resolve #time 3m --- .../reducereads/CompressionStash.java | 21 +++++++ .../reducereads/MultiSampleCompressor.java | 5 +- .../compression/reducereads/ReduceReads.java | 4 +- .../reducereads/SingleSampleCompressor.java | 7 ++- .../reducereads/SlidingWindow.java | 55 ++++++++++--------- .../ReduceReadsIntegrationTest.java | 30 +++++----- .../reducereads/SimpleGenomeLoc.java | 30 ++++++++++ 7 files changed, 107 insertions(+), 45 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java new file mode 100644 index 000000000..714a4df18 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import org.broadinstitute.sting.utils.GenomeLocComparator; + +import java.util.TreeSet; + +/** + * A stash of regions that must be kept uncompressed in all samples + * + * In general, these are regions that were kept uncompressed by a tumor sample and we want to force + * all other samples (normals and/or tumors) to also keep these regions uncompressed + * + * User: carneiro + * Date: 10/15/12 + * Time: 4:08 PM + */ +public class CompressionStash extends TreeSet { + public CompressionStash() { + super(new GenomeLocComparator()); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 7c9fc101b..2c3439010 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -55,11 +55,12 @@ public class MultiSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction) { + final boolean allowPolyploidReduction, + final CompressionStash compressionStash) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction)); + minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction, compressionStash)); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 5810bc94f..b6761f4a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -222,6 +222,8 @@ public class ReduceReads extends ReadWalker, ReduceRea HashMap readNameHash; // This hash will keep the name of the original read the new compressed name (a number). Long nextReadNumber = 1L; // The next number to use for the compressed read name. + CompressionStash compressionStash = new CompressionStash(); + SortedSet intervalList; private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag @@ -328,7 +330,7 @@ public class ReduceReads extends ReadWalker, ReduceRea */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION)); + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION, compressionStash)); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index 6a086c53b..82a433300 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -20,6 +20,7 @@ public class SingleSampleCompressor implements Compressor { final private ReduceReads.DownsampleStrategy downsampleStrategy; final private int nContigs; final private boolean allowPolyploidReduction; + final CompressionStash compressionStash; private SlidingWindow slidingWindow; private int slidingWindowCounter; @@ -33,7 +34,8 @@ public class SingleSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction) { + final boolean allowPolyploidReduction, + final CompressionStash compressionStash) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; this.minMappingQuality = minMappingQuality; @@ -44,6 +46,7 @@ public class SingleSampleCompressor implements Compressor { this.downsampleStrategy = downsampleStrategy; this.nContigs = nContigs; this.allowPolyploidReduction = allowPolyploidReduction; + this.compressionStash = compressionStash; } /** @@ -65,7 +68,7 @@ public class SingleSampleCompressor implements Compressor { } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction, compressionStash); slidingWindowCounter++; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 6fdf85317..22c40c542 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -6,7 +6,6 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; @@ -56,6 +55,7 @@ public class SlidingWindow { private final int nContigs; private boolean allowPolyploidReductionInGeneral; + private CompressionStash compressionStash; /** * The types of synthetic reads to use in the finalizeAndAdd method @@ -87,7 +87,7 @@ public class SlidingWindow { } - public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) { + public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction, CompressionStash compressionStash) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; @@ -118,6 +118,7 @@ public class SlidingWindow { this.nContigs = nContigs; this.allowPolyploidReductionInGeneral = allowPolyploidReduction; + this.compressionStash = compressionStash; } /** @@ -145,7 +146,7 @@ public class SlidingWindow { * @param variantSite boolean array with true marking variant regions * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. */ - private Pair getNextVariantRegion(int from, int to, boolean[] variantSite) { + private SimpleGenomeLoc getNextVariantRegion(int from, int to, boolean[] variantSite) { boolean foundStart = false; int variantRegionStartIndex = 0; for (int i=from; i(variantRegionStartIndex, i-1)); + return(new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, i-1, true)); } } - return (foundStart) ? new Pair(variantRegionStartIndex, -1) : null; + return (foundStart) ? new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, to-1, false) : null; } /** @@ -168,23 +169,22 @@ public class SlidingWindow { * @param variantSite boolean array with true marking variant regions * @return a list with start/stops of variant regions following getNextVariantRegion description */ - private List> getAllVariantRegions(int from, int to, boolean[] variantSite) { - List> regions = new LinkedList>(); + private CompressionStash getVariantRegionsFromThisSample(int from, int to, boolean[] variantSite) { + CompressionStash regions = new CompressionStash(); int index = from; while(index < to) { - Pair result = getNextVariantRegion(index, to, variantSite); + SimpleGenomeLoc result = getNextVariantRegion(index, to, variantSite); if (result == null) break; regions.add(result); - if (result.getSecond() < 0) + if (result.getStop() < 0) break; - index = result.getSecond() + 1; + index = result.getStop() + 1; } return regions; } - /** * Determines if the window can be slid given the new incoming read. * @@ -203,7 +203,7 @@ public class SlidingWindow { boolean[] variantSite = markSites(getStartLocation(windowHeader) + readStartHeaderIndex); int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) - List> regions = getAllVariantRegions(0, breakpoint, variantSite); + CompressionStash regions = getVariantRegionsFromThisSample(0, breakpoint, variantSite); finalizedReads = closeVariantRegions(regions, false); List readsToRemove = new LinkedList(); @@ -567,26 +567,31 @@ public class SlidingWindow { result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); - return result; // finalized reads will be downsampled if necessary + return result; // finalized reads will be downsampled if necessary } - private List closeVariantRegions(List> regions, boolean forceClose) { + private List closeVariantRegions(CompressionStash regions, boolean forceClose) { List allReads = new LinkedList(); if (!regions.isEmpty()) { int lastStop = -1; - for (Pair region : regions) { - int start = region.getFirst(); - int stop = region.getSecond(); - if (stop < 0 && forceClose) - stop = windowHeader.size() - 1; - if (stop >= 0) { - allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); - lastStop = stop; + for (SimpleGenomeLoc region : regions) { + int start = region.getStart(); + int stop = region.getStop(); + + if (!region.isFinished()) { + if(forceClose) // region is unfinished but we're forcing the close of this window + stop = windowHeader.size() - 1; + else + continue; // region is unfinished and we're not forcing the close of this window } + + allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); + lastStop = stop; } - for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) - windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here! + + for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) + windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here! } return allReads; } @@ -626,7 +631,7 @@ public class SlidingWindow { if (!windowHeader.isEmpty()) { boolean[] variantSite = markSites(getStopLocation(windowHeader) + 1); - List> regions = getAllVariantRegions(0, windowHeader.size(), variantSite); + CompressionStash regions = getVariantRegionsFromThisSample(0, windowHeader.size(), variantSite); finalizedReads = closeVariantRegions(regions, true); if (!windowHeader.isEmpty()) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index db8ea4eb8..89f251ed4 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -21,36 +21,36 @@ public class ReduceReadsIntegrationTest extends WalkerTest { executeTest(testName, spec); } - @Test(enabled = false) + @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f"); + RRTest("testDefaultCompression ", L, "1f95f3193bd9f120a73c34a0087abaf6"); } - @Test(enabled = false) + @Test(enabled = true) public void testMultipleIntervals() { String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76"); + RRTest("testMultipleIntervals ", intervals, "79213d6ac68d56d4d72dcf511223e424"); } - @Test(enabled = false) + @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047"); + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "dab2aa8e3655139974bbe12a568363d9"); } - @Test(enabled = false) + @Test(enabled = true) public void testLowCompression() { RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63"); } - @Test(enabled = false) + @Test(enabled = true) public void testIndelCompression() { - RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973"); + RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "1255245ed4ebeacda90f0dbb4e4da081"); } - @Test(enabled = false) + @Test(enabled = true) public void testFilteredDeletionCompression() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb"))); + executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("122e4e60c4412a31d0aeb3cce879e841"))); } /** @@ -61,20 +61,20 @@ public class ReduceReadsIntegrationTest extends WalkerTest { * * This bam is simplified to replicate the exact bug with the three provided intervals. */ - @Test(enabled = false) + @Test(enabled = true) public void testAddingReadAfterTailingTheStash() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6"))); + executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("4b590269cbe3574dbdd5bdc2bc6f5f1c"))); } /** * Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get * filtered out. */ - @Test(enabled = false) + @Test(enabled = true) public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; - executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2"))); + executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("d8d066304f7c187f182bfb50f39baa0c"))); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java new file mode 100644 index 000000000..45e105751 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import org.broadinstitute.sting.utils.GenomeLoc; + +/** + * GenomeLocs are very useful objects to keep track of genomic locations and perform set operations + * with them. + * + * However, GenomeLocs are bound to strict validation through the GenomeLocParser and cannot + * be created easily for small tasks that do not require the rigors of the GenomeLocParser validation + * + * SimpleGenomeLoc is a simple utility to create GenomeLocs without going through the parser. Should + * only be used outside of the engine. + * + * User: carneiro + * Date: 10/16/12 + * Time: 2:07 PM + */ +public class SimpleGenomeLoc extends GenomeLoc { + private boolean finished; + + public SimpleGenomeLoc(String contigName, int contigIndex, int start, int stop, boolean finished) { + super(contigName, contigIndex, start, stop); + this.finished = finished; + } + + public boolean isFinished() { + return finished; + } +} From 55ac4ba70bfe28cdde85b3efc002813e857ca88e Mon Sep 17 00:00:00 2001 From: kshakir Date: Wed, 17 Oct 2012 19:39:03 -0400 Subject: [PATCH 02/59] Added another utility that can convert to RemoteFiles. QScripts will now generate remote versions of files if the caller has not already passed in remote versions (or the QScript replaces the passed in remote references... not good) Instead of having yet another plugin, combined QStatusMessenger and RemoteFileConverter under general QCommandPlugin trait. --- .../sting/queue/QCommandLine.scala | 53 ++++++++++++------- .../sting/queue/QCommandPlugin.scala | 9 ++++ .../broadinstitute/sting/queue/QScript.scala | 25 +++++++++ .../sting/queue/util/ClassFieldCache.scala | 11 ++++ .../queue/util/RemoteFileConverter.scala | 21 ++++++++ 5 files changed, 100 insertions(+), 19 deletions(-) create mode 100644 public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala create mode 100644 public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 5b84bfd16..2afa66d9c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -24,7 +24,6 @@ package org.broadinstitute.sting.queue -import function.QFunction import java.io.File import org.broadinstitute.sting.commandline._ import org.broadinstitute.sting.queue.util._ @@ -96,18 +95,18 @@ class QCommandLine extends CommandLineProgram with Logging { new PluginManager[QScript](classOf[QScript], Seq(qScriptClasses.toURI.toURL)) } - private lazy val qStatusMessengerPluginManager = { - new PluginManager[QStatusMessenger](classOf[QStatusMessenger]) + private lazy val qCommandPlugin = { + new PluginManager[QCommandPlugin](classOf[QCommandPlugin]) } - ClassFieldCache.parsingEngine = new ParsingEngine(this) - /** * Takes the QScripts passed in, runs their script() methods, retrieves their generated * functions, and then builds and runs a QGraph based on the dependencies. */ def execute = { - val allStatusMessengers = qStatusMessengerPluginManager.createAllTypes() + ClassFieldCache.parsingEngine = this.parser + + val allCommandPlugins = qCommandPlugin.createAllTypes() if (settings.qSettings.runName == null) settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) @@ -115,14 +114,24 @@ class QCommandLine extends CommandLineProgram with Logging { settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp") qGraph.initializeWithSettings(settings) - for (statusMessenger <- allStatusMessengers) { - loadArgumentsIntoObject(statusMessenger) + for (commandPlugin <- allCommandPlugins) { + loadArgumentsIntoObject(commandPlugin) } - for (statusMessenger <- allStatusMessengers) { - statusMessenger.started() + for (commandPlugin <- allCommandPlugins) { + if (commandPlugin.statusMessenger != null) + commandPlugin.statusMessenger.started() } + // TODO: Default command plugin argument? + val remoteFileConverter = ( + for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null)) + yield commandPlugin.remoteFileConverter + ).headOption.getOrElse(null) + + if (remoteFileConverter != null) + loadArgumentsIntoObject(remoteFileConverter) + val allQScripts = qScriptPluginManager.createAllTypes() for (script <- allQScripts) { logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) @@ -137,10 +146,15 @@ class QCommandLine extends CommandLineProgram with Logging { case e: Exception => throw new UserException.CannotExecuteQScript(script.getClass.getSimpleName + ".script() threw the following exception: " + e, e) } + + if (remoteFileConverter != null) { + if (remoteFileConverter.convertToRemoteEnabled) + script.mkRemoteOutputs(remoteFileConverter) + } + script.functions.foreach(qGraph.add(_)) logger.info("Added " + script.functions.size + " functions") } - // Execute the job graph qGraph.run() @@ -162,14 +176,16 @@ class QCommandLine extends CommandLineProgram with Logging { if (!success) { logger.info("Done with errors") qGraph.logFailed() - for (statusMessenger <- allStatusMessengers) - statusMessenger.exit("Done with errors") + for (commandPlugin <- allCommandPlugins) + if (commandPlugin.statusMessenger != null) + commandPlugin.statusMessenger.exit("Done with errors") 1 } else { if (settings.run) { allQScripts.foreach(_.pushOutputs()) - for (statusMessenger <- allStatusMessengers) - statusMessenger.done(allQScripts.map(_.remoteOutputs)) + for (commandPlugin <- allCommandPlugins) + if (commandPlugin.statusMessenger != null) + commandPlugin.statusMessenger.done(allQScripts.map(_.remoteOutputs)) } 0 } @@ -189,7 +205,7 @@ class QCommandLine extends CommandLineProgram with Logging { override def getArgumentSources = { var plugins = Seq.empty[Class[_]] plugins ++= qScriptPluginManager.getPlugins - plugins ++= qStatusMessengerPluginManager.getPlugins + plugins ++= qCommandPlugin.getPlugins plugins.toArray } @@ -200,11 +216,10 @@ class QCommandLine extends CommandLineProgram with Logging { override def getArgumentSourceName(source: Class[_]) = { if (classOf[QScript].isAssignableFrom(source)) qScriptPluginManager.getName(source.asSubclass(classOf[QScript])) - else if (classOf[QStatusMessenger].isAssignableFrom(source)) - qStatusMessengerPluginManager.getName(source.asSubclass(classOf[QStatusMessenger])) + else if (classOf[QCommandPlugin].isAssignableFrom(source)) + qCommandPlugin.getName(source.asSubclass(classOf[QCommandPlugin])) else null - } /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala new file mode 100644 index 000000000..499c31554 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala @@ -0,0 +1,9 @@ +package org.broadinstitute.sting.queue + +import engine.QStatusMessenger +import util.RemoteFileConverter + +trait QCommandPlugin { + def statusMessenger: QStatusMessenger = null + def remoteFileConverter: RemoteFileConverter = null +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index 2dcfb916c..3df61b1e3 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -108,6 +108,24 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon functions.foreach( f => add(f) ) } + /** + * Convert all @Output files to remote output files. + * @param remoteFileConverter Converter for files to remote files. + */ + def mkRemoteOutputs(remoteFileConverter: RemoteFileConverter) { + for (field <- outputFields) { + val fieldFile = ClassFieldCache.getFieldFile(this, field) + if (fieldFile != null && !fieldFile.isInstanceOf[RemoteFile]) { + val fieldName = ClassFieldCache.fullName(field) + val remoteFile = remoteFileConverter.convertToRemote(fieldFile, fieldName) + ClassFieldCache.setFieldValue(this, field, remoteFile) + } + } + } + + /** + * Pull all remote files to the local disk. + */ def pullInputs() { val inputs = ClassFieldCache.getFieldFiles(this, inputFields) for (remoteFile <- filterRemoteFiles(inputs)) { @@ -116,6 +134,9 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon } } + /** + * Push all remote files from the local disk. + */ def pushOutputs() { val outputs = ClassFieldCache.getFieldFiles(this, outputFields) for (remoteFile <- filterRemoteFiles(outputs)) { @@ -124,6 +145,10 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon } } + /** + * List out the remote outputs + * @return the RemoteFile outputs by argument source + */ def remoteOutputs: Map[ArgumentSource, Seq[RemoteFile]] = outputFields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala index 870dd5617..ae3db6860 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala @@ -180,4 +180,15 @@ object ClassFieldCache { case unknown => throw new QException("Non-file found. Try removing the annotation, change the annotation to @Argument, or extend File with FileExtension: %s: %s".format(field.field, unknown)) } + + // + // other utilities + // + + /** + * Retrieves the fullName of the argument + * @param field ArgumentSource to check + * @return Full name of the argument source + */ + def fullName(field: ArgumentSource) = field.createArgumentDefinitions().get(0).fullName } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala new file mode 100644 index 000000000..c77c242d0 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.queue.util + +import java.io.File + +trait RemoteFileConverter { + type RemoteFileType <: RemoteFile + + /** + * If this remote file creator is capable of converting to a remote file. + * @return true if ready to convert + */ + def convertToRemoteEnabled: Boolean + + /** + * Converts to a remote file + * @param file The original file + * @param name A "name" to use for the remote file + * @return The new version of this remote file. + */ + def convertToRemote(file: File, name: String): RemoteFileType +} From 7860ff7981510b67e6f4af4aeff73ef19ee2f245 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 22 Oct 2012 19:59:15 -0400 Subject: [PATCH 10/59] a) Resolve [#DEV-56] - test data with indels in new directory private/testdata/CMITestData/. b) Skeleton (not yet working) of fastq-BAM unit test, c) misc bug fixes for QC functions to work (not done yet) --- .../picard/CollectGcBiasMetrics.scala | 5 +- .../picard/CollectMultipleMetrics.scala | 4 +- .../picard/PicardMetricsFunction.scala | 53 +++++++++++++++++++ 3 files changed, 58 insertions(+), 4 deletions(-) create mode 100644 public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala index e783422b3..cba6ce2bb 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala @@ -11,8 +11,8 @@ import java.io.File * To change this template use File | Settings | File Templates. */ class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction { - analysisName = "CalculateGcMetrics" - javaMainClass = "net.sf.picard.analysis.GcBiasSummaryMetrics" + analysisName = "CollectGcBiasMetrics" + javaMainClass = "net.sf.picard.analysis.CollectGcBiasMetrics" @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) var input: Seq[File] = Nil @@ -26,6 +26,7 @@ class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaC override def inputBams = input override def outputFile = output override def commandLine = super.commandLine + + required("SUMMARY_OUTPUT=" + output) + required("CHART_OUTPUT=" + output+".pdf") + required("REFERENCE_SEQUENCE=" + reference) + required("ASSUME_SORTED=true") diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala index eab6c79ce..b9af5258c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala @@ -11,8 +11,8 @@ import java.io.File * To change this template use File | Settings | File Templates. */ class CollectMultipleMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction{ - analysisName = "CalculateMultipleMetrics" - javaMainClass = "net.sf.picard.analysis.CalculateMultipleMetrics" + analysisName = "CollectMultipleMetrics" + javaMainClass = "net.sf.picard.analysis.CollectMultipleMetrics" @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) var input: Seq[File] = Nil diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala new file mode 100644 index 000000000..89169e972 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.extensions.picard + +import java.io.File +import org.broadinstitute.sting.queue.function.JavaCommandLineFunction +import net.sf.samtools.SAMFileReader.ValidationStringency +import net.sf.samtools.SAMFileHeader.SortOrder + +/** + * Wraps a Picard function that operates on BAM files but doesn't output a new BAM file (i.e. QC metric files). + * See http://picard.sourceforge.net/ for more info. + * + * Since the various BAM utilities take slightly different arguments + * some values are optional. + */ +trait PicardMetricsFunction extends JavaCommandLineFunction { + var validationStringency = ValidationStringency.SILENT + var maxRecordsInRam: Option[Int] = None + var assumeSorted: Option[Boolean] = None + protected def inputBams: Seq[File] + protected def outputFile: File + + abstract override def commandLine = super.commandLine + + repeat("INPUT=", inputBams, spaceSeparated=false) + + required("TMP_DIR=" + jobTempDir) + + optional("VALIDATION_STRINGENCY=", validationStringency, spaceSeparated=false) + + optional("OUTPUT=", outputFile, spaceSeparated=false) + + optional("MAX_RECORDS_IN_RAM=", maxRecordsInRam, spaceSeparated=false) + + optional("ASSUME_SORTED=", assumeSorted, spaceSeparated=false) +} From bbf7a0fb091937691551a686171266852dffccca Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 22 Oct 2012 16:09:49 -0400 Subject: [PATCH 12/59] Adding integration test to ReduceReads coreduction DEV-117 #resolve --- .../reducereads/ReduceReadsIntegrationTest.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 89f251ed4..73b7025c3 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -14,6 +14,9 @@ public class ReduceReadsIntegrationTest extends WalkerTest { final String DIVIDEBYZERO_BAM = validationDataLocation + "ReduceReadsDivideByZeroBug.bam"; final String DIVIDEBYZERO_L = " -L " + validationDataLocation + "ReduceReadsDivideByZeroBug.intervals"; final String L = " -L 20:10,100,000-10,120,000 "; + final String COREDUCTION_BAM_A = validationDataLocation + "coreduction.test.A.bam"; + final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam"; + final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057"; private void RRTest(String testName, String args, String md5) { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s "; @@ -77,5 +80,11 @@ public class ReduceReadsIntegrationTest extends WalkerTest { executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("d8d066304f7c187f182bfb50f39baa0c"))); } + @Test(enabled = true) + public void testCoReduction() { + String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList(""))); + } + } From 4cd1a923587779b132a3e76835c7aa5cf79c0fe4 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 23 Oct 2012 11:26:26 -0400 Subject: [PATCH 13/59] Updating RR integration tests Forgot to update the integration tests after merging DEV-117 with optimizations from GATK main repo. --- .../reducereads/ReduceReadsIntegrationTest.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 73b7025c3..50500536f 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -26,28 +26,28 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "1f95f3193bd9f120a73c34a0087abaf6"); + RRTest("testDefaultCompression ", L, "46ea88e32bae3072f5cd68a0db4b55f1"); } @Test(enabled = true) public void testMultipleIntervals() { String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - RRTest("testMultipleIntervals ", intervals, "79213d6ac68d56d4d72dcf511223e424"); + RRTest("testMultipleIntervals ", intervals, "c3784a0b42f5456b705f9b152a4b697a"); } @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "dab2aa8e3655139974bbe12a568363d9"); + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e385eb0ae5768f8507671d5303a212d5"); } @Test(enabled = true) public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63"); + RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "6b5546be9363e493b9838542f5dc8cae"); } @Test(enabled = true) public void testIndelCompression() { - RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "1255245ed4ebeacda90f0dbb4e4da081"); + RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f6c9ea83608f35f113cf1f62a77ee6d0"); } @Test(enabled = true) @@ -67,7 +67,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testAddingReadAfterTailingTheStash() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("4b590269cbe3574dbdd5bdc2bc6f5f1c"))); + executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("647b0f0f95730de8e6bc4f74186ad4df"))); } /** @@ -77,7 +77,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; - executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("d8d066304f7c187f182bfb50f39baa0c"))); + executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("2c87985972dd43ee9dd50b463d93a511"))); } @Test(enabled = true) From 0cce1ae8b2fc568182befeefe3d00cdc92c434c2 Mon Sep 17 00:00:00 2001 From: kshakir Date: Tue, 23 Oct 2012 12:38:39 -0400 Subject: [PATCH 14/59] When gathering VCFs, using CombineVariants from the current classpath, and not the GATK used to run the command. This was a concern for external modules that bundled the engine but not CombineVariants. --- .../sting/queue/extensions/gatk/VcfGatherFunction.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index fb22554f0..3fb5101d0 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -39,7 +39,6 @@ class VcfGatherFunction extends CombineVariants with GatherFunction with RetryMe private lazy val originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] override def freezeFieldValues() { - this.jarFile = this.originalGATK.jarFile this.variant = this.gatherParts.zipWithIndex map { case (input, index) => new TaggedFile(input, "input"+index) } this.out = this.originalOutput GATKIntervals.copyIntervalArguments(this.originalGATK, this) From 5fac5bf12eb8c40af4809f0d4a31812a29d3b4a0 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 23 Oct 2012 14:08:31 -0400 Subject: [PATCH 16/59] Fixed issues with Queue packaging of Picard QC classes: separate jar's are needed fromPicard. User needs to specify the -picardBase argument to point to input path for jars. > Also, reenable joint cleaning as now it works. > DEV-125 #resolve > DEV-90 #resolve --- .../sting/queue/extensions/picard/CalculateHsMetrics.scala | 1 - .../sting/queue/extensions/picard/CollectGcBiasMetrics.scala | 1 - .../sting/queue/extensions/picard/CollectMultipleMetrics.scala | 1 - 3 files changed, 3 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala index aa36e29b6..3db498210 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala @@ -13,7 +13,6 @@ import net.sf.picard.analysis.MetricAccumulationLevel */ class CalculateHsMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction { analysisName = "CalculateHsMetrics" - javaMainClass = "net.sf.picard.analysis.directed.CalculateHsMetrics" @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) var input: Seq[File] = Nil diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala index cba6ce2bb..fa655206d 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala @@ -12,7 +12,6 @@ import java.io.File */ class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction { analysisName = "CollectGcBiasMetrics" - javaMainClass = "net.sf.picard.analysis.CollectGcBiasMetrics" @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) var input: Seq[File] = Nil diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala index b9af5258c..3695114c4 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala @@ -12,7 +12,6 @@ import java.io.File */ class CollectMultipleMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction{ analysisName = "CollectMultipleMetrics" - javaMainClass = "net.sf.picard.analysis.CollectMultipleMetrics" @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) var input: Seq[File] = Nil From 8dfa24df7b9d163b2a5f46c3bd2d89173efa7b62 Mon Sep 17 00:00:00 2001 From: kshakir Date: Tue, 23 Oct 2012 12:34:26 -0400 Subject: [PATCH 17/59] Sending a version of per job status messages. In addition to outputs, inputs are passed to QStatusMessenger.done() CloneFunction.cloneIndex has a new CloneFunction.cloneCount companion useful for display purposes. --- .../sting/queue/QCommandLine.scala | 11 ++++-- .../broadinstitute/sting/queue/QScript.scala | 13 +++++-- .../sting/queue/engine/QGraph.scala | 34 +++++++++++++++---- .../sting/queue/engine/QStatusMessenger.scala | 6 +++- .../scattergather/CloneFunction.scala | 1 + .../ScatterGatherableFunction.scala | 1 + 6 files changed, 54 insertions(+), 12 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 2afa66d9c..65abaf7be 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -123,6 +123,8 @@ class QCommandLine extends CommandLineProgram with Logging { commandPlugin.statusMessenger.started() } + qGraph.messengers = allCommandPlugins.filter(_.statusMessenger != null).map(_.statusMessenger).toSeq + // TODO: Default command plugin argument? val remoteFileConverter = ( for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null)) @@ -178,14 +180,17 @@ class QCommandLine extends CommandLineProgram with Logging { qGraph.logFailed() for (commandPlugin <- allCommandPlugins) if (commandPlugin.statusMessenger != null) - commandPlugin.statusMessenger.exit("Done with errors") + commandPlugin.statusMessenger.exit("Done with errors: %s".format(qGraph.formattedStatusCounts)) 1 } else { if (settings.run) { allQScripts.foreach(_.pushOutputs()) for (commandPlugin <- allCommandPlugins) - if (commandPlugin.statusMessenger != null) - commandPlugin.statusMessenger.done(allQScripts.map(_.remoteOutputs)) + if (commandPlugin.statusMessenger != null) { + val allInputs = allQScripts.map(_.remoteInputs) + val allOutputs = allQScripts.map(_.remoteOutputs) + commandPlugin.statusMessenger.done(allInputs, allOutputs) + } } 0 } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index 3df61b1e3..8c834696c 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -149,8 +149,17 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon * List out the remote outputs * @return the RemoteFile outputs by argument source */ - def remoteOutputs: Map[ArgumentSource, Seq[RemoteFile]] = - outputFields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap + def remoteInputs: Map[ArgumentSource, Seq[RemoteFile]] = remoteFieldMap(inputFields) + + /** + * List out the remote outputs + * @return the RemoteFile outputs by argument source + */ + def remoteOutputs: Map[ArgumentSource, Seq[RemoteFile]] = remoteFieldMap(outputFields) + + private def remoteFieldMap(fields: Seq[ArgumentSource]): Map[ArgumentSource, Seq[RemoteFile]] = { + fields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap + } private def filterRemoteFiles(fields: Seq[File]): Seq[RemoteFile] = fields.filter(field => field != null && field.isInstanceOf[RemoteFile]).map(_.asInstanceOf[RemoteFile]) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala index 2c33596e1..4f7dd665d 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala @@ -47,6 +47,7 @@ import java.io.{OutputStreamWriter, File} */ class QGraph extends Logging { var settings: QGraphSettings = _ + var messengers: Seq[QStatusMessenger] = Nil private def dryRun = !settings.run private var numMissingValues = 0 @@ -95,7 +96,7 @@ class QGraph extends Logging { * The settings aren't necessarily available until after this QGraph object has been constructed, so * this function must be called once the QGraphSettings have been filled in. * - * @param settings + * @param settings QGraphSettings */ def initializeWithSettings(settings: QGraphSettings) { this.settings = settings @@ -430,6 +431,7 @@ class QGraph extends Logging { val edge = readyJobs.head edge.runner = newRunner(edge.function) edge.start() + messengers.foreach(_.started(jobShortName(edge.function))) startedJobs += edge readyJobs -= edge logNextStatusCounts = true @@ -465,8 +467,14 @@ class QGraph extends Logging { updateStatus() runningJobs.foreach(edge => edge.status match { - case RunnerStatus.DONE => doneJobs += edge - case RunnerStatus.FAILED => failedJobs += edge + case RunnerStatus.DONE => { + doneJobs += edge + messengers.foreach(_.done(jobShortName(edge.function))) + } + case RunnerStatus.FAILED => { + failedJobs += edge + messengers.foreach(_.exit(jobShortName(edge.function), edge.function.jobErrorLines.mkString("%n".format()))) + } case RunnerStatus.RUNNING => /* do nothing while still running */ }) @@ -493,7 +501,7 @@ class QGraph extends Logging { // incremental if ( logNextStatusCounts && INCREMENTAL_JOBS_REPORT ) { logger.info("Writing incremental jobs reports...") - writeJobsReport(false) + writeJobsReport(plot = false) } readyJobs ++= getReadyJobs @@ -516,9 +524,13 @@ class QGraph extends Logging { private def nextRunningCheck(lastRunningCheck: Long) = ((30 * 1000L) - (System.currentTimeMillis - lastRunningCheck)) + def formattedStatusCounts: String = { + "%d Pend, %d Run, %d Fail, %d Done".format( + statusCounts.pending, statusCounts.running, statusCounts.failed, statusCounts.done) + } + private def logStatusCounts() { - logger.info("%d Pend, %d Run, %d Fail, %d Done".format( - statusCounts.pending, statusCounts.running, statusCounts.failed, statusCounts.done)) + logger.info(formattedStatusCounts) } /** @@ -533,6 +545,16 @@ class QGraph extends Logging { traverseFunctions(edge => recheckDone(edge)) } + // TODO: Yet another field to add (with overloads) to QFunction? + private def jobShortName(function: QFunction): String = { + var name = function.analysisName + if (function.isInstanceOf[CloneFunction]) { + val cloneFunction = function.asInstanceOf[CloneFunction] + name += " %d of %d".format(cloneFunction.cloneIndex, cloneFunction.cloneCount) + } + name + } + /** * First pass that checks if an edge is done or if it's an intermediate edge if it can be skipped. * This function may modify the status of previous edges if it discovers that the edge passed in diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala index eeabe6d1d..c4151dafc 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala @@ -8,6 +8,10 @@ import org.broadinstitute.sting.queue.util.RemoteFile */ trait QStatusMessenger { def started() - def done(files: Seq[Map[ArgumentSource, Seq[RemoteFile]]]) + def done(inputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]], outputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]]) def exit(message: String) + + def started(job: String) + def done(job: String) + def exit(job: String, message: String) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala index 91cacbb71..861db3f80 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala @@ -38,6 +38,7 @@ object CloneFunction { class CloneFunction extends CommandLineFunction { var originalFunction: ScatterGatherableFunction = _ var cloneIndex: Int = _ + var cloneCount: Int = _ private var overriddenFields = Map.empty[ArgumentSource, Any] private var withScatterPartCount = 0 diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala index 5dd7d4c79..b00437f9f 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala @@ -176,6 +176,7 @@ trait ScatterGatherableFunction extends CommandLineFunction { cloneFunction.originalFunction = this cloneFunction.analysisName = this.analysisName cloneFunction.cloneIndex = i + cloneFunction.cloneCount = numClones cloneFunction.commandDirectory = this.scatterGatherTempDir(dirFormat.format(i)) cloneFunction.jobOutputFile = if (IOUtils.isSpecialFile(this.jobOutputFile)) this.jobOutputFile else new File(this.jobOutputFile.getName) if (this.jobErrorFile != null) From 596c1723aeec43ebf2feb7ede387b1954910736c Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 25 Oct 2012 10:35:43 -0400 Subject: [PATCH 23/59] Hidden, unsupported ability of VariantEval to run AlleleCount stratification on sites-only VCFs. I'll expose it/add tests on it if people think this is generaly useful. User needs to specify total # of samples as command line argument since genotypes are not available. Also, fixes to large-scale validation script: lower -minIndelFrac threshold or else we'll kill most indels since default 0.25 is too high for pools, fix also VE stratifications and add one VE run where eval=1KG, comp=pool data and AC stratification based on 1KG annotation --- .../sting/gatk/walkers/varianteval/VariantEval.java | 12 ++++++++++++ .../varianteval/stratifications/AlleleCount.java | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java index a73e125ad..201028d99 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java @@ -183,6 +183,10 @@ public class VariantEval extends RodWalker implements TreeRedu @Argument(fullName="keepAC0", shortName="keepAC0", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false) private boolean keepSitesWithAC0 = false; + @Hidden + @Argument(fullName="numSamples", shortName="numSamples", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false) + private int numSamplesFromArgument = 0; + /** * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc. @@ -589,6 +593,14 @@ public class VariantEval extends RodWalker implements TreeRedu public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; } public Set getSampleNamesForEvaluation() { return sampleNamesForEvaluation; } + public int getNumberOfSamplesForEvaluation() { + if (sampleNamesForEvaluation!= null && !sampleNamesForEvaluation.isEmpty()) + return sampleNamesForEvaluation.size(); + else { + return numSamplesFromArgument; + } + + } public Set getSampleNamesForStratification() { return sampleNamesForStratification; } public List> getComps() { return comps; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index e6efd4482..7197fc14c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -29,7 +29,7 @@ public class AlleleCount extends VariantStratifier { // There are ploidy x n sample chromosomes // TODO -- generalize to handle multiple ploidy - nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * getVariantEvalWalker().getSamplePloidy(); + nchrom = getVariantEvalWalker().getNumberOfSamplesForEvaluation() * getVariantEvalWalker().getSamplePloidy(); if ( nchrom < 2 ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification requires an eval vcf with at least one sample"); From c8e17a7adf410aa540f6e51091160f56bd4231e2 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 30 Oct 2012 13:57:54 -0400 Subject: [PATCH 30/59] totally experimental UG feature, to be removed --- .../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 3c4a97ec1..d9b46ad36 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -190,6 +190,9 @@ public class UnifiedGenotyperEngine { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); + else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) + results.add(generateEmptyContext(tracker, refContext, null, rawContext)); + } } } From f5697532d63841acfe24f3045e6dbc4612c95f0e Mon Sep 17 00:00:00 2001 From: kshakir Date: Wed, 31 Oct 2012 11:49:50 -0400 Subject: [PATCH 32/59] Added mvninstall.queue.all target which includes private, along with supporting sub-targets. --- build.xml | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/build.xml b/build.xml index c6b1afc56..18ab3e684 100644 --- a/build.xml +++ b/build.xml @@ -327,14 +327,18 @@ - + - + + + + + @@ -842,19 +846,23 @@ - + - + - + - + + + + + @@ -906,13 +914,15 @@ - + - + - + - + + + @@ -975,6 +985,8 @@ + + From 525cf331f4e6568479582c44f7c7b8912cdf7773 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 13 Nov 2012 11:52:47 -0500 Subject: [PATCH 37/59] Don't catch a User Error and re-throw as a Reviewed Exception. That makes Eric unhappy. --- .../sting/gatk/io/storage/VariantContextWriterStorage.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java index 31f6d5954..8e4633869 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java @@ -198,8 +198,6 @@ public class VariantContextWriterStorage implements Storage Date: Tue, 13 Nov 2012 11:53:12 -0500 Subject: [PATCH 38/59] Having a malformed GATK report is a User Error --- .../broadinstitute/sting/gatk/report/GATKReportVersion.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java index b5a5e0443..b51fb17f0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.report; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; public enum GATKReportVersion { /** @@ -91,6 +92,6 @@ public enum GATKReportVersion { if (header.startsWith("#:GATKReport.v1.1")) return GATKReportVersion.V1_1; - throw new ReviewedStingException("Unknown GATK report version in header: " + header); + throw new UserException.BadInput("The GATK report has an unknown/unsupported version in the header: " + header); } } From ba41f65759724b650d8996185bc4761f50bda466 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 13 Nov 2012 11:53:39 -0500 Subject: [PATCH 39/59] Protect against NPEs in SelectVariants by checking for missing Genotypes --- .../sting/gatk/walkers/variantutils/SelectVariants.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index d1b7cb96f..d28fe34d6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -659,7 +659,10 @@ public class SelectVariants extends RodWalker implements TreeR return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED))); } - private boolean haveSameGenotypes(Genotype g1, Genotype g2) { + private boolean haveSameGenotypes(final Genotype g1, final Genotype g2) { + if ( g1 == null || g2 == null ) + return false; + if ((g1.isCalled() && g2.isFiltered()) || (g2.isCalled() && g1.isFiltered()) || (g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED)) From 6d59dd34558fb6cc564eabab9b6c5a278a341a26 Mon Sep 17 00:00:00 2001 From: kshakir Date: Sun, 4 Nov 2012 23:42:02 -0500 Subject: [PATCH 40/59] Scala classes were only returning direct subclasses (confirmed when inspected in debugger) so changed PluginManager to allow specifying the explicit subclass. Removed some generics from PluginManager for now until able to figure out syntax for requesting explicit subclass. QStatusMessenger uses a slightly more primitive Map[String, Seq[RemoteFile]] instead of Map[ArgumentSource, Seq[RemoteFile]]. Added an QCommandPlugin.initScript utility method for handling specialized script types. --- .../org/broadinstitute/sting/gatk/WalkerManager.java | 4 ++-- .../sting/utils/classloader/PluginManager.java | 11 ++++++----- .../org/broadinstitute/sting/queue/QCommandLine.scala | 11 ++++++++--- .../broadinstitute/sting/queue/QCommandPlugin.scala | 2 ++ .../src/org/broadinstitute/sting/queue/QScript.scala | 8 ++++++-- .../sting/queue/engine/QStatusMessenger.scala | 3 +-- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index fbacbddc4..28b5f918d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -350,11 +350,11 @@ public class WalkerManager extends PluginManager { * @return A name for this type of walker. */ @Override - public String getName(Class walkerType) { + public String getName(Class walkerType) { String walkerName = ""; if (walkerType.getAnnotation(WalkerName.class) != null) - walkerName = walkerType.getAnnotation(WalkerName.class).value().trim(); + walkerName = ((WalkerName)walkerType.getAnnotation(WalkerName.class)).value().trim(); else walkerName = super.getName(walkerType); diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index 43cc800d8..b39aae8ab 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -101,7 +101,7 @@ public class PluginManager { * Create a new plugin manager. * @param pluginType Core type for a plugin. */ - public PluginManager(Class pluginType) { + public PluginManager(Class pluginType) { this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), null); } @@ -110,7 +110,7 @@ public class PluginManager { * @param pluginType Core type for a plugin. * @param classpath Custom class path to search for classes. */ - public PluginManager(Class pluginType, List classpath) { + public PluginManager(Class pluginType, List classpath) { this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), classpath); } @@ -120,7 +120,7 @@ public class PluginManager { * @param pluginCategory Provides a category name to the plugin. Must not be null. * @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null. */ - public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix) { + public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix) { this(pluginType, pluginCategory, pluginSuffix, null); } @@ -131,7 +131,7 @@ public class PluginManager { * @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null. * @param classpath Custom class path to search for classes. */ - public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix, List classpath) { + public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix, List classpath) { this.pluginCategory = pluginCategory; this.pluginSuffix = pluginSuffix; @@ -149,6 +149,7 @@ public class PluginManager { } // Load all classes types filtering them by concrete. + @SuppressWarnings("unchecked") Set> allTypes = reflections.getSubTypesOf(pluginType); for( Class type: allTypes ) { // The plugin manager does not support anonymous classes; to be a plugin, a class must have a name. @@ -325,7 +326,7 @@ public class PluginManager { * @param pluginType The type of plugin. * @return A name for this type of plugin. */ - public String getName(Class pluginType) { + public String getName(Class pluginType) { String pluginName = ""; if (pluginName.length() == 0) { diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 65abaf7be..637174557 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -92,13 +92,19 @@ class QCommandLine extends CommandLineProgram with Logging { private lazy val qScriptPluginManager = { qScriptClasses = IOUtils.tempDir("Q-Classes-", "", settings.qSettings.tempDirectory) qScriptManager.loadScripts(scripts, qScriptClasses) - new PluginManager[QScript](classOf[QScript], Seq(qScriptClasses.toURI.toURL)) + new PluginManager[QScript](qPluginType, Seq(qScriptClasses.toURI.toURL)) } private lazy val qCommandPlugin = { new PluginManager[QCommandPlugin](classOf[QCommandPlugin]) } + private lazy val allCommandPlugins = qCommandPlugin.createAllTypes() + + private lazy val qPluginType: Class[_ <: QScript] = { + allCommandPlugins.map(_.qScriptClass).headOption.getOrElse(classOf[QScript]) + } + /** * Takes the QScripts passed in, runs their script() methods, retrieves their generated * functions, and then builds and runs a QGraph based on the dependencies. @@ -106,8 +112,6 @@ class QCommandLine extends CommandLineProgram with Logging { def execute = { ClassFieldCache.parsingEngine = this.parser - val allCommandPlugins = qCommandPlugin.createAllTypes() - if (settings.qSettings.runName == null) settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) if (IOUtils.isDefaultTempDir(settings.qSettings.tempDirectory)) @@ -138,6 +142,7 @@ class QCommandLine extends CommandLineProgram with Logging { for (script <- allQScripts) { logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) loadArgumentsIntoObject(script) + allCommandPlugins.foreach(_.initScript(script)) // TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now. //if (settings.run) script.pullInputs() diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala index 499c31554..eae6a6a92 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala @@ -6,4 +6,6 @@ import util.RemoteFileConverter trait QCommandPlugin { def statusMessenger: QStatusMessenger = null def remoteFileConverter: RemoteFileConverter = null + def qScriptClass: Class[_ <: QScript] = classOf[QScript] + def initScript(script: QScript) {} } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index 8c834696c..eb8be183a 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -149,13 +149,17 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon * List out the remote outputs * @return the RemoteFile outputs by argument source */ - def remoteInputs: Map[ArgumentSource, Seq[RemoteFile]] = remoteFieldMap(inputFields) + def remoteInputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(inputFields)) /** * List out the remote outputs * @return the RemoteFile outputs by argument source */ - def remoteOutputs: Map[ArgumentSource, Seq[RemoteFile]] = remoteFieldMap(outputFields) + def remoteOutputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(outputFields)) + + private def tagMap(remoteFieldMap: Map[ArgumentSource, Seq[RemoteFile]]): Map[String, Seq[RemoteFile]] = { + remoteFieldMap.collect{ case (k, v) => ClassFieldCache.fullName(k) -> v }.toMap + } private def remoteFieldMap(fields: Seq[ArgumentSource]): Map[ArgumentSource, Seq[RemoteFile]] = { fields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala index c4151dafc..a1133b944 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala @@ -1,6 +1,5 @@ package org.broadinstitute.sting.queue.engine -import org.broadinstitute.sting.commandline.ArgumentSource import org.broadinstitute.sting.queue.util.RemoteFile /** @@ -8,7 +7,7 @@ import org.broadinstitute.sting.queue.util.RemoteFile */ trait QStatusMessenger { def started() - def done(inputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]], outputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]]) + def done(inputs: Seq[Map[String, Seq[RemoteFile]]], outputs: Seq[Map[String, Seq[RemoteFile]]]) def exit(message: String) def started(job: String) From a17cd54b6883427454c6ae0bb379fee9ef7f460b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 24 Oct 2012 16:57:08 -0400 Subject: [PATCH 44/59] Co-Reduction implementation in ReduceReads ReduceReads now co-reduces bams if they're passed in toghether with multiple -I. Co-reduction forces every variant region in one sample to be a variant region in all samples. Also: * Added integrationtest for co-reduction * Fixed bug with new no-recalculation implementation of the marksites object where the last object wasn't being removed after finalizing a variant region (updated MD5's accordingly) DEV-200 #resolve #time 8m --- .../reducereads/CompressionStash.java | 38 ++++++++ .../reducereads/MultiSampleCompressor.java | 49 ++++++---- .../compression/reducereads/ReduceReads.java | 2 +- .../reducereads/SingleSampleCompressor.java | 38 ++++---- .../reducereads/SlidingWindow.java | 89 ++++++++++--------- .../ReduceReadsIntegrationTest.java | 10 +-- .../reducereads/SimpleGenomeLoc.java | 43 +++++++++ 7 files changed, 185 insertions(+), 84 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java index 714a4df18..a6e5b6c5b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import org.broadinstitute.sting.utils.GenomeLocComparator; +import java.util.Collection; import java.util.TreeSet; /** @@ -18,4 +19,41 @@ public class CompressionStash extends TreeSet { public CompressionStash() { super(new GenomeLocComparator()); } + + /** + * Adds a SimpleGenomeLoc to the stash and merges it with any overlapping (and contiguous) existing loc + * in the stash. + * + * @param insertLoc the new loc to be inserted + * @return true if the loc, or it's merged version, wasn't present in the list before. + */ + @Override + public boolean add(SimpleGenomeLoc insertLoc) { + TreeSet removedLocs = new TreeSet(); + for (SimpleGenomeLoc existingLoc : this) { + if (existingLoc.isPast(insertLoc)) { + break; // if we're past the loc we're done looking for overlaps. + } + if (existingLoc.equals(insertLoc)) { + return false; // if this loc was already present in the stash, we don't need to insert it. + } + if (existingLoc.contiguousP(insertLoc)) { + removedLocs.add(existingLoc); // list the original loc for merging + } + } + for (SimpleGenomeLoc loc : removedLocs) { + this.remove(loc); // remove all locs that will be merged + } + removedLocs.add(insertLoc); // add the new loc to the list of locs that will be merged + return super.add(SimpleGenomeLoc.merge(removedLocs)); // merge them all into one loc and add to the stash + } + + @Override + public boolean addAll(Collection locs) { + boolean result = false; + for (SimpleGenomeLoc loc : locs) { + result |= this.add(loc); + } + return result; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 2c3439010..f348225ca 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -3,13 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.HashMap; import java.util.Map; -import java.util.SortedSet; +import java.util.Set; import java.util.TreeSet; /* @@ -41,7 +42,7 @@ import java.util.TreeSet; * * @author depristo */ -public class MultiSampleCompressor implements Compressor { +public class MultiSampleCompressor { protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class); protected Map compressorsPerSample = new HashMap(); @@ -55,30 +56,44 @@ public class MultiSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction, - final CompressionStash compressionStash) { + final boolean allowPolyploidReduction) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction, compressionStash)); + minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction)); } } - @Override - public Iterable addAlignment(GATKSAMRecord read) { - String sample = read.getReadGroup().getSample(); - SingleSampleCompressor compressor = compressorsPerSample.get(sample); + public Set addAlignment(GATKSAMRecord read) { + String sampleName = read.getReadGroup().getSample(); + SingleSampleCompressor compressor = compressorsPerSample.get(sampleName); if ( compressor == null ) - throw new ReviewedStingException("No compressor for sample " + sample); - return compressor.addAlignment(read); + throw new ReviewedStingException("No compressor for sample " + sampleName); + Pair, CompressionStash> readsAndStash = compressor.addAlignment(read); + Set reads = readsAndStash.getFirst(); + CompressionStash regions = readsAndStash.getSecond(); + + reads.addAll(closeVariantRegionsInAllSamples(regions)); + + return reads; } - @Override - public Iterable close() { - SortedSet reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); - for ( SingleSampleCompressor comp : compressorsPerSample.values() ) - for ( GATKSAMRecord read : comp.close() ) - reads.add(read); + public Set close() { + Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + for ( SingleSampleCompressor sample : compressorsPerSample.values() ) { + Pair, CompressionStash> readsAndStash = sample.close(); + reads = readsAndStash.getFirst(); + } + return reads; + } + + private Set closeVariantRegionsInAllSamples(CompressionStash regions) { + Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + if (!regions.isEmpty()) { + for (SingleSampleCompressor sample : compressorsPerSample.values()) { + reads.addAll(sample.closeVariantRegions(regions)); + } + } return reads; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index b6761f4a6..a05992cb4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -330,7 +330,7 @@ public class ReduceReads extends ReadWalker, ReduceRea */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION, compressionStash)); + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION)); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index 82a433300..ac3388795 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -1,8 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.Set; import java.util.TreeSet; /** @@ -10,7 +12,7 @@ import java.util.TreeSet; * @author carneiro, depristo * @version 3.0 */ -public class SingleSampleCompressor implements Compressor { +public class SingleSampleCompressor { final private int contextSize; final private int downsampleCoverage; final private int minMappingQuality; @@ -20,11 +22,11 @@ public class SingleSampleCompressor implements Compressor { final private ReduceReads.DownsampleStrategy downsampleStrategy; final private int nContigs; final private boolean allowPolyploidReduction; - final CompressionStash compressionStash; private SlidingWindow slidingWindow; private int slidingWindowCounter; + public static Pair, CompressionStash> emptyPair = new Pair,CompressionStash>(new TreeSet(), new CompressionStash()); public SingleSampleCompressor(final int contextSize, final int downsampleCoverage, @@ -34,8 +36,7 @@ public class SingleSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction, - final CompressionStash compressionStash) { + final boolean allowPolyploidReduction) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; this.minMappingQuality = minMappingQuality; @@ -46,15 +47,11 @@ public class SingleSampleCompressor implements Compressor { this.downsampleStrategy = downsampleStrategy; this.nContigs = nContigs; this.allowPolyploidReduction = allowPolyploidReduction; - this.compressionStash = compressionStash; } - /** - * @{inheritDoc} - */ - @Override - public Iterable addAlignment( GATKSAMRecord read ) { - TreeSet result = new TreeSet(new AlignmentStartWithNoTiesComparator()); + public Pair, CompressionStash> addAlignment( GATKSAMRecord read ) { + Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + CompressionStash stash = new CompressionStash(); int readOriginalStart = read.getUnclippedStart(); // create a new window if: @@ -63,22 +60,27 @@ public class SingleSampleCompressor implements Compressor { (readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window // close the current sliding window - result.addAll(slidingWindow.close()); + Pair, CompressionStash> readsAndStash = slidingWindow.close(); + reads = readsAndStash.getFirst(); + stash = readsAndStash.getSecond(); slidingWindow = null; // so we create a new one on the next if } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction, compressionStash); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction); slidingWindowCounter++; } - result.addAll(slidingWindow.addRead(read)); - return result; + stash.addAll(slidingWindow.addRead(read)); + return new Pair, CompressionStash>(reads, stash); } - @Override - public Iterable close() { - return (slidingWindow != null) ? slidingWindow.close() : new TreeSet(); + public Pair, CompressionStash> close() { + return (slidingWindow != null) ? slidingWindow.close() : emptyPair; + } + + public Set closeVariantRegions(CompressionStash regions) { + return slidingWindow.closeVariantRegions(regions); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 24cacd997..24a3ba3cb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -6,8 +6,10 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -55,7 +57,8 @@ public class SlidingWindow { private final int nContigs; private boolean allowPolyploidReductionInGeneral; - private CompressionStash compressionStash; + + private static CompressionStash emptyRegions = new CompressionStash(); /** * The types of synthetic reads to use in the finalizeAndAdd method @@ -87,7 +90,7 @@ public class SlidingWindow { } - public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction, CompressionStash compressionStash) { + public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; @@ -124,7 +127,6 @@ public class SlidingWindow { this.nContigs = nContigs; this.allowPolyploidReductionInGeneral = allowPolyploidReduction; - this.compressionStash = compressionStash; } /** @@ -138,7 +140,7 @@ public class SlidingWindow { * @param read the read * @return a list of reads that have been finished by sliding the window. */ - public List addRead(GATKSAMRecord read) { + public CompressionStash addRead(GATKSAMRecord read) { addToHeader(windowHeader, read); // update the window header counts readsInWindow.add(read); // add read to sliding reads return slideWindow(read.getUnclippedStart()); @@ -152,8 +154,9 @@ public class SlidingWindow { * @param variantSite boolean array with true marking variant regions * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. */ - private SimpleGenomeLoc getNextVariantRegion(int from, int to, boolean[] variantSite) { + private SimpleGenomeLoc findNextVariantRegion(int from, int to, boolean[] variantSite, boolean forceClose) { boolean foundStart = false; + final int windowHeaderStart = getStartLocation(windowHeader); int variantRegionStartIndex = 0; for (int i=from; i slideWindow(final int incomingReadUnclippedStart) { - List finalizedReads = new LinkedList(); - + protected CompressionStash slideWindow(final int incomingReadUnclippedStart) { final int windowHeaderStartLocation = getStartLocation(windowHeader); + CompressionStash regions = emptyRegions; + boolean forceClose = true; if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) { markSites(incomingReadUnclippedStart); int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation; int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) - CompressionStash regions = getVariantRegionsFromThisSample(0, breakpoint, markedSites.getVariantSiteBitSet()); - finalizedReads = closeVariantRegions(regions, false); - - while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { - readsInWindow.pollFirst(); - } + regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose); } - return finalizedReads; + // todo -- can be more aggressive here removing until the NEW window header start location after closing the variant regions + while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { + readsInWindow.pollFirst(); + } + + return regions; } @@ -623,31 +629,27 @@ public class SlidingWindow { result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); - return result; // finalized reads will be downsampled if necessary + return result; // finalized reads will be downsampled if necessary } - - private List closeVariantRegions(CompressionStash regions, boolean forceClose) { - List allReads = new LinkedList(); + public Set closeVariantRegions(CompressionStash regions) { + TreeSet allReads = new TreeSet(new AlignmentStartWithNoTiesComparator()); if (!regions.isEmpty()) { int lastStop = -1; + int windowHeaderStart = getStartLocation(windowHeader); + for (SimpleGenomeLoc region : regions) { - int start = region.getStart(); - int stop = region.getStop(); + if (region.isFinished() && region.getContig() == contig && region.getStart() >= windowHeaderStart && region.getStop() <= windowHeaderStart + windowHeader.size()) { + int start = region.getStart() - windowHeaderStart; + int stop = region.getStop() - windowHeaderStart; - if (!region.isFinished()) { - if(forceClose) // region is unfinished but we're forcing the close of this window - stop = windowHeader.size() - 1; - else - continue; // region is unfinished and we're not forcing the close of this window + allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track + lastStop = stop; } - - allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); - lastStop = stop; } - for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) - windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here! + for (int i = 0; i <= lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) + windowHeader.remove(); } return allReads; } @@ -681,23 +683,24 @@ public class SlidingWindow { * * @return All reads generated */ - public List close() { + public Pair, CompressionStash> close() { // mark variant regions - List finalizedReads = new LinkedList(); + Set finalizedReads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + CompressionStash regions = new CompressionStash(); + boolean forceCloseUnfinishedRegions = true; if (!windowHeader.isEmpty()) { markSites(getStopLocation(windowHeader) + 1); - CompressionStash regions = getVariantRegionsFromThisSample(0, windowHeader.size(), markedSites.getVariantSiteBitSet()); - finalizedReads = closeVariantRegions(regions, true); + regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions); + finalizedReads = closeVariantRegions(regions); if (!windowHeader.isEmpty()) { finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false)); finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up } - } - return finalizedReads; + return new Pair, CompressionStash>(finalizedReads, regions); } /** diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 50500536f..1e539dc9d 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -26,23 +26,23 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "46ea88e32bae3072f5cd68a0db4b55f1"); + RRTest("testDefaultCompression ", L, "98080d3c53f441564796fc143cf510da"); } @Test(enabled = true) public void testMultipleIntervals() { String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - RRTest("testMultipleIntervals ", intervals, "c3784a0b42f5456b705f9b152a4b697a"); + RRTest("testMultipleIntervals ", intervals, "c5dcdf4edf368b5b897d66f76034d9f0"); } @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e385eb0ae5768f8507671d5303a212d5"); + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "27cb99e87eda5e46187e56f50dd37f26"); } @Test(enabled = true) public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "6b5546be9363e493b9838542f5dc8cae"); + RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "4e7f111688d49973c35669855b7a2eaf"); } @Test(enabled = true) @@ -83,7 +83,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testCoReduction() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList(""))); + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("5c30fde961a1357bf72c15144c01981b"))); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java index 45e105751..51d8aad63 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java @@ -1,6 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.SortedSet; /** * GenomeLocs are very useful objects to keep track of genomic locations and perform set operations @@ -27,4 +31,43 @@ public class SimpleGenomeLoc extends GenomeLoc { public boolean isFinished() { return finished; } + + @Requires("a != null && b != null") + public static SimpleGenomeLoc merge(SimpleGenomeLoc a, SimpleGenomeLoc b) throws ReviewedStingException { + if(GenomeLoc.isUnmapped(a) || GenomeLoc.isUnmapped(b)) { + throw new ReviewedStingException("Tried to merge unmapped genome locs"); + } + + if (!(a.contiguousP(b))) { + throw new ReviewedStingException("The two genome locs need to be contiguous"); + } + + + return new SimpleGenomeLoc(a.getContig(), a.contigIndex, + Math.min(a.getStart(), b.getStart()), + Math.max(a.getStop(), b.getStop()), + a.isFinished()); + } + + /** + * Merges a list of *sorted* *contiguous* locs into one + * + * @param sortedLocs a sorted list of contiguous locs + * @return one merged loc + */ + public static SimpleGenomeLoc merge(SortedSet sortedLocs) { + SimpleGenomeLoc previousLoc = null; + for (SimpleGenomeLoc loc : sortedLocs) { + if (loc.isUnmapped()) { + throw new ReviewedStingException("Tried to merge unmapped genome locs"); + } + if (previousLoc != null && !previousLoc.contiguousP(loc)) { + throw new ReviewedStingException("The genome locs need to be contiguous"); + } + previousLoc = loc; + } + SimpleGenomeLoc firstLoc = sortedLocs.first(); + SimpleGenomeLoc lastLoc = sortedLocs.last(); + return merge(firstLoc, lastLoc); + } } From dba31018f46052984ab3b779d18950e5d1aee501 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 13 Nov 2012 01:18:37 -0500 Subject: [PATCH 48/59] Implementation of BySampleSAMFileWriter ReduceReads now works with the n-way-out capability, splitting by sample. DEV-27 #resolve #time 3m --- .../compression/reducereads/ReduceReads.java | 37 +- .../utils/sam/BySampleSAMFileWriter.java | 70 ++++ .../sting/utils/sam/NWaySAMFileWriter.java | 374 +++++++++--------- 3 files changed, 290 insertions(+), 191 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index a05992cb4..3712e4524 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -25,6 +25,9 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMProgramRecord; import net.sf.samtools.util.SequenceUtil; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; @@ -45,6 +48,7 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -86,7 +90,8 @@ import java.util.*; public class ReduceReads extends ReadWalker, ReduceReadsStash> { @Output - private StingSAMFileWriter out; + private StingSAMFileWriter out = null; + private SAMFileWriter writerToUse = null; /** * The number of bases to keep around mismatches (potential variation) @@ -196,6 +201,10 @@ public class ReduceReads extends ReadWalker, ReduceRea @Argument(fullName = "contigs", shortName = "ctg", doc = "", required = false) private int nContigs = 2; + @Hidden + @Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false) + private boolean nwayout = false; + @Hidden @Argument(fullName = "", shortName = "dl", doc = "", required = false) private int debugLevel = 0; @@ -227,6 +236,7 @@ public class ReduceReads extends ReadWalker, ReduceRea SortedSet intervalList; private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag + private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam"; /** * Basic generic initialization of the readNameHash and the intervalList. Output initialization @@ -242,10 +252,24 @@ public class ReduceReads extends ReadWalker, ReduceRea if (toolkit.getIntervals() != null) intervalList.addAll(toolkit.getIntervals()); - if (!NO_PG_TAG) - Utils.setupWriter(out, toolkit, false, true, this, PROGRAM_RECORD_NAME); - else + + // todo -- rework the whole NO_PG_TAG thing + final boolean preSorted = true; + final boolean indexOnTheFly = true; + final boolean generateMD5 = true; + final boolean keep_records = true; + final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; + if (nwayout) { + SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME); + writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, preSorted, indexOnTheFly, NO_PG_TAG, programRecord, true); + } + else { + writerToUse = out; out.setPresorted(false); + if (!NO_PG_TAG) { + Utils.setupWriter(out, toolkit, !preSorted, keep_records, this, PROGRAM_RECORD_NAME); + } + } } /** @@ -386,6 +410,9 @@ public class ReduceReads extends ReadWalker, ReduceRea // output any remaining reads in the compressor for (GATKSAMRecord read : stash.close()) outputRead(read); + + if (nwayout) + writerToUse.close(); } /** @@ -554,7 +581,7 @@ public class ReduceReads extends ReadWalker, ReduceRea if (!DONT_COMPRESS_READ_NAMES) compressReadName(read); - out.addAlignment(read); + writerToUse.addAlignment(read); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java new file mode 100644 index 000000000..6bad58d9f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.HashMap; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: Nov 13 + */ +public class BySampleSAMFileWriter extends NWaySAMFileWriter{ + + private final Map sampleToWriterMap; + + public BySampleSAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + super(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, pRecord, keep_records); + + sampleToWriterMap = new HashMap(toolkit.getSAMFileHeader().getReadGroups().size() * 2); + + for (SAMReaderID readerID : toolkit.getReadsDataSource().getReaderIDs()) { + for (SAMReadGroupRecord rg : toolkit.getReadsDataSource().getHeader(readerID).getReadGroups()) { + String sample = rg.getSample(); + if (sampleToWriterMap.containsKey(sample) && sampleToWriterMap.get(sample) != readerID) { + throw new ReviewedStingException("The same sample appears in multiple files, this input cannot be multiplexed using the BySampleSAMFileWriter, try NWaySAMFileWriter instead."); + } + else { + sampleToWriterMap.put(sample, readerID); + } + } + } + } + + @Override + public void addAlignment(SAMRecord samRecord) { + super.addAlignment(samRecord, sampleToWriterMap.get(samRecord.getReadGroup().getSample())); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index fa07523f3..83d1c99bf 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -1,186 +1,188 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.TextFormattingUtils; - -import java.io.File; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: May 31, 2011 - * Time: 3:52:49 PM - * To change this template use File | Settings | File Templates. - */ -public class NWaySAMFileWriter implements SAMFileWriter { - - private Map writerMap = null; - private boolean presorted ; - GenomeAnalysisEngine toolkit; - boolean KEEP_ALL_PG_RECORDS = false; - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { - this.presorted = presorted; - this.toolkit = toolkit; - this.KEEP_ALL_PG_RECORDS = keep_records; - writerMap = new HashMap(); - setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { - this.presorted = presorted; - this.toolkit = toolkit; - this.KEEP_ALL_PG_RECORDS = keep_records; - writerMap = new HashMap(); - setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5) { - this(toolkit, in2out, order, presorted, indexOnTheFly, generateMD5, null,false); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly , boolean generateMD5) { - this(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, null,false); - } - - /** - * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved - * from toolkit). The in2out map must contain an entry for each input filename and map it - * onto a unique output file name. - * @param toolkit - * @param in2out - */ - public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { - if ( in2out==null ) throw new StingException("input-output bam filename map for n-way-out writing is NULL"); - for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { - - String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); - - String outName; - if ( ! in2out.containsKey(fName) ) - throw new UserException.BadInput("Input-output bam filename map does not contain an entry for the input file "+fName); - outName = in2out.get(fName); - - if ( writerMap.containsKey( rid ) ) - throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+ - "map file likely contains multiple entries for this input file"); - - addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - } - - /** - * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved - * from toolkit). The output file names will be generated automatically by stripping ".sam" or ".bam" off the - * input file name and adding ext instead (e.g. ".cleaned.bam"). - * onto a unique output file name. - * @param toolkit - * @param ext - */ - public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { - for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { - - String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); - - String outName; - int pos ; - if ( fName.toUpperCase().endsWith(".BAM") ) pos = fName.toUpperCase().lastIndexOf(".BAM"); - else { - if ( fName.toUpperCase().endsWith(".SAM") ) pos = fName.toUpperCase().lastIndexOf(".SAM"); - else throw new UserException.BadInput("Input file name "+fName+" does not end with .sam or .bam"); - } - String prefix = fName.substring(0,pos); - outName = prefix+ext; - - if ( writerMap.containsKey( rid ) ) - throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); - addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - } - - private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, - boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { - File f = new File(outName); - SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); - header.setSortOrder(order); - - if ( programRecord != null ) { - // --->> add program record - List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) - newRecords.add(record); - } - newRecords.add(programRecord); - header.setProgramRecords(newRecords); - // <-- add program record ends here - } - SAMFileWriterFactory factory = new SAMFileWriterFactory(); - factory.setCreateIndex(indexOnTheFly); - factory.setCreateMd5File(generateMD5); - SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f); - writerMap.put(id,sw); - } - - public Collection getWriters() { - return writerMap.values(); - } - - public void addAlignment(SAMRecord samRecord) { - final SAMReaderID id = toolkit.getReaderIDForRead(samRecord); - String rg = samRecord.getStringAttribute("RG"); - if ( rg != null ) { - String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg); - samRecord.setAttribute("RG",rg_orig); - } - writerMap.get(id).addAlignment(samRecord); - } - - public SAMFileHeader getFileHeader() { - return toolkit.getSAMFileHeader(); - } - - public void close() { - for ( SAMFileWriter w : writerMap.values() ) w.close(); - } -} +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: May 31, 2011 + * Time: 3:52:49 PM + * To change this template use File | Settings | File Templates. + */ +public class NWaySAMFileWriter implements SAMFileWriter { + + private Map writerMap = null; + private boolean presorted ; + GenomeAnalysisEngine toolkit; + boolean KEEP_ALL_PG_RECORDS = false; + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + this.presorted = presorted; + this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; + writerMap = new HashMap(); + setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + this.presorted = presorted; + this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; + writerMap = new HashMap(); + setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5) { + this(toolkit, in2out, order, presorted, indexOnTheFly, generateMD5, null,false); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5) { + this(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, null,false); + } + + /** + * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved + * from toolkit). The in2out map must contain an entry for each input filename and map it + * onto a unique output file name. + * @param toolkit + * @param in2out + */ + public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { + if ( in2out==null ) throw new StingException("input-output bam filename map for n-way-out writing is NULL"); + for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { + + String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); + + String outName; + if ( ! in2out.containsKey(fName) ) + throw new UserException.BadInput("Input-output bam filename map does not contain an entry for the input file "+fName); + outName = in2out.get(fName); + + if ( writerMap.containsKey( rid ) ) + throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+ + "map file likely contains multiple entries for this input file"); + + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + } + + /** + * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved + * from toolkit). The output file names will be generated automatically by stripping ".sam" or ".bam" off the + * input file name and adding ext instead (e.g. ".cleaned.bam"). + * onto a unique output file name. + * @param toolkit + * @param ext + */ + public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { + for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { + + String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); + + String outName; + int pos ; + if ( fName.toUpperCase().endsWith(".BAM") ) pos = fName.toUpperCase().lastIndexOf(".BAM"); + else { + if ( fName.toUpperCase().endsWith(".SAM") ) pos = fName.toUpperCase().lastIndexOf(".SAM"); + else throw new UserException.BadInput("Input file name "+fName+" does not end with .sam or .bam"); + } + String prefix = fName.substring(0,pos); + outName = prefix+ext; + + if ( writerMap.containsKey( rid ) ) + throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + } + + private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, + boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { + File f = new File(outName); + SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); + header.setSortOrder(order); + + if ( programRecord != null ) { + // --->> add program record + List oldRecords = header.getProgramRecords(); + List newRecords = new ArrayList(oldRecords.size()+1); + for ( SAMProgramRecord record : oldRecords ) { + if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) + newRecords.add(record); + } + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + // <-- add program record ends here + } + SAMFileWriterFactory factory = new SAMFileWriterFactory(); + factory.setCreateIndex(indexOnTheFly); + factory.setCreateMd5File(generateMD5); + SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f); + writerMap.put(id,sw); + } + + public Collection getWriters() { + return writerMap.values(); + } + + public void addAlignment(SAMRecord samRecord) { + final SAMReaderID id = toolkit.getReaderIDForRead(samRecord); + String rg = samRecord.getStringAttribute("RG"); + if ( rg != null ) { + String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg); + samRecord.setAttribute("RG",rg_orig); + } + addAlignment(samRecord, id); + } + + public void addAlignment(SAMRecord samRecord, SAMReaderID readerID) { + writerMap.get(readerID).addAlignment(samRecord); + } + + public SAMFileHeader getFileHeader() { + return toolkit.getSAMFileHeader(); + } + + public void close() { + for ( SAMFileWriter w : writerMap.values() ) w.close(); + } +} From a079d8d0d106660ca5bdd4d2c93d01f4fef141a9 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 13 Nov 2012 15:21:57 -0500 Subject: [PATCH 49/59] Breaking the utility to write @PG tags for SAMFileWriters and StingSAMFileWriters --- .../compression/reducereads/ReduceReads.java | 3 +- .../org/broadinstitute/sting/utils/Utils.java | 59 +++++++++++++++++-- .../sting/utils/sam/NWaySAMFileWriter.java | 21 ++----- 3 files changed, 59 insertions(+), 24 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 3712e4524..3cdf3d75e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -256,7 +256,6 @@ public class ReduceReads extends ReadWalker, ReduceRea // todo -- rework the whole NO_PG_TAG thing final boolean preSorted = true; final boolean indexOnTheFly = true; - final boolean generateMD5 = true; final boolean keep_records = true; final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; if (nwayout) { @@ -267,7 +266,7 @@ public class ReduceReads extends ReadWalker, ReduceRea writerToUse = out; out.setPresorted(false); if (!NO_PG_TAG) { - Utils.setupWriter(out, toolkit, !preSorted, keep_records, this, PROGRAM_RECORD_NAME); + Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index f4a200af0..b780d0966 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -687,23 +687,69 @@ public class Utils { array[i] = value; } - public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { - final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); - - SAMFileHeader header = toolkit.getSAMFileHeader(); + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets + * up the writer with the header and presorted status. + * + * @param toolkit the engine + * @param originalHeader original header + * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file + * @param programRecord the program record for this program + */ + public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, SAMProgramRecord programRecord) { + SAMFileHeader header = originalHeader.clone(); List oldRecords = header.getProgramRecords(); List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS ) + if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) newRecords.add(record); newRecords.add(programRecord); header.setProgramRecords(newRecords); + return header; + } + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and returns + * the new header to be added to the BAM writer. + * + * @param toolkit the engine + * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + * @return a pre-filled header for the bam writer + */ + public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { + final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); + return setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, programRecord); + } + + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets + * up the writer with the header and presorted status. + * + * @param writer BAM file writer + * @param toolkit the engine + * @param preSorted whether or not the writer can assume reads are going to be added are already sorted + * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + */ + public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { + SAMFileHeader header = setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, walker, PROGRAM_RECORD_NAME); writer.writeHeader(header); writer.setPresorted(preSorted); } - + + + /** + * Creates a program record (@PG) tag + * + * @param toolkit the engine + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + * @return a program record for the tool + */ public static SAMProgramRecord createProgramRecord(GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) { final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); @@ -858,4 +904,5 @@ public class Utils { } return subLists; } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index 83d1c99bf..cdf70884c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -28,11 +28,14 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; -import java.util.*; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; /** * Created by IntelliJ IDEA. @@ -138,21 +141,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { File f = new File(outName); - SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); - header.setSortOrder(order); - - if ( programRecord != null ) { - // --->> add program record - List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) - newRecords.add(record); - } - newRecords.add(programRecord); - header.setProgramRecords(newRecords); - // <-- add program record ends here - } + SAMFileHeader header = Utils.setupWriter(toolkit, toolkit.getSAMFileHeader(id), KEEP_ALL_PG_RECORDS, programRecord); SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(indexOnTheFly); factory.setCreateMd5File(generateMD5); From 843384e43539eb9d41f5449cb2408fe3dd52cbb9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Nov 2012 11:47:09 -0500 Subject: [PATCH 50/59] Rename hg19 files in bundle to b37 since that's what they are --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 24ab50451..dc6cae197 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -147,13 +147,13 @@ class GATKResourcesBundle extends QScript { // // example call set for wiki tutorial // - addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf", + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.b37.subset.vcf", "NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true)) // // Test BAM file, specific to each reference // - addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam", + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.b37.20.bam", "IGNORE", b37, false, false)) // From 89bbe73a43c6e6ebb30ee564f7b04a432139f716 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 14 Nov 2012 14:39:04 -0500 Subject: [PATCH 51/59] Commenting out CMI pipeline test that wasn't meant to be in GATK repository (why was this merged??) --- .../gatk/walkers/annotator/VariantAnnotatorEngine.java | 6 +++++- .../gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index ee4f77752..725097ddc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -277,8 +277,12 @@ public class VariantAnnotatorEngine { if ( expression.fieldName.equals("ID") ) { if ( vc.hasID() ) infoAnnotations.put(expression.fullName, vc.getID()); + } else if (expression.fieldName.equals("ALT")) { + infoAnnotations.put(expression.fullName, vc.getAlternateAllele(0).getDisplayString()); + } else if ( vc.hasAttribute(expression.fieldName) ) { - infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 879a46ab0..22ed95365 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -190,8 +190,8 @@ public class UnifiedGenotyperEngine { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); - // else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) - // results.add(generateEmptyContext(tracker, refContext, null, rawContext)); + else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) + results.add(generateEmptyContext(tracker, refContext, null, rawContext)); } } From a68e6810c90711d88d2a829d27818034f721eb12 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 14 Nov 2012 14:45:15 -0500 Subject: [PATCH 52/59] Back off experimental code that escaped last commit, not for general use yet --- .../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 22ed95365..80bc04845 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -190,8 +190,9 @@ public class UnifiedGenotyperEngine { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); - else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) - results.add(generateEmptyContext(tracker, refContext, null, rawContext)); +// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set +// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) +// results.add(generateEmptyContext(tracker, refContext, null, rawContext)); } } From b70fd4a2426a21f375776c1de54540a562539423 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 14 Nov 2012 11:08:48 -0500 Subject: [PATCH 53/59] Initial testing of the Active Region Traversal contract - TODO: many more tests and test cases --- .../traversals/TraverseActiveRegions.java | 5 +- .../utils/activeregion/ActivityProfile.java | 5 + .../traversals/TraverseActiveRegionsTest.java | 126 ++++++++++++++++++ 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index a2c37944a..4fe83f331 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -34,6 +34,9 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); private final LinkedHashSet myReads = new LinkedHashSet(); + // package access for unit testing + ActivityProfile profile; + @Override public String getTraversalUnits() { return "active regions"; @@ -53,7 +56,7 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index e96eb843d..38cfbb38d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -103,6 +103,11 @@ public class ActivityProfile { isActiveList.add(result); } + // for unit testing + public List getActiveList() { + return isActiveList; + } + public int size() { return isActiveList.size(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java new file mode 100644 index 000000000..8740a8b68 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -0,0 +1,126 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.testng.Assert; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.executive.WindowMaker; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 11/13/12 + * Time: 2:47 PM + */ +public class TraverseActiveRegionsTest extends BaseTest { + + private class DummyActiveRegionWalker extends ActiveRegionWalker { + private final double prob; + + public DummyActiveRegionWalker() { + this.prob = 1.0; + } + + @Override + public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return new ActivityProfileResult(ref.getLocus(), prob); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + private final TraverseActiveRegions t = new TraverseActiveRegions(); + + private IndexedFastaSequenceFile reference; + private GenomeLocParser genomeLocParser; + private ActiveRegionWalker walker; + + @BeforeClass + private void init() throws FileNotFoundException { + reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); + SAMSequenceDictionary dictionary = reference.getSequenceDictionary(); + genomeLocParser = new GenomeLocParser(dictionary); + } + + @Test + public void testAllIntervalsSeen() throws Exception { + List intervals = new ArrayList(); + GenomeLoc interval = genomeLocParser.createGenomeLoc("1", 1, 1); + intervals.add(interval); + + LocusShardDataProvider dataProvider = createDataProvider(intervals); + + t.traverse(walker, dataProvider, 0); + + boolean allGenomeLocsSeen = true; + for (GenomeLoc loc : intervals) { + boolean thisGenomeLocSeen = false; + for (ActivityProfileResult active : t.profile.getActiveList()) { + if (loc.equals(active.getLoc())) { + thisGenomeLocSeen = true; + break; + } + } + if (!thisGenomeLocSeen) { + allGenomeLocsSeen = false; + break; + } + } + + Assert.assertTrue(allGenomeLocsSeen, "Some intervals missing from activity profile"); + } + + private LocusShardDataProvider createDataProvider(List intervals) { + walker = new DummyActiveRegionWalker(); + + StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(new ArrayList()); + Shard shard = new MockLocusShard(genomeLocParser, intervals); + WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + + GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + //engine.setReferenceDataSource(reference); + engine.setGenomeLocParser(genomeLocParser); + t.initialize(engine); + + return new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList()); + } +} From 855a68ae39c1f31623f9561af4f9e2b4e85a14ab Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 08:06:58 -0500 Subject: [PATCH 54/59] Testing out the new github-hosted repos. Please ignore. --- testfile | 1 + 1 file changed, 1 insertion(+) create mode 100644 testfile diff --git a/testfile b/testfile new file mode 100644 index 000000000..6de7b8c69 --- /dev/null +++ b/testfile @@ -0,0 +1 @@ +This is a test file. From 2a16d6fa55140ff6835c57a737586fbb18d0452b Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 08:07:19 -0500 Subject: [PATCH 55/59] Revert "Testing out the new github-hosted repos. Please ignore." This reverts commit b6bf66cd088754e7fd3d5f105ca8b2551237f183. --- testfile | 1 - 1 file changed, 1 deletion(-) delete mode 100644 testfile diff --git a/testfile b/testfile deleted file mode 100644 index 6de7b8c69..000000000 --- a/testfile +++ /dev/null @@ -1 +0,0 @@ -This is a test file. From 9fc63efc307475f6b36ffad7e425b90bcab817cf Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 09:34:15 -0500 Subject: [PATCH 56/59] Second test of new repos. Please ignore. --- testfile | 1 + 1 file changed, 1 insertion(+) create mode 100644 testfile diff --git a/testfile b/testfile new file mode 100644 index 000000000..524acfffa --- /dev/null +++ b/testfile @@ -0,0 +1 @@ +Test file From e1a5c3ce7a97e8eb92c64cc144340c81d01c9903 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 09:34:32 -0500 Subject: [PATCH 57/59] Revert "Second test of new repos. Please ignore." This reverts commit 077532d870ddf53ec514b98f14534ca7dbf55331. --- testfile | 1 - 1 file changed, 1 deletion(-) delete mode 100644 testfile diff --git a/testfile b/testfile deleted file mode 100644 index 524acfffa..000000000 --- a/testfile +++ /dev/null @@ -1 +0,0 @@ -Test file