From 2d4b00833b3d0e26b2bf9d8f016f4001bc86fcce Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Sun, 9 Sep 2012 20:35:45 -0400 Subject: [PATCH 01/19] Bug fix for logging likelihoods in new read allele map: reads which were filtered out were being excluded from map, but they should be included in annotations --- .../LikelihoodCalculationEngine.java | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index d04c1a9e2..69af66185 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -346,20 +346,15 @@ public class LikelihoodCalculationEngine { } } } -/* // add all filtered reads to the NO_CALL list because they weren't given any likelihoods - List readList = alleleReadMap.get(Allele.NO_CALL); - if( readList == null ) { - readList = new ArrayList(); - alleleReadMap.put(Allele.NO_CALL, readList); - } - */ - /* for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { + // add all filtered reads to the NO_CALL list because they weren't given any likelihoods + for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { // only count the read if it overlaps the event, otherwise it is not added to the output read list at all if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { - readList.add(read); + for( final Allele a : call.getFirst().getAlleles() ) + likelihoodMap.add(read,a,0.0); } } - */ + returnMap.put(sample.getKey(), likelihoodMap); } From ac8a4dfc2d57c4797452d2229bda6ccdcb439763 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 10 Sep 2012 15:04:06 -0400 Subject: [PATCH 02/19] The 
comprehensive LIBS unit test is now truly comprehensive (or it would be if LIBS wasn't busted). The test can handle a read with any arbitrary legal CIGAR and iterates over the elements/bases in time with the real LIBS, failing if there are any differences. I've left the few hard-coded CIGARs in there for now with a note to move to all possible permutations once we move to fix LIBS (otherwise the tests would fail now). --- .../LocusIteratorByStateUnitTest.java | 172 ++++++++++++++---- 1 file changed, 132 insertions(+), 40 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index fbc063ab6..a5ead5665 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -1,8 +1,6 @@ package org.broadinstitute.sting.gatk.iterators; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; +import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.ReadProperties; @@ -40,7 +38,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } - private final LocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { + private LocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); } @@ -262,45 +260,36 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // comprehensive LIBS/PileupElement tests // //////////////////////////////////////////// - private 
static final int IS_BEFORE_DELETED_BASE_FLAG = 1; - private static final int IS_BEFORE_DELETION_START_FLAG = 2; - private static final int IS_AFTER_DELETED_BASE_FLAG = 4; - private static final int IS_AFTER_DELETION_END_FLAG = 8; - private static final int IS_BEFORE_INSERTION_FLAG = 16; - private static final int IS_AFTER_INSERTION_FLAG = 32; - private static final int IS_NEXT_TO_SOFTCLIP_FLAG = 64; - private static class LIBSTest { final String cigar; final int readLength; - final List offsets; - final List flags; - private LIBSTest(final String cigar, final int readLength, final List offsets, final List flags) { + private LIBSTest(final String cigar, final int readLength) { this.cigar = cigar; this.readLength = readLength; - this.offsets = offsets; - this.flags = flags; } } @DataProvider(name = "LIBSTest") public Object[][] createLIBSTestData() { + + //TODO -- when LIBS is fixed this should be replaced to provide all possible permutations of CIGAR strings + return new Object[][]{ - {new LIBSTest("1I", 1, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, - {new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, - {new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))}, - {new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("1I", 1)}, + {new LIBSTest("10I", 10)}, + {new LIBSTest("2M2I2M", 6)}, + {new LIBSTest("2M2I", 4)}, //TODO -- uncomment these when LIBS is fixed //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))}, - {new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), 
Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))}, - {new LIBSTest("1S1M", 2, Arrays.asList(1), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, - {new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, - {new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))} + //{new LIBSTest("1M2D2M", 3)}, + {new LIBSTest("1S1M", 2)}, + {new LIBSTest("1M1S", 2)}, + {new LIBSTest("1S1M1I", 3)} }; } @@ -315,26 +304,24 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // create the iterator by state with the fake reads and fake records li = makeLTBS(Arrays.asList(read), createTestReadProperties()); + final LIBS_position tester = new LIBS_position(read); - int offset = 0; while ( li.hasNext() ) { AlignmentContext alignmentContext = li.next(); ReadBackedPileup p = alignmentContext.getBasePileup(); Assert.assertTrue(p.getNumberOfElements() == 1); PileupElement pe = p.iterator().next(); - final int flag = params.flags.get(offset); - Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0); - Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0); - Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0); - Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0); - Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0); - Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0); - Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0); + tester.stepForwardOnGenome(); - Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue()); - - offset++; + Assert.assertEquals(pe.isBeforeDeletedBase(), tester.isBeforeDeletedBase); + Assert.assertEquals(pe.isBeforeDeletionStart(), 
tester.isBeforeDeletionStart); + Assert.assertEquals(pe.isAfterDeletedBase(), tester.isAfterDeletedBase); + Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd); + Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion); + Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion); + Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip); + Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset()); } } @@ -366,9 +353,7 @@ class FakeCloseableIterator implements CloseableIterator { } @Override - public void close() { - return; - } + public void close() {} @Override public boolean hasNext() { @@ -385,3 +370,110 @@ class FakeCloseableIterator implements CloseableIterator { throw new UnsupportedOperationException("Don't remove!"); } } + + +final class LIBS_position { + + SAMRecord read; + + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; + + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + + boolean sawMop = false; + + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } + + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } + + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. 
+ */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) + return false; + + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; + + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; + break; + + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } + + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || 
(!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; + } + + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } + + private static boolean isAfterOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + } +} From 0b717e2e2e82f9e2a3e8c1db2c18ca9f947b27b6 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Mon, 10 Sep 2012 15:32:41 -0400 Subject: [PATCH 04/19] Separated out the DoC calculations from the XHMM pipeline, so that CalcDepthOfCoverage can be used for calculating joint coverage on a per-base accounting over multiple samples (e.g., family samples) --- .../IntervalOverlappingRODsFromStream.java | 143 ---- .../gatk/downsampling/DownsampleType.java | 14 - .../gatk/downsampling/DownsamplingMethod.java | 153 ----- .../FractionalDownsamplerFactory.java | 45 -- .../downsampling/LevelingDownsampler.java | 212 ------ .../PerSampleDownsamplingReadsIterator.java | 202 ------ .../downsampling/ReadsDownsamplerFactory.java | 37 - .../ReservoirDownsamplerFactory.java | 45 -- .../SimplePositionalDownsampler.java | 169 ----- .../SimplePositionalDownsamplerFactory.java | 45 -- .../iterators/LegacyDownsampleIterator.java | 52 -- .../LocusIteratorByStateExperimental.java | 649 ------------------ 
.../sting/gatk/iterators/ReadTransformer.java | 144 ---- .../gatk/iterators/ReadTransformersMode.java | 28 - .../sting/gatk/samples/Trio.java | 45 -- .../gatk/traversals/TraverseLociBase.java | 103 --- .../gatk/traversals/TraverseLociLinear.java | 47 -- .../gatk/traversals/TraverseLociNano.java | 205 ------ .../gatk/traversals/TraverseReadsNano.java | 234 ------- .../sting/gatk/walkers/NanoSchedulable.java | 31 - .../fasta/FastaAlternateReferenceMaker.java | 133 ---- .../walkers/fasta/FastaReferenceMaker.java | 127 ---- .../sting/utils/baq/BAQReadTransformer.java | 49 -- .../utils/baq/ReadTransformingIterator.java | 44 -- .../nanoScheduler/BlockingQueueValue.java | 82 --- .../utils/nanoScheduler/FutureValue.java | 45 -- .../utils/nanoScheduler/InputProducer.java | 62 -- .../sting/utils/nanoScheduler/MapResult.java | 36 - .../utils/nanoScheduler/NSMapFunction.java | 19 - .../nanoScheduler/NSProgressFunction.java | 12 - .../utils/nanoScheduler/NSReduceFunction.java | 18 - .../utils/nanoScheduler/NanoScheduler.java | 392 ----------- .../utils/nanoScheduler/ReducerThread.java | 65 -- .../sting/utils/recalibration/BQSRMode.java | 30 - .../recalibration/BQSRReadTransformer.java | 40 -- .../sam/ArtificialMultiSampleReadStream.java | 86 --- .../sam/ArtificialSingleSampleReadStream.java | 212 ------ ...ificialSingleSampleReadStreamAnalyzer.java | 281 -------- .../EfficiencyMonitoringThreadFactory.java | 158 ----- .../utils/threading/NamedThreadFactory.java | 26 - .../threading/ThreadEfficiencyMonitor.java | 207 ------ .../InvalidArgumentIntegrationTest.java | 41 -- .../LevelingDownsamplerUnitTest.java | 163 ----- ...mpleDownsamplingReadsIteratorUnitTest.java | 298 -------- ...ificialSingleSampleReadStreamAnalyzer.java | 126 ---- .../ReservoirDownsamplerUnitTest.java | 129 ---- .../SimplePositionalDownsamplerUnitTest.java | 330 --------- ...usIteratorByStateExperimentalUnitTest.java | 546 --------------- .../LegacyReservoirDownsamplerUnitTest.java | 166 ----- 
.../nanoScheduler/InputProducerUnitTest.java | 71 -- .../nanoScheduler/NanoSchedulerUnitTest.java | 182 ----- .../nanoScheduler/ReducerThreadUnitTest.java | 94 --- ...ificialSingleSampleReadStreamUnitTest.java | 161 ----- ...ciencyMonitoringThreadFactoryUnitTest.java | 184 ----- 54 files changed, 7218 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java delete mode 100755 
public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java delete mode 100644 
public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java delete mode 100644 public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java delete mode 100644 public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java delete mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java delete mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java delete mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java delete mode 100644 
public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java delete mode 100755 public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java deleted file mode 100644 index 1e39d6836..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java +++ /dev/null @@ -1,143 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.providers; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.util.Collection; -import java.util.LinkedList; -import java.util.ListIterator; - -/** - * Key algorithmic helper for ReadBasedReferenceOrderedData - * - * Takes a single iterator of features, and provides a single capability that returns - * the list of RODs that overlap an interval. Allows sequential getOverlapping calls - * from intervals provided that these intervals always have increasing getStart() values. 
- * - */ -class IntervalOverlappingRODsFromStream { - /** - * Only held for QC purposes - */ - GenomeLoc lastQuery = null; - - private final String name; - private final LinkedList currentFeatures = new LinkedList(); - private final PeekableIterator futureFeatures; - - /** - * Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and - * returns RODRecordLists having name - * - * @param name - * @param futureFeatures - */ - IntervalOverlappingRODsFromStream(final String name, final PeekableIterator futureFeatures) { - if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null"); - - this.name = name; - this.futureFeatures = futureFeatures; - } - - /** - * Get the list of RODs overlapping loc from this stream of RODs. - * - * Sequential calls to this function must obey the rule that loc2.getStart >= loc1.getStart - * - * @param loc the interval to query - * @return a non-null RODRecordList containing the overlapping RODs, which may be empty - */ - @Ensures({"overlaps(loc, result)", - "! futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)", - "result != null"}) - public RODRecordList getOverlapping(final GenomeLoc loc) { - if ( lastQuery != null && loc.getStart() < lastQuery.getStart() ) - throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery)); - - trimCurrentFeaturesToLoc(loc); - readOverlappingFutureFeatures(loc); - return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc); - } - - - /** - * For contract assurance. Checks that all bindings in loc overlap - * - * @param loc - * @param bindings - * @return - */ - @Requires({"loc != null", "bindings != null"}) - private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) { - for ( final GATKFeature feature : bindings ) - if ( ! 
feature.getLocation().overlapsP(loc) ) - return false; - return true; - } - - /** - * Subset the features in all to those that overlap with loc - * - * The current features list contains everything read that cannot be thrown away yet, but not - * everything in there necessarily overlaps with loc. Subset to just those that do overlap - * - * @param loc the location that features must overlap - * @param all the list of all features - * @return a subset of all that overlaps with loc - */ - @Requires({"loc != null", "all != null"}) - @Ensures("result.size() <= all.size()") - private Collection subsetToOverlapping(final GenomeLoc loc, final Collection all) { - final LinkedList overlapping = new LinkedList(); - for ( final GATKFeature feature : all ) - if ( feature.getLocation().overlapsP(loc) ) - overlapping.add(feature); - return overlapping; - } - - /** - * Update function. Remove all elements of currentFeatures that end before loc - * - * @param loc the location to use - */ - @Requires("loc != null") - @Ensures("currentFeatures.size() <= old(currentFeatures.size())") - private void trimCurrentFeaturesToLoc(final GenomeLoc loc) { - final ListIterator it = currentFeatures.listIterator(); - while ( it.hasNext() ) { - final GATKFeature feature = it.next(); - if ( feature.getLocation().isBefore(loc) ) - it.remove(); - } - } - - /** - * Update function: Read all elements from futureFeatures that overlap with loc - * - * Stops at the first element that starts before the end of loc, or the stream empties - * - * @param loc - */ - @Requires("loc != null") - @Ensures("currentFeatures.size() >= old(currentFeatures.size())") - private void readOverlappingFutureFeatures(final GenomeLoc loc) { - while ( futureFeatures.hasNext() ) { - final GenomeLoc nextLoc = futureFeatures.peek().getLocation(); - if ( nextLoc.isBefore(loc) ) { - futureFeatures.next(); // next rod element is before loc, throw it away and keep looking - } else if ( nextLoc.isPast(loc) ) { - break; // next element 
is past loc, stop looking but don't pop it - } else if ( nextLoc.overlapsP(loc) ) { - // add overlapping elements to our current features, removing from stream - for ( final GATKFeature feature : futureFeatures.next() ) { - currentFeatures.add(feature); - } - } - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java deleted file mode 100644 index c3d17436a..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java +++ /dev/null @@ -1,14 +0,0 @@ -package org.broadinstitute.sting.gatk.downsampling; - -/** - * Type of downsampling method to invoke. - * - * @author hanna - * @version 0.1 - */ - -public enum DownsampleType { - NONE, - ALL_READS, - BY_SAMPLE -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java deleted file mode 100644 index ae1d98ce0..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.exceptions.UserException; - -/** - * Describes the method for downsampling reads at a given locus. - */ - -public class DownsamplingMethod { - /** - * Type of downsampling to perform. - */ - public final DownsampleType type; - - /** - * Actual downsampling target is specified as an integer number of reads. - */ - public final Integer toCoverage; - - /** - * Actual downsampling target is specified as a fraction of total available reads. - */ - public final Double toFraction; - - /** - * Use the new experimental downsampling? - */ - public final boolean useExperimentalDownsampling; - - /** - * Expresses no downsampling applied at all. - */ - public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null,false); - - /** - * Default type to use if no type is specified - */ - public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; - - /** - * Default target coverage for locus-based traversals - */ - public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000; - - public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) { - this.type = type != null ? 
type : DEFAULT_DOWNSAMPLING_TYPE; - this.toCoverage = toCoverage; - this.toFraction = toFraction; - this.useExperimentalDownsampling = useExperimentalDownsampling; - - if ( type == DownsampleType.NONE ) { - toCoverage = null; - toFraction = null; - } - - validate(); - } - - private void validate() { - // Can't leave toFraction and toCoverage null unless type is NONE - if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null ) - throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling."); - - // Fraction and coverage cannot both be specified. - if ( toFraction != null && toCoverage != null ) - throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one."); - - // toCoverage must be > 0 when specified - if ( toCoverage != null && toCoverage <= 0 ) { - throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage"); - } - - // toFraction must be >= 0.0 and <= 1.0 when specified - if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { - throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); - } - - // Some restrictions only exist for the old downsampling implementation: - if ( ! 
useExperimentalDownsampling ) { - // By sample downsampling does not work with a fraction of reads in the old downsampling implementation - if( type == DownsampleType.BY_SAMPLE && toFraction != null ) - throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method"); - } - - // Some restrictions only exist for the new downsampling implementation: - if ( useExperimentalDownsampling ) { - if ( type == DownsampleType.ALL_READS && toCoverage != null ) { - throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation"); - } - } - } - - public String toString() { - StringBuilder builder = new StringBuilder("Downsampling Settings: "); - - if ( type == DownsampleType.NONE ) { - builder.append("No downsampling"); - } - else { - builder.append(String.format("Method: %s ", type)); - - if ( toCoverage != null ) { - builder.append(String.format("Target Coverage: %d ", toCoverage)); - } - else { - builder.append(String.format("Target Fraction: %.2f ", toFraction)); - } - - if ( useExperimentalDownsampling ) { - builder.append("Using Experimental Downsampling"); - } - } - - return builder.toString(); - } - - public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) { - if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) { - return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE, - null, useExperimentalDownsampling); - } - else { - return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java deleted file mode 100644 index 7a7c9e91e..000000000 --- 
a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMRecord; - -/** - * Factory for creating FractionalDownsamplers on demand - * - * @author David Roazen - */ -public class FractionalDownsamplerFactory implements ReadsDownsamplerFactory { - - private double fraction; - - public FractionalDownsamplerFactory( double fraction ) { - this.fraction = fraction; - } - - public ReadsDownsampler newInstance() { - return new FractionalDownsampler(fraction); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java deleted file mode 100644 index 73d69140d..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.*; - -/** - * Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from - * the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling - * does not occur until all Lists have been submitted and signalEndOfInput() is called. - * - * The Lists should be LinkedLists for maximum efficiency during item removal, however other - * kinds of Lists are also accepted (albeit at a slight performance penalty). - * - * Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface, - * the Lists need not contain reads. 
However this downsampler may not be wrapped within one of the - * DownsamplingReadsIterators - * - * @param the List type representing the stacks to be leveled - * @param the type of the elements of each List - * - * @author David Roazen - */ -public class LevelingDownsampler, E> implements Downsampler { - - private int targetSize; - - private List groups; - - private boolean groupsAreFinalized; - - private int numDiscardedItems; - - /** - * Construct a LevelingDownsampler - * - * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed - * this value -- if it does, items are removed from Lists evenly until the total size - * is <= this value - */ - public LevelingDownsampler( int targetSize ) { - this.targetSize = targetSize; - clear(); - reset(); - } - - public void submit( T item ) { - groups.add(item); - } - - public void submit( Collection items ){ - groups.addAll(items); - } - - public boolean hasFinalizedItems() { - return groupsAreFinalized && groups.size() > 0; - } - - public List consumeFinalizedItems() { - if ( ! hasFinalizedItems() ) { - return new ArrayList(); - } - - // pass by reference rather than make a copy, for speed - List toReturn = groups; - clear(); - return toReturn; - } - - public boolean hasPendingItems() { - return ! groupsAreFinalized && groups.size() > 0; - } - - public T peekFinalized() { - return hasFinalizedItems() ? groups.get(0) : null; - } - - public T peekPending() { - return hasPendingItems() ? 
groups.get(0) : null; - } - - public int getNumberOfDiscardedItems() { - return numDiscardedItems; - } - - public void signalEndOfInput() { - levelGroups(); - groupsAreFinalized = true; - } - - public void clear() { - groups = new ArrayList(); - groupsAreFinalized = false; - } - - public void reset() { - numDiscardedItems = 0; - } - - private void levelGroups() { - int totalSize = 0; - int[] groupSizes = new int[groups.size()]; - int currentGroupIndex = 0; - - for ( T group : groups ) { - groupSizes[currentGroupIndex] = group.size(); - totalSize += groupSizes[currentGroupIndex]; - currentGroupIndex++; - } - - if ( totalSize <= targetSize ) { - return; // no need to eliminate any items - } - - // We will try to remove exactly this many items, however we will refuse to allow any - // one group to fall below size 1, and so might end up removing fewer items than this - int numItemsToRemove = totalSize - targetSize; - - currentGroupIndex = 0; - int numConsecutiveUmodifiableGroups = 0; - - // Continue until we've either removed all the items we wanted to, or we can't - // remove any more items without violating the constraint that all groups must - // be left with at least one item - while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { - if ( groupSizes[currentGroupIndex] > 1 ) { - groupSizes[currentGroupIndex]--; - numItemsToRemove--; - numConsecutiveUmodifiableGroups = 0; - } - else { - numConsecutiveUmodifiableGroups++; - } - - currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length; - } - - // Now we actually go through and reduce each group to its new count as specified in groupSizes - currentGroupIndex = 0; - for ( T group : groups ) { - downsampleOneGroup(group, groupSizes[currentGroupIndex]); - currentGroupIndex++; - } - } - - private void downsampleOneGroup( T group, int numItemsToKeep ) { - if ( numItemsToKeep >= group.size() ) { - return; - } - - numDiscardedItems += group.size() - numItemsToKeep; - - BitSet 
itemsToKeep = new BitSet(group.size()); - for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) { - itemsToKeep.set(selectedIndex); - } - - int currentIndex = 0; - - // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator - if ( group instanceof LinkedList ) { - Iterator iter = group.iterator(); - while ( iter.hasNext() ) { - iter.next(); - - if ( ! itemsToKeep.get(currentIndex) ) { - iter.remove(); - } - - currentIndex++; - } - } - // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather - // than suffer O(n^2) of item shifting - else { - List keptItems = new ArrayList(numItemsToKeep); - - for ( E item : group ) { - if ( itemsToKeep.get(currentIndex) ) { - keptItems.add(item); - } - currentIndex++; - } - group.clear(); - group.addAll(keptItems); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java deleted file mode 100644 index 8b2034460..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMRecordComparator; -import net.sf.samtools.SAMRecordCoordinateComparator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; - -import java.util.*; - - -/** - * StingSAMIterator wrapper around our generic reads downsampler interface - * that downsamples reads for each sample independently, and then re-assembles - * the reads back into a single merged stream. - * - * @author David Roazen - */ -public class PerSampleDownsamplingReadsIterator implements StingSAMIterator { - - private StingSAMIterator nestedSAMIterator; - private ReadsDownsamplerFactory downsamplerFactory; - private Map> perSampleDownsamplers; - private PriorityQueue orderedDownsampledReadsCache; - private SAMRecord nextRead = null; - private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator(); - private SAMRecord earliestPendingRead = null; - private ReadsDownsampler earliestPendingDownsampler = null; - - // Initial size of our cache of finalized reads - private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096; - - // The number of positional changes that can occur in the read stream before all downsamplers - // should be informed of the current position (guards against samples with relatively sparse reads - // getting stuck in a pending state): - private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3; // TODO: experiment with this value 
- - /** - * @param iter wrapped iterator from which this iterator will pull reads - * @param downsamplerFactory factory used to create new downsamplers as needed - */ - public PerSampleDownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsamplerFactory downsamplerFactory ) { - nestedSAMIterator = iter; - this.downsamplerFactory = downsamplerFactory; - perSampleDownsamplers = new HashMap>(); - orderedDownsampledReadsCache = new PriorityQueue(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator); - - advanceToNextRead(); - } - - public boolean hasNext() { - return nextRead != null; - } - - public SAMRecord next() { - if ( nextRead == null ) { - throw new NoSuchElementException("next() called when there are no more items"); - } - - SAMRecord toReturn = nextRead; - advanceToNextRead(); - - return toReturn; - } - - private void advanceToNextRead() { - if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) { - nextRead = null; - } - else { - nextRead = orderedDownsampledReadsCache.poll(); - } - } - - private boolean readyToReleaseReads() { - if ( orderedDownsampledReadsCache.isEmpty() ) { - return false; - } - - return earliestPendingRead == null || - readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0; - } - - private void updateEarliestPendingRead( ReadsDownsampler currentDownsampler ) { - // If there is no recorded earliest pending read and this downsampler has pending items, - // then this downsampler's first pending item becomes the new earliest pending read: - if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) { - earliestPendingRead = currentDownsampler.peekPending(); - earliestPendingDownsampler = currentDownsampler; - } - // In all other cases, we only need to update the earliest pending read when the downsampler - // associated with it experiences a change in its pending reads, since by assuming a sorted - // read stream we're assured that each downsampler's earliest pending read will only 
increase - // in genomic position over time. - // - // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers - // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))), - // TODO: but need to verify this empirically. - else if ( currentDownsampler == earliestPendingDownsampler && - (! currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) { - - earliestPendingRead = null; - earliestPendingDownsampler = null; - for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { - if ( perSampleDownsampler.hasPendingItems() && - (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) { - - earliestPendingRead = perSampleDownsampler.peekPending(); - earliestPendingDownsampler = perSampleDownsampler; - } - } - } - } - - private boolean fillDownsampledReadsCache() { - SAMRecord prevRead = null; - int numPositionalChanges = 0; - - // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue - // can be released without violating global sort order - while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) { - SAMRecord read = nestedSAMIterator.next(); - String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; - - ReadsDownsampler thisSampleDownsampler = perSampleDownsamplers.get(sampleName); - if ( thisSampleDownsampler == null ) { - thisSampleDownsampler = downsamplerFactory.newInstance(); - perSampleDownsamplers.put(sampleName, thisSampleDownsampler); - } - - thisSampleDownsampler.submit(read); - updateEarliestPendingRead(thisSampleDownsampler); - - if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) { - numPositionalChanges++; - } - - // If the number of times we've changed position exceeds a certain threshold, inform all - // downsamplers of the current position in the read stream. This is to prevent downsamplers - // for samples with sparser reads than others from getting stuck too long in a pending state. - if ( numPositionalChanges > DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL ) { - for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { - perSampleDownsampler.signalNoMoreReadsBefore(read); - updateEarliestPendingRead(perSampleDownsampler); - } - } - - prevRead = read; - } - - if ( ! 
nestedSAMIterator.hasNext() ) { - for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { - perSampleDownsampler.signalEndOfInput(); - } - earliestPendingRead = null; - earliestPendingDownsampler = null; - } - - for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { - if ( perSampleDownsampler.hasFinalizedItems() ) { - orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems()); - } - } - - return readyToReleaseReads(); - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } - - public void close() { - nestedSAMIterator.close(); - } - - public Iterator iterator() { - return this; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java deleted file mode 100644 index 2fa32497b..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMRecord; - -/** - * A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular - * downsampler, all sharing the same construction parameters. - * - * @author David Roazen - */ -public interface ReadsDownsamplerFactory { - public ReadsDownsampler newInstance(); -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java deleted file mode 100644 index 040f0c788..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMRecord; - -/** - * Factory for creating ReservoirDownsamplers on demand - * - * @author David Roazen - */ -public class ReservoirDownsamplerFactory implements ReadsDownsamplerFactory { - - private int targetSampleSize; - - public ReservoirDownsamplerFactory( int targetSampleSize ) { - this.targetSampleSize = targetSampleSize; - } - - public ReadsDownsampler newInstance() { - return new ReservoirDownsampler(targetSampleSize); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java deleted file mode 100644 index 30affc2b3..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMRecord; - -import java.util.*; - -/** - * Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage - * using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time. - * - * @author David Roazen - */ -public class SimplePositionalDownsampler implements ReadsDownsampler { - - private int targetCoverage; - - private ReservoirDownsampler reservoir; - - private int currentContigIndex; - - private int currentAlignmentStart; - - private boolean positionEstablished; - - private boolean unmappedReadsReached; - - private ArrayList finalizedReads; - - private int numDiscardedItems; - - /** - * Construct a SimplePositionalDownsampler - * - * @param targetCoverage Maximum number of reads that may share any given alignment start position - */ - public SimplePositionalDownsampler( int targetCoverage ) { - this.targetCoverage = targetCoverage; - reservoir = new ReservoirDownsampler(targetCoverage); - finalizedReads = new ArrayList(); - clear(); - reset(); - } - - public void submit( T newRead ) { - updatePositionalState(newRead); - - if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream - finalizedReads.add(newRead); - } - else { - int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems(); - reservoir.submit(newRead); - numDiscardedItems += 
reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems; - } - } - - public void submit( Collection newReads ) { - for ( T read : newReads ) { - submit(read); - } - } - - public boolean hasFinalizedItems() { - return finalizedReads.size() > 0; - } - - public List consumeFinalizedItems() { - // pass by reference rather than make a copy, for speed - List toReturn = finalizedReads; - finalizedReads = new ArrayList(); - return toReturn; - } - - public boolean hasPendingItems() { - return reservoir.hasFinalizedItems(); - } - - public T peekFinalized() { - return finalizedReads.isEmpty() ? null : finalizedReads.get(0); - } - - public T peekPending() { - return reservoir.peekFinalized(); - } - - public int getNumberOfDiscardedItems() { - return numDiscardedItems; - } - - public void signalEndOfInput() { - finalizeReservoir(); - } - - public void clear() { - reservoir.clear(); - reservoir.reset(); - finalizedReads.clear(); - positionEstablished = false; - unmappedReadsReached = false; - } - - public void reset() { - numDiscardedItems = 0; - } - - public boolean requiresCoordinateSortOrder() { - return true; - } - - public void signalNoMoreReadsBefore( T read ) { - updatePositionalState(read); - } - - private void updatePositionalState( T newRead ) { - if ( readIsPastCurrentPosition(newRead) ) { - if ( reservoir.hasFinalizedItems() ) { - finalizeReservoir(); - } - - setCurrentPosition(newRead); - - if ( newRead.getReadUnmappedFlag() ) { - unmappedReadsReached = true; - } - } - } - - private void setCurrentPosition( T read ) { - currentContigIndex = read.getReferenceIndex(); - currentAlignmentStart = read.getAlignmentStart(); - positionEstablished = true; - } - - private boolean readIsPastCurrentPosition( T read ) { - return ! positionEstablished || - read.getReferenceIndex() > currentContigIndex || - read.getAlignmentStart() > currentAlignmentStart || - (read.getReadUnmappedFlag() && ! 
unmappedReadsReached); - } - - private void finalizeReservoir() { - finalizedReads.addAll(reservoir.consumeFinalizedItems()); - reservoir.reset(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java deleted file mode 100644 index fcc18b16b..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMRecord; - -/** - * Factory for creating SimplePositionalDownsamplers on demand - * - * @author David Roazen - */ -public class SimplePositionalDownsamplerFactory implements ReadsDownsamplerFactory { - - private int targetCoverage; - - public SimplePositionalDownsamplerFactory( int targetCoverage ) { - this.targetCoverage = targetCoverage; - } - - public ReadsDownsampler newInstance() { - return new SimplePositionalDownsampler(targetCoverage); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java deleted file mode 100755 index c0de06b49..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.broadinstitute.sting.gatk.iterators; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; - -import java.util.Iterator; - - -public class LegacyDownsampleIterator implements StingSAMIterator { - - StingSAMIterator it; - int cutoff; - SAMRecord next; - - public LegacyDownsampleIterator(StingSAMIterator it, double fraction) { - this.it = it; - cutoff = (int)(fraction * 10000); - next = getNextRecord(); - } - - public boolean hasNext() { - return next != null; - } - - public SAMRecord next() { - SAMRecord result = next; - next = getNextRecord(); - return result; - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } - - private SAMRecord getNextRecord() { - while ( true ) { - if ( !it.hasNext() ) - return null; - SAMRecord rec = it.next(); - if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoff ) - return rec; - } - } - - public void close() { - it.close(); - } - - public Iterator iterator() { - 
return this; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java deleted file mode 100755 index 557cbd009..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java +++ /dev/null @@ -1,649 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.iterators; - -import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.ReadProperties; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.Downsampler; -import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.util.*; - -/** - * Iterator that traverses a SAM File, accumulating information on a per-locus basis - */ -public class LocusIteratorByStateExperimental extends LocusIterator { - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(LocusIteratorByState.class); - - // ----------------------------------------------------------------------------------------------------------------- - // - // member fields - // - // ----------------------------------------------------------------------------------------------------------------- - - /** - * Used to create new GenomeLocs. 
- */ - private final GenomeLocParser genomeLocParser; - private final ArrayList samples; - private final ReadStateManager readStates; - - protected static class SAMRecordState { - SAMRecord read; - int readOffset = -1; // how far are we offset from the start of the read bases? - int genomeOffset = -1; // how far are we offset from the alignment start on the genome? - - Cigar cigar = null; - int cigarOffset = -1; - CigarElement curElement = null; - int nCigarElements = 0; - - int cigarElementCounter = -1; // how far are we into a single cigarElement - - // The logical model for generating extended events is as follows: the "record state" implements the traversal - // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This - // can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the - // deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or - // if the deletion just started *right before* the current reference base the record state is pointing to upon the return from - // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended - // events immediately preceding the current reference base). - - public SAMRecordState(SAMRecord read) { - this.read = read; - cigar = read.getCigar(); - nCigarElements = cigar.numCigarElements(); - - //System.out.printf("Creating a SAMRecordState: %s%n", this); - } - - public SAMRecord getRead() { - return read; - } - - /** - * What is our current offset in the read's bases that aligns us with the reference genome? - * - * @return - */ - public int getReadOffset() { - return readOffset; - } - - /** - * What is the current offset w.r.t. the alignment state that aligns us to the readOffset? 
- * - * @return - */ - public int getGenomeOffset() { - return genomeOffset; - } - - public int getGenomePosition() { - return read.getAlignmentStart() + getGenomeOffset(); - } - - public GenomeLoc getLocation(GenomeLocParser genomeLocParser) { - return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition()); - } - - public CigarOperator getCurrentCigarOperator() { - return curElement.getOperator(); - } - - public String toString() { - return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement); - } - - public CigarElement peekForwardOnGenome() { - return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement ); - } - - public CigarElement peekBackwardOnGenome() { - return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement ); - } - - - public CigarOperator stepForwardOnGenome() { - // we enter this method with readOffset = index of the last processed base on the read - // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion - - - if (curElement == null || ++cigarElementCounter > curElement.getLength()) { - cigarOffset++; - if (cigarOffset < nCigarElements) { - curElement = cigar.getCigarElement(cigarOffset); - cigarElementCounter = 0; - // next line: guards against cigar elements of length 0; when new cigar element is retrieved, - // we reenter in order to re-check cigarElementCounter against curElement's length - return stepForwardOnGenome(); - } else { - if (curElement != null && curElement.getOperator() == CigarOperator.D) - throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. 
If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); - - // Reads that contain indels model the genomeOffset as the following base in the reference. Because - // we fall into this else block only when indels end the read, increment genomeOffset such that the - // current offset of this read is the next ref base after the end of the indel. This position will - // model a point on the reference somewhere after the end of the read. - genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here: - // we do step forward on the ref, and by returning null we also indicate that we are past the read end. - - return null; - } - } - - boolean done = false; - switch (curElement.getOperator()) { - case H: // ignore hard clips - case P: // ignore pads - cigarElementCounter = curElement.getLength(); - break; - case I: // insertion w.r.t. the reference - case S: // soft clip - cigarElementCounter = curElement.getLength(); - readOffset += curElement.getLength(); - break; - case D: // deletion w.r.t. the reference - if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string - throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); - // should be the same as N case - genomeOffset++; - done = true; - break; - case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - genomeOffset++; - done = true; - break; - case M: - case EQ: - case X: - readOffset++; - genomeOffset++; - done = true; - break; - default: - throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator()); - } - - return done ? 
curElement.getOperator() : stepForwardOnGenome(); - } - } - - //final boolean DEBUG = false; - //final boolean DEBUG2 = false && DEBUG; - private ReadProperties readInfo; - private AlignmentContext nextAlignmentContext; - private boolean performLevelingDownsampling; - - // ----------------------------------------------------------------------------------------------------------------- - // - // constructors and other basic operations - // - // ----------------------------------------------------------------------------------------------------------------- - - public LocusIteratorByStateExperimental(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { - this.readInfo = readInformation; - this.genomeLocParser = genomeLocParser; - this.samples = new ArrayList(samples); - this.readStates = new ReadStateManager(samIterator); - - this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null && - readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && - readInfo.getDownsamplingMethod().toCoverage != null; - - // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when - // there's no read data. So we need to throw this error only when samIterator.hasNext() is true - if (this.samples.isEmpty() && samIterator.hasNext()) { - throw new IllegalArgumentException("samples list must not be empty"); - } - } - - /** - * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list - * for the system. 
- */ - public final static Collection sampleListForSAMWithoutReadGroups() { - List samples = new ArrayList(); - samples.add(null); - return samples; - } - - public Iterator iterator() { - return this; - } - - public void close() { - //this.it.close(); - } - - public boolean hasNext() { - lazyLoadNextAlignmentContext(); - return (nextAlignmentContext != null); - //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r); - } - - private GenomeLoc getLocation() { - return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser); - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // next() routine and associated collection operations - // - // ----------------------------------------------------------------------------------------------------------------- - public AlignmentContext next() { - lazyLoadNextAlignmentContext(); - if (!hasNext()) - throw new NoSuchElementException("LocusIteratorByState: out of elements."); - AlignmentContext currentAlignmentContext = nextAlignmentContext; - nextAlignmentContext = null; - return currentAlignmentContext; - } - - /** - * Creates the next alignment context from the given state. Note that this is implemented as a lazy load method. - * nextAlignmentContext MUST BE null in order for this method to advance to the next entry. - */ - private void lazyLoadNextAlignmentContext() { - while (nextAlignmentContext == null && readStates.hasNext()) { - readStates.collectPendingReads(); - - final GenomeLoc location = getLocation(); - final Map fullPileup = new HashMap(); - - // TODO: How can you determine here whether the current pileup has been downsampled? 
- boolean hasBeenSampled = false; - - for (final String sample : samples) { - final Iterator iterator = readStates.iterator(sample); - final List pile = new ArrayList(readStates.size(sample)); - - int size = 0; // number of elements in this sample's pileup - int nDeletions = 0; // number of deletions in this sample's pileup - int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0) - - while (iterator.hasNext()) { - final SAMRecordState state = iterator.next(); // state object with the read/offset information - final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read - final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator - final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element - final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element - final boolean isSingleElementCigar = nextElement == lastElement; - final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator - final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator - int readOffset = state.getReadOffset(); // the base offset on this read - - final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; - final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; - final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION; - final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar; - final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()); - - int nextElementLength = nextElement.getLength(); - - if (op == CigarOperator.N) // N's are never added to any pileup - continue; - - if (op == CigarOperator.D) { - // TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix - if 
(readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so - pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1)); - size++; - nDeletions++; - if (read.getMappingQuality() == 0) - nMQ0Reads++; - } - } - else { - if (!filterBaseInRead(read, location.getStart())) { - String insertedBaseString = null; - if (nextOp == CigarOperator.I) { - final int insertionOffset = isSingleElementCigar ? 0 : 1; - // TODO -- someone please implement a better fix for the single element insertion CIGAR! - if (isSingleElementCigar) - readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases! - insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength())); - } - - pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength)); - size++; - if (read.getMappingQuality() == 0) - nMQ0Reads++; - } - } - } - - if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup - fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads)); - } - - updateReadStates(); // critical - must be called after we get the current state offsets and location - if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done - nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); - } - } - - // fast testing of position - private boolean readIsPastCurrentPosition(SAMRecord read) { - if (readStates.isEmpty()) - return false; - else { - SAMRecordState state = readStates.getFirst(); - SAMRecord ourRead = state.getRead(); - 
return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); - } - } - - /** - * Generic place to put per-base filters appropriate to LocusIteratorByState - * - * @param rec - * @param pos - * @return - */ - private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) { - return ReadUtils.isBaseInsideAdaptor(rec, pos); - } - - private void updateReadStates() { - for (final String sample : samples) { - Iterator it = readStates.iterator(sample); - while (it.hasNext()) { - SAMRecordState state = it.next(); - CigarOperator op = state.stepForwardOnGenome(); - if (op == null) { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe - // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. - it.remove(); // we've stepped off the end of the object - } - } - } - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } - - protected class ReadStateManager { - private final PeekableIterator iterator; - private final SamplePartitioner samplePartitioner; - private final Map readStatesBySample = new HashMap(); - private int totalReadStates = 0; - - public ReadStateManager(Iterator source) { - this.iterator = new PeekableIterator(source); - - for (final String sample : samples) { - readStatesBySample.put(sample, new PerSampleReadStateManager()); - } - - samplePartitioner = new SamplePartitioner(); - } - - /** - * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented - * for this iterator; if present, total read states will be decremented. - * - * @param sample The sample. - * @return Iterator over the reads associated with that sample. 
- */ - public Iterator iterator(final String sample) { - return new Iterator() { - private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - public SAMRecordState next() { - return wrappedIterator.next(); - } - - public void remove() { - wrappedIterator.remove(); - } - }; - } - - public boolean isEmpty() { - return totalReadStates == 0; - } - - /** - * Retrieves the total number of reads in the manager across all samples. - * - * @return Total number of reads over all samples. - */ - public int size() { - return totalReadStates; - } - - /** - * Retrieves the total number of reads in the manager in the given sample. - * - * @param sample The sample. - * @return Total number of reads in the given sample. - */ - public int size(final String sample) { - return readStatesBySample.get(sample).size(); - } - - public SAMRecordState getFirst() { - for (final String sample : samples) { - PerSampleReadStateManager reads = readStatesBySample.get(sample); - if (!reads.isEmpty()) - return reads.peek(); - } - return null; - } - - public boolean hasNext() { - return totalReadStates > 0 || iterator.hasNext(); - } - - public void collectPendingReads() { - if (!iterator.hasNext()) - return; - - if (readStates.size() == 0) { - int firstContigIndex = iterator.peek().getReferenceIndex(); - int firstAlignmentStart = iterator.peek().getAlignmentStart(); - while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { - samplePartitioner.submitRead(iterator.next()); - } - } else { - // Fast fail in the case that the read is past the current position. 
- if (readIsPastCurrentPosition(iterator.peek())) - return; - - while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { - samplePartitioner.submitRead(iterator.next()); - } - } - - for (final String sample : samples) { - Collection newReads = samplePartitioner.getReadsForSample(sample); - PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); - addReadsToSample(statesBySample, newReads); - } - - samplePartitioner.reset(); - } - - /** - * Add reads with the given sample name to the given hanger entry. - * - * @param readStates The list of read states to add this collection of reads. - * @param reads Reads to add. Selected reads will be pulled from this source. - */ - private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { - if (reads.isEmpty()) - return; - - Collection newReadStates = new LinkedList(); - - for (SAMRecord read : reads) { - SAMRecordState state = new SAMRecordState(read); - state.stepForwardOnGenome(); - newReadStates.add(state); - } - - readStates.addStatesAtNextAlignmentStart(newReadStates); - } - - protected class PerSampleReadStateManager implements Iterable { - private List> readStatesByAlignmentStart = new LinkedList>(); - private int thisSampleReadStates = 0; - private Downsampler> levelingDownsampler = - performLevelingDownsampling ? 
- new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : - null; - - public void addStatesAtNextAlignmentStart(Collection states) { - if ( states.isEmpty() ) { - return; - } - - readStatesByAlignmentStart.add(new LinkedList(states)); - thisSampleReadStates += states.size(); - totalReadStates += states.size(); - - if ( levelingDownsampler != null ) { - levelingDownsampler.submit(readStatesByAlignmentStart); - levelingDownsampler.signalEndOfInput(); - - thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - - // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); - levelingDownsampler.reset(); - } - } - - public boolean isEmpty() { - return readStatesByAlignmentStart.isEmpty(); - } - - public SAMRecordState peek() { - return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); - } - - public int size() { - return thisSampleReadStates; - } - - public Iterator iterator() { - return new Iterator() { - private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates = null; - private Iterator currentPositionReadStatesIterator = null; - - public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); - } - - public SAMRecordState next() { - if ( currentPositionReadStatesIterator == null || ! 
currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); - } - - public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - totalReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } - } - }; - } - } - } - - /** - * Note: stores reads by sample ID string, not by sample object - */ - private class SamplePartitioner { - private Map> readsBySample; - private long readsSeen = 0; - - public SamplePartitioner() { - readsBySample = new HashMap>(); - - for ( String sample : samples ) { - readsBySample.put(sample, new ArrayList()); - } - } - - public void submitRead(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).add(read); - readsSeen++; - } - - public long getNumReadsSeen() { - return readsSeen; - } - - public Collection getReadsForSample(String sampleName) { - if ( ! 
readsBySample.containsKey(sampleName) ) - throw new NoSuchElementException("Sample name not found"); - return readsBySample.get(sampleName); - } - - public void reset() { - for ( Collection perSampleReads : readsBySample.values() ) - perSampleReads.clear(); - readsSeen = 0; - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java deleted file mode 100644 index 28348ecc2..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java +++ /dev/null @@ -1,144 +0,0 @@ -package org.broadinstitute.sting.gatk.iterators; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * Baseclass used to describe a read transformer like BAQ and BQSR - * - * Read transformers are plugable infrastructure that modify read state - * either on input, on output, or within walkers themselves. - * - * The function apply() is called on each read seen by the GATK (after passing - * all ReadFilters) and it can do as it sees fit (without modifying the alignment) - * to the read to change qualities, add tags, etc. - * - * Initialize is called once right before the GATK traversal begins providing - * the ReadTransformer with the ability to collect and initialize data from the - * engine. - * - * Note that all ReadTransformers within the classpath are created and initialized. If one - * shouldn't be run it should look at the command line options of the engine and override - * the enabled. - * - * @since 8/31/12 - * @author depristo - */ -abstract public class ReadTransformer { - /** - * When should this read transform be applied? 
- */ - private ApplicationTime applicationTime; - - /** - * Keep track of whether we've been initialized already, and ensure it's not called more than once. - */ - private boolean initialized = false; - - protected ReadTransformer() {} - - /** - * Master initialization routine. Called to setup a ReadTransform, using it's overloaded initialialSub routine. - * - * @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself - * @param engine the engine, for initializing values - * @param walker the walker we intend to run - */ - @Requires({"initialized == false", "engine != null", "walker != null"}) - @Ensures("initialized == true") - public final void initialize(final ApplicationTime overrideTime, final GenomeAnalysisEngine engine, final Walker walker) { - if ( engine == null ) throw new IllegalArgumentException("engine cannot be null"); - if ( walker == null ) throw new IllegalArgumentException("walker cannot be null"); - - this.applicationTime = initializeSub(engine, walker); - if ( overrideTime != null ) this.applicationTime = overrideTime; - initialized = true; - } - - /** - * Subclasses must override this to initialize themeselves - * - * @param engine the engine, for initializing values - * @param walker the walker we intend to run - * @return the point of time we'd like this read transform to be run - */ - @Requires({"engine != null", "walker != null"}) - @Ensures("result != null") - protected abstract ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker); - - /** - * Should this ReadTransformer be activated? Called after initialize, which allows this - * read transformer to look at its arguments and decide if it should be active. All - * ReadTransformers must override this, as by default they are not enabled. 
- * - * @return true if this ReadTransformer should be used on the read stream - */ - public boolean enabled() { - return false; - } - - /** - * Has this transformer been initialized? - * - * @return true if it has - */ - public final boolean isInitialized() { - return initialized; - } - - /** - * When should we apply this read transformer? - * - * @return true if yes - */ - public final ApplicationTime getApplicationTime() { - return applicationTime; - } - - /** - * Primary interface function for a read transform to actually do some work - * - * The function apply() is called on each read seen by the GATK (after passing - * all ReadFilters) and it can do as it sees fit (without modifying the alignment) - * to the read to change qualities, add tags, etc. - * - * @param read the read to transform - * @return the transformed read - */ - @Requires("read != null") - @Ensures("result != null") - abstract public GATKSAMRecord apply(final GATKSAMRecord read); - - @Override - public String toString() { - return getClass().getSimpleName(); - } - - /** - * When should a read transformer be applied? 
- */ - public static enum ApplicationTime { - /** - * Walker does not tolerate this read transformer - */ - FORBIDDEN, - - /** - * apply the transformation to the incoming reads, the default - */ - ON_INPUT, - - /** - * apply the transformation to the outgoing read stream - */ - ON_OUTPUT, - - /** - * the walker will deal with the calculation itself - */ - HANDLED_IN_WALKER - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java deleted file mode 100644 index be227619f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java +++ /dev/null @@ -1,28 +0,0 @@ -package org.broadinstitute.sting.gatk.iterators; - -import java.lang.annotation.*; - -/** - * User: hanna - * Date: May 14, 2009 - * Time: 1:51:22 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * Allows the walker to indicate what type of data it wants to consume. 
- */ - -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface ReadTransformersMode { - public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java deleted file mode 100644 index 314baad3d..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java +++ /dev/null @@ -1,45 +0,0 @@ -package org.broadinstitute.sting.gatk.samples; - -/** - * A class for imposing a trio structure on three samples; a common paradigm - * - * todo -- there should probably be an interface or abstract class "Pedigree" that generalizes the notion of - * -- imposing structure on samples. But given how complex pedigrees can quickly become, it's not - * -- clear the best way to do this. - */ -public class Trio { - private Sample mother; - private Sample father; - private Sample child; - - public Trio(Sample mom, Sample dad, Sample spawn) { - assert mom.getID().equals(spawn.getMaternalID()) && dad.getID().equals(spawn.getPaternalID()) : "Samples passed to trio constructor do not form a trio"; - mother = mom; - father = dad; - child = spawn; - } - - public Sample getMother() { - return mother; - } - - public String getMaternalID() { - return mother.getID(); - } - - public Sample getFather() { - return father; - } - - public String getPaternalID() { - return father.getID(); - } - - public Sample getChild() { - return child; - } - - public String getChildID() { - return child.getID(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java deleted file mode 100755 index efa2eca02..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java +++ 
/dev/null @@ -1,103 +0,0 @@ -package org.broadinstitute.sting.gatk.traversals; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; - -/** - * A simple solution to iterating over all reference positions over a series of genomic locations. - */ -public abstract class TraverseLociBase extends TraversalEngine,LocusShardDataProvider> { - /** - * our log, which we want to capture anything from this class - */ - protected static final Logger logger = Logger.getLogger(TraversalEngine.class); - - @Override - protected final String getTraversalType() { - return "sites"; - } - - protected static class TraverseResults { - final int numIterations; - final T reduceResult; - - public TraverseResults(int numIterations, T reduceResult) { - this.numIterations = numIterations; - this.reduceResult = reduceResult; - } - } - - protected abstract TraverseResults traverse( final LocusWalker walker, - final LocusView locusView, - final LocusReferenceView referenceView, - final ReferenceOrderedView referenceOrderedDataView, - final T sum); - - @Override - public T traverse( LocusWalker walker, - LocusShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider)); - - final LocusView locusView = getLocusView( walker, dataProvider ); - - if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); - ReferenceOrderedView 
referenceOrderedDataView = null; - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); - else - referenceOrderedDataView = (RodLocusView)locusView; - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); - sum = result.reduceResult; - dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); - updateCumulativeMetrics(dataProvider.getShard()); - } - - // We have a final map call to execute here to clean up the skipped based from the - // last position in the ROD to that in the interval - if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { - // only do this if the walker isn't done! - final RodLocusView rodLocusView = (RodLocusView)locusView; - final long nSkipped = rodLocusView.getLastSkippedBases(); - if ( nSkipped > 0 ) { - final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); - final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); - final M x = walker.map(null, null, ac); - sum = walker.reduce(x, sum); - } - } - - return sum; - } - - /** - * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' - * of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype - * that comes along. - * @param walker walker to interrogate. - * @param dataProvider Data which which to drive the locus view. - * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. 
- */ - private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { - final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); - if( dataSource == DataSource.READS ) - return new CoveredLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) - return new AllLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) - return new RodLocusView(dataProvider); - else - throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java deleted file mode 100755 index 22381092f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java +++ /dev/null @@ -1,47 +0,0 @@ -package org.broadinstitute.sting.gatk.traversals; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.LocusView; -import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.GenomeLoc; - -/** - * A simple solution to iterating over all reference positions over a series of genomic locations. 
- */ -public class TraverseLociLinear extends TraverseLociBase { - - @Override - protected TraverseResults traverse(LocusWalker walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) { - // We keep processing while the next reference location is within the interval - boolean done = false; - int numIterations = 0; - - while( locusView.hasNext() && ! done ) { - numIterations++; - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - final boolean keepMeP = walker.filter(tracker, refContext, locus); - if (keepMeP) { - final M x = walker.map(tracker, refContext, locus); - sum = walker.reduce(x, sum); - done = walker.isDone(); - } - - printProgress(locus.getLocation()); - } - - return new TraverseResults(numIterations, sum); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java deleted file mode 100755 index e4e2254d0..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java +++ /dev/null @@ -1,205 +0,0 @@ -package org.broadinstitute.sting.gatk.traversals; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; -import 
org.broadinstitute.sting.gatk.datasources.providers.LocusView; -import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; -import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction; -import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; -import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; - -import java.util.Iterator; - -/** - * A simple solution to iterating over all reference positions over a series of genomic locations. - */ -public class TraverseLociNano extends TraverseLociBase { - /** our log, which we want to capture anything from this class */ - private static final boolean DEBUG = false; - private static final int BUFFER_SIZE = 1000; - - final NanoScheduler nanoScheduler; - - public TraverseLociNano(int nThreads) { - nanoScheduler = new NanoScheduler(BUFFER_SIZE, nThreads); - nanoScheduler.setProgressFunction(new TraverseLociProgress()); - } - - @Override - protected TraverseResults traverse(final LocusWalker walker, - final LocusView locusView, - final LocusReferenceView referenceView, - final ReferenceOrderedView referenceOrderedDataView, - final T sum) { - nanoScheduler.setDebug(DEBUG); - final TraverseLociMap myMap = new TraverseLociMap(walker); - final TraverseLociReduce myReduce = new TraverseLociReduce(walker); - - final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); - final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); - - return new TraverseResults(inputIterator.numIterations, result); - } - - /** - * Create iterator that provides inputs for all map calls into MapData, to be provided - * to NanoScheduler for Map/Reduce - */ - private class MapDataIterator 
implements Iterator { - final LocusView locusView; - final LocusReferenceView referenceView; - final ReferenceOrderedView referenceOrderedDataView; - int numIterations = 0; - - private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) { - this.locusView = locusView; - this.referenceView = referenceView; - this.referenceOrderedDataView = referenceOrderedDataView; - } - - @Override - public boolean hasNext() { - return locusView.hasNext(); - } - - @Override - public MapData next() { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - //logger.info("Pulling data from MapDataIterator at " + location); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location, refContext); - - numIterations++; - return new MapData(locus, refContext, tracker); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator"); - } - } - - @Override - public void printOnTraversalDone() { - nanoScheduler.shutdown(); - super.printOnTraversalDone(); - } - - /** - * The input data needed for each map call. 
The read, the reference, and the RODs - */ - private class MapData { - final AlignmentContext alignmentContext; - final ReferenceContext refContext; - final RefMetaDataTracker tracker; - - private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) { - this.alignmentContext = alignmentContext; - this.refContext = refContext; - this.tracker = tracker; - } - - @Override - public String toString() { - return "MapData " + alignmentContext.getLocation(); - } - } - - /** - * Contains the results of a map call, indicating whether the call was good, filtered, or done - */ - private class MapResult { - final M value; - final boolean reduceMe; - - /** - * Create a MapResult with value that should be reduced - * - * @param value the value to reduce - */ - private MapResult(final M value) { - this.value = value; - this.reduceMe = true; - } - - /** - * Create a MapResult that shouldn't be reduced - */ - private MapResult() { - this.value = null; - this.reduceMe = false; - } - } - - /** - * A static object that tells reduce that the result of map should be skipped (filtered or done) - */ - private final MapResult SKIP_REDUCE = new MapResult(); - - /** - * MapFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Applies walker.map to MapData, returning a MapResult object containing the result - */ - private class TraverseLociMap implements NSMapFunction { - final LocusWalker walker; - - private TraverseLociMap(LocusWalker walker) { - this.walker = walker; - } - - @Override - public MapResult apply(final MapData data) { - if ( ! 
walker.isDone() ) { - final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext); - if (keepMeP) { - final M x = walker.map(data.tracker, data.refContext, data.alignmentContext); - return new MapResult(x); - } - } - return SKIP_REDUCE; - } - } - - /** - * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable - */ - private class TraverseLociReduce implements NSReduceFunction { - final LocusWalker walker; - - private TraverseLociReduce(LocusWalker walker) { - this.walker = walker; - } - - @Override - public T apply(MapResult one, T sum) { - if ( one.reduceMe ) - // only run reduce on values that aren't DONE or FAILED - return walker.reduce(one.value, sum); - else - return sum; - } - } - - private class TraverseLociProgress implements NSProgressFunction { - @Override - public void progress(MapData lastProcessedMap) { - if (lastProcessedMap.alignmentContext != null) - printProgress(lastProcessedMap.alignmentContext.getLocation()); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java deleted file mode 100755 index b3a0a1390..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above 
copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; -import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.ReadView; -import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; -import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; -import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.LinkedList; -import java.util.List; - -/** - * A nano-scheduling version of TraverseReads. - * - * Implements the traversal of a walker that accepts individual reads, the reference, and - * RODs per map call. 
Directly supports shared memory parallelism via NanoScheduler - * - * @author depristo - * @version 1.0 - * @date 9/2/2012 - */ -public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { - /** our log, which we want to capture anything from this class */ - protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); - private static final boolean DEBUG = false; - final NanoScheduler nanoScheduler; - - public TraverseReadsNano(int nThreads) { - final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max - nanoScheduler = new NanoScheduler(bufferSize, nThreads); - } - - @Override - protected String getTraversalType() { - return "reads"; - } - - /** - * Traverse by reads, given the data and the walker - * - * @param walker the walker to traverse with - * @param dataProvider the provider of the reads data - * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function - * @return the reduce variable of the read walker - */ - public T traverse(ReadWalker walker, - ReadShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseReadsNano.traverse Covered dataset is %s", dataProvider)); - - if( !dataProvider.hasReads() ) - throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - - nanoScheduler.setDebug(DEBUG); - final TraverseReadsMap myMap = new TraverseReadsMap(walker); - final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); - - final List aggregatedInputs = aggregateMapData(dataProvider); - final T result = nanoScheduler.execute(aggregatedInputs.iterator(), myMap, sum, myReduce); - - final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; - final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); - - updateCumulativeMetrics(dataProvider.getShard()); - printProgress(locus); - - return result; - } - - /** - * Aggregate all of the inputs 
for all map calls into MapData, to be provided - * to NanoScheduler for Map/Reduce - * - * @param dataProvider the source of our data - * @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce - * should execute - */ - private List aggregateMapData(final ReadShardDataProvider dataProvider) { - final ReadView reads = new ReadView(dataProvider); - final ReadReferenceView reference = new ReadReferenceView(dataProvider); - final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - - final List mapData = new LinkedList(); - for ( final SAMRecord read : reads ) { - final ReferenceContext refContext = ! read.getReadUnmappedFlag() - ? reference.getReferenceContext(read) - : null; - - // if the read is mapped, create a metadata tracker - final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 - ? rodView.getReferenceOrderedDataForRead(read) - : null; - - // update the number of reads we've seen - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - mapData.add(new MapData((GATKSAMRecord)read, refContext, tracker)); - } - - return mapData; - } - - @Override - public void printOnTraversalDone() { - nanoScheduler.shutdown(); - super.printOnTraversalDone(); - } - - /** - * The input data needed for each map call. 
The read, the reference, and the RODs - */ - private class MapData { - final GATKSAMRecord read; - final ReferenceContext refContext; - final RefMetaDataTracker tracker; - - private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) { - this.read = read; - this.refContext = refContext; - this.tracker = tracker; - } - } - - /** - * Contains the results of a map call, indicating whether the call was good, filtered, or done - */ - private class MapResult { - final M value; - final boolean reduceMe; - - /** - * Create a MapResult with value that should be reduced - * - * @param value the value to reduce - */ - private MapResult(final M value) { - this.value = value; - this.reduceMe = true; - } - - /** - * Create a MapResult that shouldn't be reduced - */ - private MapResult() { - this.value = null; - this.reduceMe = false; - } - } - - /** - * A static object that tells reduce that the result of map should be skipped (filtered or done) - */ - private final MapResult SKIP_REDUCE = new MapResult(); - - /** - * MapFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Applies walker.map to MapData, returning a MapResult object containing the result - */ - private class TraverseReadsMap implements NSMapFunction { - final ReadWalker walker; - - private TraverseReadsMap(ReadWalker walker) { - this.walker = walker; - } - - @Override - public MapResult apply(final MapData data) { - if ( ! 
walker.isDone() ) { - final boolean keepMeP = walker.filter(data.refContext, data.read); - if (keepMeP) - return new MapResult(walker.map(data.refContext, data.read, data.tracker)); - } - - return SKIP_REDUCE; - } - } - - /** - * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable - */ - private class TraverseReadsReduce implements NSReduceFunction { - final ReadWalker walker; - - private TraverseReadsReduce(ReadWalker walker) { - this.walker = walker; - } - - @Override - public T apply(MapResult one, T sum) { - if ( one.reduceMe ) - // only run reduce on values that aren't DONE or FAILED - return walker.reduce(one.value, sum); - else - return sum; - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java deleted file mode 100755 index 731ce7e4e..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ -package org.broadinstitute.sting.gatk.walkers; - -/** - * Root parallelism interface. Walkers that implement this - * declare that their map function is thread-safe and so multiple - * map calls can be run in parallel in the same JVM instance. - */ -public interface NanoSchedulable { -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java deleted file mode 100755 index 2b9744b89..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.fasta; - -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.Collections; -import java.util.List; - - -/** - * Generates an alternative reference sequence over the specified interval. - * - *

- * Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). - * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. - * Several important notes: - * 1) if there are multiple variants that start at a site, it chooses one of them randomly. - * 2) when there are overlapping indels (but with different start positions) only the first will be chosen. - * 3) this tool works only for SNPs and for simple indels (but not for things like complex substitutions). - * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). - * - *

Input

- *

- * The reference, requested intervals, and any number of variant rod files. - *

- * - *

Output

- *

- * A fasta file representing the requested intervals. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T FastaAlternateReferenceMaker \
- *   -o output.fasta \
- *   -L input.intervals \
- *   --variant input.vcf \
- *   [--snpmask mask.vcf]
- * 
- * - */ -@DocumentedGATKFeature( groupName = "Companion Utilities", extraDocs = {CommandLineGATK.class} ) -@Reference(window=@Window(start=-1,stop=50)) -@Requires(value={DataSource.REFERENCE}) -public class FastaAlternateReferenceMaker extends FastaReferenceMaker { - - /** - * Variants from these input files are used by this tool to construct an alternate reference. - */ - @Input(fullName = "variant", shortName = "V", doc="variants to model", required=false) - public List> variants = Collections.emptyList(); - - /** - * Snps from this file are used as a mask when constructing the alternate reference. - */ - @Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false) - public RodBinding snpmask; - - private int deletionBasesRemaining = 0; - - public Pair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - if (deletionBasesRemaining > 0) { - deletionBasesRemaining--; - return new Pair(context.getLocation(), ""); - } - - String refBase = String.valueOf((char)ref.getBase()); - - // Check to see if we have a called snp - for ( VariantContext vc : tracker.getValues(variants, ref.getLocus()) ) { - if ( vc.isFiltered() ) - continue; - - if ( vc.isSimpleDeletion()) { - deletionBasesRemaining = vc.getReference().length() - 1; - // delete the next n bases, not this one - return new Pair(context.getLocation(), refBase); - } else if ( vc.isSimpleInsertion()) { - return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); - } else if (vc.isSNP()) { - return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); - } - } - - // if we don't have a called site, and we have a mask at this site, mask it - for ( VariantContext vc : tracker.getValues(snpmask) ) { - if ( vc.isSNP()) { - return new Pair(context.getLocation(), "N"); - } - } - - - // if we got here then we're just ref - return new Pair(context.getLocation(), refBase); - } -} \ No newline at end of file diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java deleted file mode 100755 index 362867318..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.fasta; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; - -import java.io.PrintStream; - -/** - * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. - * - *

- * The output format can be partially controlled using the provided command-line arguments. - * Specify intervals with the usual -L argument to output only the reference bases within your intervals. - * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a - * separate fasta sequence (named numerically in order). - * - *

Input

- *

- * The reference and requested intervals. - *

- * - *

Output

- *

- * A fasta file representing the requested intervals. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T FastaReferenceMaker \
- *   -o output.fasta \
- *   -L input.intervals
- * 
- * - */ -@DocumentedGATKFeature( groupName = "Companion Utilities", extraDocs = {CommandLineGATK.class} ) -public class FastaReferenceMaker extends RefWalker, GenomeLoc> { - - @Output PrintStream out; - - @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) - public int fastaLineWidth=60; - - /** - * Please note that when using this argument adjacent intervals will automatically be merged. - */ - @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity)", required=false) - public boolean fastaRawSeqs=false; - - protected FastaSequence fasta; - - public void initialize() { - if (fastaRawSeqs) fastaLineWidth = Integer.MAX_VALUE; - fasta = new FastaSequence(out, fastaLineWidth, fastaRawSeqs); - } - - public Pair map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) { - return new Pair(context.getLocation(), String.valueOf((char)ref.getBase())); - } - - public GenomeLoc reduceInit() { - return null; - } - - public GenomeLoc reduce(Pair value, GenomeLoc sum) { - if ( value == null ) - return sum; - - // if there is no interval to the left, then this is the first one - if ( sum == null ) { - sum = value.first; - fasta.append(value.second); - } - // if the intervals don't overlap, print out the leftmost one and start a new one - // (end of contig or new interval) - else if ( value.first.getStart() != sum.getStop() + 1 ) { - fasta.flush(); - sum = value.first; - fasta.append(value.second); - } - // otherwise, merge them - else { - sum = getToolkit().getGenomeLocParser().setStop(sum, value.first.getStop()); - fasta.append(value.second); - } - return sum; - } - - public void onTraversalDone(GenomeLoc sum) { - fasta.flush(); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java 
b/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java deleted file mode 100644 index 4589ffb71..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java +++ /dev/null @@ -1,49 +0,0 @@ -package org.broadinstitute.sting.utils.baq; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.walkers.BAQMode; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * Applies Heng's BAQ calculation to a stream of incoming reads - */ -public class BAQReadTransformer extends ReadTransformer { - private BAQ baqHMM; - private IndexedFastaSequenceFile refReader; - private BAQ.CalculationMode cmode; - private BAQ.QualityMode qmode; - - @Override - public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { - final BAQMode mode = WalkerManager.getWalkerAnnotation(walker, BAQMode.class); - this.refReader = engine.getReferenceDataSource().getReference(); - this.cmode = engine.getArguments().BAQMode; - this.qmode = mode.QualityMode(); - baqHMM = new BAQ(engine.getArguments().BAQGOP); - - if ( qmode == BAQ.QualityMode.DONT_MODIFY ) - throw new ReviewedStingException("BUG: shouldn't create BAQ transformer with quality mode DONT_MODIFY"); - - if ( mode.ApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN && enabled() ) - throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + cmode + " was requested."); - - return mode.ApplicationTime(); - } - - @Override - public boolean enabled() { - return cmode != 
BAQ.CalculationMode.OFF; - } - - @Override - public GATKSAMRecord apply(final GATKSAMRecord read) { - baqHMM.baqRead(read, refReader, cmode, qmode); - return read; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java deleted file mode 100644 index 18ab9e01a..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java +++ /dev/null @@ -1,44 +0,0 @@ -package org.broadinstitute.sting.utils.baq; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Iterator; - -/** - * Iterator that applies a ReadTransformer to a stream of reads - */ -public class ReadTransformingIterator implements StingSAMIterator { - private final StingSAMIterator it; - private final ReadTransformer transformer; - - /** - * Creates a new ReadTransforming iterator - */ - @Requires({"it != null", "transformer != null", "transformer.isInitialized()"}) - public ReadTransformingIterator(final StingSAMIterator it, final ReadTransformer transformer) { - if ( ! 
transformer.isInitialized() ) - throw new IllegalStateException("Creating a read transformer stream for an uninitialized read transformer: " + transformer); - if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN ) - throw new IllegalStateException("Creating a read transformer stream for a forbidden transformer " + transformer); - - this.it = it; - this.transformer = transformer; - } - - @Requires("hasNext()") - @Ensures("result != null") - public SAMRecord next() { - final GATKSAMRecord read = (GATKSAMRecord)it.next(); - return transformer.apply(read); - } - - public boolean hasNext() { return this.it.hasNext(); } - public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - public void close() { it.close(); } - public Iterator iterator() { return this; } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java deleted file mode 100644 index 2daa6c9eb..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java +++ /dev/null @@ -1,82 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -import com.google.java.contract.Invariant; - -/** - * Wrapper to hold data for a blocking queue, distinguishing an EOF marker from a real object - * - * The only way to tell in a consumer thread that a blocking queue has no more data ever - * coming down the pipe is to pass in a "poison" or EOF object. This class provides - * a generic capacity for that... 
- * - * The use case looks like this: - * - * BlockingQueue q - * producer: - * while ( x has items ) - * q.put(new BlockingQueueValue(x)) - * q.put(new BlockingQueueValue()) - * - * Consumer: - * while ( true ) - * value = q.take() - * if ( value.isLast() ) - * break - * else - * do something useful with value - * - * - * User: depristo - * Date: 9/6/12 - * Time: 3:08 PM - */ -@Invariant("! isLast || value == null") -class BlockingQueueValue { - /** - * True if this is the EOF marker object - */ - final private boolean isLast; - - /** - * Our value, if we aren't the EOF marker - */ - final private T value; - - /** - * Create a new BlockingQueueValue containing a real value, where last is false - * @param value - */ - BlockingQueueValue(final T value) { - isLast = false; - this.value = value; - } - - /** - * Create a new BlockingQueueValue that is the last item - */ - BlockingQueueValue() { - isLast = true; - this.value = null; - } - - /** - * Is this the EOF marker? - * - * @return true if so, else false - */ - public boolean isLast() { - return isLast; - } - - /** - * Get the value held by this BlockingQueueValue - * - * @return the value - * @throws IllegalStateException if this is the last item - */ - public T getValue() { - if ( isLast() ) - throw new IllegalStateException("Cannot get value for last object"); - return value; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java deleted file mode 100644 index 9508a15aa..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java +++ /dev/null @@ -1,45 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -import java.util.concurrent.ExecutionException; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; - -/** - * Create a future that simply returns a given value - * - * The 
only standard way to create a future in java is via the ExecutorService interface. - * If you have a data structure holding futures of value T, and you want to add a - * value to it for some reason (to add a EOF marker, for instance) you can use this - * class to create a dummy Future that simply returns a value. - * - * @author depristo - * @since 09/12 - */ -class FutureValue implements Future { - final V value; - - FutureValue(final V value) { - this.value = value; - } - - @Override public boolean cancel(boolean mayInterruptIfRunning) { - return true; - } - - @Override public boolean isCancelled() { - return false; - } - - @Override public boolean isDone() { - return true; - } - - @Override public V get() throws InterruptedException, ExecutionException { - return value; - } - - @Override public V get(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException { - return get(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java deleted file mode 100644 index 29dddbc49..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java +++ /dev/null @@ -1,62 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.Iterator; -import java.util.concurrent.BlockingQueue; - -/** - * Producer Thread that reads input values from an inputReads and puts them into a BlockingQueue - */ -class InputProducer implements Runnable { - /** - * The iterator we are using to get data from - */ - final Iterator inputReader; - - /** - * Our timer (may be null) that we use to track our input costs - */ - final SimpleTimer inputTimer; - - /** - * Where we put our input values for consumption - */ - final BlockingQueue outputQueue; - - public 
InputProducer(final Iterator inputReader, - final SimpleTimer inputTimer, - final BlockingQueue outputQueue) { - if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); - if ( outputQueue == null ) throw new IllegalArgumentException("OutputQueue cannot be null"); - - this.inputReader = inputReader; - this.inputTimer = inputTimer; - this.outputQueue = outputQueue; - } - - public void run() { - try { - while ( inputReader.hasNext() ) { - if ( inputTimer != null ) inputTimer.restart(); - final InputType input = inputReader.next(); - if ( inputTimer != null ) inputTimer.stop(); - outputQueue.put(new InputValue(input)); - } - - // add the EOF object so our consumer knows we are done in all inputs - outputQueue.put(new InputValue()); - } catch (InterruptedException ex) { - throw new ReviewedStingException("got execution exception", ex); - } - } - - /** - * Helper class that contains a read value suitable for EOF marking in a BlockingQueue - */ - class InputValue extends BlockingQueueValue { - private InputValue(InputType datum) { super(datum); } - private InputValue() { } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java deleted file mode 100644 index 3cc6fa786..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java +++ /dev/null @@ -1,36 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -/** - * Holds the results of a map job suitable for producer/consumer threading - * via a BlockingQueue - */ -class MapResult extends BlockingQueueValue { - final int jobID; - - /** - * Create a new MapResult with value datum and jod jobID ID - * - * @param datum the value produced by the map job - * @param jobID the id of the map job (for correctness testing) - */ - MapResult(final MapType datum, final int jobID) { - super(datum); - this.jobID = jobID; - if ( jobID < 0 ) throw 
new IllegalArgumentException("JobID must be >= 0"); - } - - /** - * Create the EOF marker version of MapResult - */ - MapResult() { - super(); - this.jobID = Integer.MAX_VALUE; - } - - /** - * @return the job ID of the map job that produced this MapResult - */ - public int getJobID() { - return jobID; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java deleted file mode 100644 index cc5335051..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java +++ /dev/null @@ -1,19 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -/** - * A function that maps from InputType -> ResultType - * - * For use with the NanoScheduler - * - * User: depristo - * Date: 8/24/12 - * Time: 9:49 AM - */ -public interface NSMapFunction { - /** - * Return function on input, returning a value of ResultType - * @param input - * @return - */ - public ResultType apply(final InputType input); -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java deleted file mode 100644 index 8b12c62c4..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java +++ /dev/null @@ -1,12 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -/** - * Created with IntelliJ IDEA. - * User: depristo - * Date: 9/4/12 - * Time: 2:10 PM - * To change this template use File | Settings | File Templates. 
- */ -public interface NSProgressFunction { - public void progress(final InputType lastMapInput); -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java deleted file mode 100644 index 879a33a1d..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java +++ /dev/null @@ -1,18 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -/** - * A function that combines a value of MapType with an existing ReduceValue into a new ResultType - * - * User: depristo - * Date: 8/24/12 - * Time: 9:49 AM - */ -public interface NSReduceFunction { - /** - * Combine one with sum into a new ReduceType - * @param one the result of a map call on an input element - * @param sum the cumulative reduce result over all previous map calls - * @return - */ - public ReduceType apply(MapType one, ReduceType sum); -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java deleted file mode 100644 index 664fb7b9b..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ /dev/null @@ -1,392 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.AutoFormattingTime; -import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.threading.NamedThreadFactory; - -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.*; - -/** - * Framework for very fine grained MapReduce parallelism - * - * The overall framework works like this - * - * nano <- new Nanoschedule(inputBufferSize, 
numberOfMapElementsToProcessTogether, nThreads) - * List[Input] outerData : outerDataLoop ) - * result = nano.execute(outerData.iterator(), map, reduce) - * - * inputBufferSize determines how many elements from the input stream are read in one go by the - * nanoscheduler. The scheduler may hold up to inputBufferSize in memory at one time, as well - * as up to inputBufferSize map results as well. - * - * numberOfMapElementsToProcessTogether determines how many input elements are processed - * together each thread cycle. For example, if this value is 10, then the input data - * is grouped together in units of 10 elements each, and map called on each in term. The more - * heavy-weight the map function is, in terms of CPU costs, the more it makes sense to - * have this number be small. The lighter the CPU cost per element, though, the more this - * parameter introduces overhead due to need to context switch among threads to process - * each input element. A value of -1 lets the nanoscheduler guess at a reasonable trade-off value. - * - * nThreads is a bit obvious yes? Note though that the nanoscheduler assumes that it gets 1 thread - * from its client during the execute call, as this call blocks until all work is done. The caller - * thread is put to work by execute to help with the processing of the data. So in reality the - * nanoScheduler only spawn nThreads - 1 additional workers (if this is > 1). 
- * - * User: depristo - * Date: 8/24/12 - * Time: 9:47 AM - */ -public class NanoScheduler { - private final static Logger logger = Logger.getLogger(NanoScheduler.class); - private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; - private final static boolean LOG_MAP_TIMES = false; - private final static boolean TIME_CALLS = true; - - private final static int MAP_BUFFER_SIZE_SCALE_FACTOR = 100; - - final int inputBufferSize; - final int mapBufferSize; - final int nThreads; - final ExecutorService inputExecutor; - final ExecutorService reduceExecutor; - final ThreadPoolExecutor mapExecutor; - - boolean shutdown = false; - boolean debug = false; - private NSProgressFunction progressFunction = null; - - final SimpleTimer outsideSchedulerTimer = TIME_CALLS ? new SimpleTimer("outside") : null; - final SimpleTimer inputTimer = TIME_CALLS ? new SimpleTimer("input") : null; - final SimpleTimer mapTimer = TIME_CALLS ? new SimpleTimer("map") : null; - final SimpleTimer reduceTimer = TIME_CALLS ? new SimpleTimer("reduce") : null; - - /** - * Create a new nanoscheduler with the desire characteristics requested by the argument - * - * @param inputBufferSize the number of input elements to read in each scheduling cycle. 
- * @param nThreads the number of threads to use to get work done, in addition to the - * thread calling execute - */ - public NanoScheduler(final int inputBufferSize, final int nThreads) { - if ( inputBufferSize < 1 ) throw new IllegalArgumentException("inputBufferSize must be >= 1, got " + inputBufferSize); - if ( nThreads < 1 ) throw new IllegalArgumentException("nThreads must be >= 1, got " + nThreads); - - this.inputBufferSize = inputBufferSize; - this.mapBufferSize = inputBufferSize * MAP_BUFFER_SIZE_SCALE_FACTOR; - this.nThreads = nThreads; - - if ( nThreads == 1 ) { - this.mapExecutor = null; - this.inputExecutor = this.reduceExecutor = null; - } else { - this.mapExecutor = (ThreadPoolExecutor)Executors.newFixedThreadPool(nThreads-1, new NamedThreadFactory("NS-map-thread-%d")); - this.mapExecutor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); - this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); - this.reduceExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-reduce-thread-%d")); - } - - // start timing the time spent outside of the nanoScheduler - outsideSchedulerTimer.start(); - } - - /** - * The number of parallel map threads in use with this NanoScheduler - * @return - */ - @Ensures("result > 0") - public int getnThreads() { - return nThreads; - } - - /** - * The input buffer size used by this NanoScheduler - * @return - */ - @Ensures("result > 0") - public int getInputBufferSize() { - return inputBufferSize; - } - - /** - * Tells this nanoScheduler to shutdown immediately, releasing all its resources. 
- * - * After this call, execute cannot be invoked without throwing an error - */ - public void shutdown() { - outsideSchedulerTimer.stop(); - - if ( nThreads > 1 ) { - shutdownExecutor("inputExecutor", inputExecutor); - shutdownExecutor("mapExecutor", mapExecutor); - shutdownExecutor("reduceExecutor", reduceExecutor); - } - shutdown = true; - - if (TIME_CALLS) { - printTimerInfo("Input time", inputTimer); - printTimerInfo("Map time", mapTimer); - printTimerInfo("Reduce time", reduceTimer); - printTimerInfo("Outside time", outsideSchedulerTimer); - } - } - - /** - * Helper function to cleanly shutdown an execution service, checking that the execution - * state is clean when it's done. - * - * @param name a string name for error messages for the executorService we are shutting down - * @param executorService the executorService to shut down - */ - @Requires({"name != null", "executorService != null"}) - @Ensures("executorService.isShutdown()") - private void shutdownExecutor(final String name, final ExecutorService executorService) { - if ( executorService.isShutdown() || executorService.isTerminated() ) - throw new IllegalStateException("Executor service " + name + " is already shut down!"); - - final List remaining = executorService.shutdownNow(); - if ( ! remaining.isEmpty() ) - throw new IllegalStateException(remaining.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!"); - } - - /** - * Print to logger.info timing information from timer, with name label - * - * @param label the name of the timer to display. 
Should be human readable - * @param timer the timer whose elapsed time we will display - */ - @Requires({"label != null", "timer != null"}) - private void printTimerInfo(final String label, final SimpleTimer timer) { - final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() - + reduceTimer.getElapsedTime() + outsideSchedulerTimer.getElapsedTime(); - final double myTimeInSec = timer.getElapsedTime(); - final double myTimePercent = myTimeInSec / total * 100; - logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); - } - - /** - * @return true if this nanoScheduler is shutdown, or false if its still open for business - */ - public boolean isShutdown() { - return shutdown; - } - - /** - * @return are we displaying verbose debugging information about the scheduling? - */ - public boolean isDebug() { - return debug; - } - - /** - * Helper function to display a String.formatted message if we are doing verbose debugging - * - * @param format the format argument suitable for String.format - * @param args the arguments for String.format - */ - @Requires("format != null") - private void debugPrint(final String format, Object ... 
args) { - if ( isDebug() ) - logger.info("Thread " + Thread.currentThread().getId() + ":" + String.format(format, args)); - } - - /** - * Turn on/off verbose debugging - * - * @param debug true if we want verbose debugging - */ - public void setDebug(boolean debug) { - this.debug = debug; - } - - /** - * Set the progress callback function to progressFunction - * - * The progress callback is invoked after each buffer size elements have been processed by map/reduce - * - * @param progressFunction a progress function to call, or null if you don't want any progress callback - */ - public void setProgressFunction(final NSProgressFunction progressFunction) { - this.progressFunction = progressFunction; - } - - /** - * Execute a map/reduce job with this nanoScheduler - * - * Data comes from inputReader. Will be read until hasNext() == false. - * map is called on each element provided by inputReader. No order of operations is guarenteed - * reduce is called in order of the input data provided by inputReader on the result of map() applied - * to each element. - * - * Note that the caller thread is put to work with this function call. The call doesn't return - * until all elements have been processes. - * - * It is safe to call this function repeatedly on a single nanoScheduler, at least until the - * shutdown method is called. - * - * Note that this function goes through a single threaded fast path if the number of threads - * is 1. 
- * - * @param inputReader an iterator providing us with the input data to nanoSchedule map/reduce over - * @param map the map function from input type -> map type, will be applied in parallel to each input - * @param reduce the reduce function from map type + reduce type -> reduce type to be applied in order to map results - * @return the last reduce value - */ - public ReduceType execute(final Iterator inputReader, - final NSMapFunction map, - final ReduceType initialValue, - final NSReduceFunction reduce) { - if ( isShutdown() ) throw new IllegalStateException("execute called on already shutdown NanoScheduler"); - if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); - if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); - if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); - - outsideSchedulerTimer.stop(); - - ReduceType result; - if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { - result = executeSingleThreaded(inputReader, map, initialValue, reduce); - } else { - result = executeMultiThreaded(inputReader, map, initialValue, reduce); - } - - outsideSchedulerTimer.restart(); - return result; - } - - /** - * Simple efficient reference implementation for single threaded execution. 
- * - * @return the reduce result of this map/reduce job - */ - @Requires({"inputReader != null", "map != null", "reduce != null"}) - private ReduceType executeSingleThreaded(final Iterator inputReader, - final NSMapFunction map, - final ReduceType initialValue, - final NSReduceFunction reduce) { - ReduceType sum = initialValue; - int i = 0; - - // start timer to ensure that both hasNext and next are caught by the timer - if ( TIME_CALLS ) inputTimer.restart(); - while ( inputReader.hasNext() ) { - final InputType input = inputReader.next(); - if ( TIME_CALLS ) inputTimer.stop(); - - // map - if ( TIME_CALLS ) mapTimer.restart(); - final long preMapTime = LOG_MAP_TIMES ? 0 : mapTimer.currentTimeNano(); - final MapType mapValue = map.apply(input); - if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (mapTimer.currentTimeNano() - preMapTime)); - if ( TIME_CALLS ) mapTimer.stop(); - - if ( i++ % inputBufferSize == 0 && progressFunction != null ) - progressFunction.progress(input); - - // reduce - if ( TIME_CALLS ) reduceTimer.restart(); - sum = reduce.apply(mapValue, sum); - if ( TIME_CALLS ) reduceTimer.stop(); - - if ( TIME_CALLS ) inputTimer.restart(); - } - - return sum; - } - - /** - * Efficient parallel version of Map/Reduce - * - * @return the reduce result of this map/reduce job - */ - @Requires({"inputReader != null", "map != null", "reduce != null"}) - private ReduceType executeMultiThreaded(final Iterator inputReader, - final NSMapFunction map, - final ReduceType initialValue, - final NSReduceFunction reduce) { - debugPrint("Executing nanoScheduler"); - - // a blocking queue that limits the number of input datum to the requested buffer size - final BlockingQueue.InputValue> inputQueue - = new LinkedBlockingDeque.InputValue>(inputBufferSize); - - // a priority queue that stores up to mapBufferSize elements - // produced by completed map jobs. 
- final BlockingQueue>> mapResultQueue = - new LinkedBlockingDeque>>(mapBufferSize); - - // Start running the input reader thread - inputExecutor.submit(new InputProducer(inputReader, inputTimer, inputQueue)); - - // Start running the reducer thread - final ReducerThread reducer - = new ReducerThread(reduce, reduceTimer, initialValue, mapResultQueue); - final Future reduceResult = reduceExecutor.submit(reducer); - - try { - int numJobs = 0; - - while ( true ) { - // block on input - final InputProducer.InputValue inputEnqueueWrapped = inputQueue.take(); - - if ( ! inputEnqueueWrapped.isLast() ) { - // get the object itself - final InputType input = inputEnqueueWrapped.getValue(); - - // the next map call has jobID + 1 - numJobs++; - - // send job for map via the completion service - final CallableMap doMap = new CallableMap(map, numJobs, input); - final Future> mapJob = mapExecutor.submit(doMap); - mapResultQueue.put(mapJob); - - debugPrint(" Done with cycle of map/reduce"); - - if ( numJobs % inputBufferSize == 0 && progressFunction != null ) - progressFunction.progress(input); - } else { - mapResultQueue.put(new FutureValue>(new MapResult())); - return reduceResult.get(); // wait for our result of reduce - } - } - } catch (InterruptedException ex) { - throw new ReviewedStingException("got execution exception", ex); - } catch (ExecutionException ex) { - throw new ReviewedStingException("got execution exception", ex); - } - } - - /** - * A simple callable version of the map function for use with the executor pool - */ - private class CallableMap implements Callable> { - final int id; - final InputType input; - final NSMapFunction map; - - @Requires({"map != null"}) - private CallableMap(final NSMapFunction map, - final int id, - final InputType input) { - this.id = id; - this.input = input; - this.map = map; - } - - @Override - public MapResult call() { - if ( TIME_CALLS ) mapTimer.restart(); - if ( debug ) debugPrint("\t\tmap " + input); - final MapType result = 
map.apply(input); - if ( TIME_CALLS ) mapTimer.stop(); - return new MapResult(result, id); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java deleted file mode 100644 index 506e45453..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java +++ /dev/null @@ -1,65 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.Future; - -/** - * Thread that runs the reduce of the map/reduce. - * - * This thread reads from mapResultsQueue until the poison EOF object arrives. At each - * stage is calls reduce(value, sum). The blocking mapResultQueue ensures that the - * queue waits until the mapResultQueue has a value to take. Then, it gets and waits - * until the map result Future has a value. 
- */ -class ReducerThread implements Callable { - final NSReduceFunction reduce; - final SimpleTimer reduceTimer; - final BlockingQueue>> mapResultQueue; - - ReduceType sum; - int lastJobID = -1; - - public ReducerThread(final NSReduceFunction reduce, - final SimpleTimer reduceTimer, - final ReduceType sum, - final BlockingQueue>> mapResultQueue) { - if ( reduce == null ) throw new IllegalArgumentException("Reduce function cannot be null"); - if ( mapResultQueue == null ) throw new IllegalArgumentException("mapResultQueue cannot be null"); - - this.reduce = reduce; - this.reduceTimer = reduceTimer; - this.sum = sum; - this.mapResultQueue = mapResultQueue; - } - - public ReduceType call() { - try { - while ( true ) { - final MapResult result = mapResultQueue.take().get(); - if ( result.isLast() ) { - // we are done, just return sum - return sum; - } - else if ( result.getJobID() < lastJobID ) { - // make sure the map results are coming in order - throw new IllegalStateException("BUG: last jobID " + lastJobID + " > current jobID " + result.getJobID()); - } else { - lastJobID = result.getJobID(); - // apply reduce, keeping track of sum - if ( reduceTimer != null ) reduceTimer.restart(); - sum = reduce.apply(result.getValue(), sum); - if ( reduceTimer != null ) reduceTimer.stop(); - } - } - } catch (ExecutionException ex) { - throw new ReviewedStingException("got execution exception", ex); - } catch (InterruptedException ex) { - throw new ReviewedStingException("got execution exception", ex); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java deleted file mode 100644 index 431014032..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java +++ /dev/null @@ -1,30 +0,0 @@ -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; - -import 
java.lang.annotation.*; - -/** - * User: hanna - * Date: May 14, 2009 - * Time: 1:51:22 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * Allows the walker to indicate what type of data it wants to consume. - */ - -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface BQSRMode { - public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java deleted file mode 100644 index fae0e8c09..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java +++ /dev/null @@ -1,40 +0,0 @@ -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * A ReadTransformer that applies BQSR on the fly to reads - * - * User: rpoplin - * Date: 2/13/12 - */ -public class BQSRReadTransformer extends ReadTransformer { - private boolean enabled; - private BaseRecalibration bqsr; - - @Override - public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { - this.enabled = engine.hasBaseRecalibration(); - this.bqsr = engine.getBaseRecalibration(); - final BQSRMode mode = 
WalkerManager.getWalkerAnnotation(walker, BQSRMode.class); - return mode.ApplicationTime(); - } - - @Override - public boolean enabled() { - return enabled; - } - - /** - * initialize a new BQSRReadTransformer that applies BQSR on the fly to incoming reads. - */ - @Override - public GATKSAMRecord apply(GATKSAMRecord read) { - bqsr.recalibrateRead(read); - return read; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java deleted file mode 100644 index 7c2d9bfdc..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.picard.sam.MergingSamRecordIterator; -import net.sf.picard.sam.SamFileHeaderMerger; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/** - * Simple wrapper class that multiplexes multiple ArtificialSingleSampleReadStreams into a single stream of reads - * - * @author David Roazen - */ -public class ArtificialMultiSampleReadStream implements Iterable { - - private Collection perSampleArtificialReadStreams; - private MergingSamRecordIterator mergingIterator; - - public ArtificialMultiSampleReadStream( Collection perSampleArtificialReadStreams ) { - if ( perSampleArtificialReadStreams == null || perSampleArtificialReadStreams.isEmpty() ) { - throw new ReviewedStingException("Can't create an ArtificialMultiSampleReadStream out of 0 ArtificialSingleSampleReadStreams"); - } - - this.perSampleArtificialReadStreams = perSampleArtificialReadStreams; - } - - public Iterator iterator() { - // lazy initialization to prevent reads from being created until they're needed - initialize(); - - return mergingIterator; - } - - public StingSAMIterator getStingSAMIterator() { - // lazy initialization to prevent reads from being created until they're needed - initialize(); - - return StingSAMIteratorAdapter.adapt(mergingIterator); - } - - private void initialize() { - Collection perSampleSAMReaders = new ArrayList(perSampleArtificialReadStreams.size()); - Collection headers = new ArrayList(perSampleArtificialReadStreams.size()); - - for ( ArtificialSingleSampleReadStream readStream : perSampleArtificialReadStreams ) { - Collection thisStreamReads = readStream.makeReads(); - - SAMFileReader reader = new 
ArtificialSAMFileReader(readStream.getHeader(), - thisStreamReads.toArray(new SAMRecord[thisStreamReads.size()])); - perSampleSAMReaders.add(reader); - headers.add(reader.getFileHeader()); - } - - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); - mergingIterator = new MergingSamRecordIterator(headerMerger, perSampleSAMReaders, true); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java deleted file mode 100644 index a9480692b..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; - -/** - * An artificial stream of reads from a single read group/sample with configurable characteristics - * such as: - * - * -the number of contigs that the reads should be distributed across - * -number of "stacks" of reads sharing the same alignment start position per contig - * -the min/max number of reads in each stack (exact values chosen randomly from this range) - * -the min/max distance between stack start positions (exact values chosen randomly from this range) - * -the min/max length of each read (exact values chosen randomly from this range) - * -the number of unmapped reads - * - * The cigar string for all reads will be *M, where * is the length of the read. 
- * - * @author David Roazen - */ -public class ArtificialSingleSampleReadStream implements Iterable { - private SAMFileHeader header; - private String readGroupID; - private int numContigs; - private int numStacksPerContig; - private int minReadsPerStack; - private int maxReadsPerStack; - private int minDistanceBetweenStacks; - private int maxDistanceBetweenStacks; - private int minReadLength; - private int maxReadLength; - private int numUnmappedReads; - - private static final String READ_GROUP_TAG = "RG"; - - public ArtificialSingleSampleReadStream( SAMFileHeader header, - String readGroupID, - int numContigs, - int numStacksPerContig, - int minReadsPerStack, - int maxReadsPerStack, - int minDistanceBetweenStacks, - int maxDistanceBetweenStacks, - int minReadLength, - int maxReadLength, - int numUnmappedReads ) { - this.header = header; - this.readGroupID = readGroupID; - this.numContigs = numContigs; - this.numStacksPerContig = numStacksPerContig; - this.minReadsPerStack = minReadsPerStack; - this.maxReadsPerStack = maxReadsPerStack; - this.minDistanceBetweenStacks = minDistanceBetweenStacks; - this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; - this.minReadLength = minReadLength; - this.maxReadLength = maxReadLength; - this.numUnmappedReads = numUnmappedReads; - - validateStreamParameters(); - } - - private void validateStreamParameters() { - if ( header == null || readGroupID == null ) { - throw new ReviewedStingException("null SAMFileHeader or read group ID") ; - } - - if ( header.getReadGroup(readGroupID) == null ) { - throw new ReviewedStingException(String.format("Read group %s not found in SAMFileHeader", readGroupID)); - } - - if ( numContigs < 0 || numStacksPerContig < 0 || minReadsPerStack < 0 || maxReadsPerStack < 0 || - minDistanceBetweenStacks < 0 || maxDistanceBetweenStacks < 0 || minReadLength < 0 || maxReadLength < 0 || - numUnmappedReads < 0 ) { - throw new ReviewedStingException("Read stream parameters must be >= 0"); - } - - if ( 
(numContigs == 0 && numStacksPerContig != 0) || (numContigs != 0 && numStacksPerContig == 0) ) { - throw new ReviewedStingException("numContigs and numStacksPerContig must either both be > 0, or both be 0"); - } - - if ( minReadsPerStack > maxReadsPerStack ) { - throw new ReviewedStingException("minReadsPerStack > maxReadsPerStack"); - } - - if ( minDistanceBetweenStacks > maxDistanceBetweenStacks ) { - throw new ReviewedStingException("minDistanceBetweenStacks > maxDistanceBetweenStacks"); - } - - if ( minReadLength > maxReadLength ) { - throw new ReviewedStingException("minReadLength > maxReadLength"); - } - } - - public Iterator iterator() { - return makeReads().iterator(); - } - - public StingSAMIterator getStingSAMIterator() { - return StingSAMIteratorAdapter.adapt(iterator()); - } - - public Collection makeReads() { - Collection reads = new ArrayList(numContigs * numStacksPerContig * maxReadsPerStack); - - for ( int contig = 0; contig < numContigs; contig++ ) { - int alignmentStart = 1; - - for ( int stack = 0; stack < numStacksPerContig; stack++ ) { - reads.addAll(makeReadStack(contig, alignmentStart, MathUtils.randomIntegerInRange(minReadsPerStack, maxReadsPerStack))); - alignmentStart += MathUtils.randomIntegerInRange(minDistanceBetweenStacks, maxDistanceBetweenStacks); - } - } - - if ( numUnmappedReads > 0 ) { - reads.addAll(makeReadStack(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START, numUnmappedReads)); - } - - return reads; - } - - private Collection makeReadStack( int contig, int alignmentStart, int stackSize ) { - Collection readStack = new ArrayList(stackSize); - - for ( int i = 0; i < stackSize; i++ ) { - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, - "foo", - contig, - alignmentStart, - MathUtils.randomIntegerInRange(minReadLength, maxReadLength)); - read.setAttribute(READ_GROUP_TAG, readGroupID); - readStack.add(read); - } - - return readStack; - } - - public SAMFileHeader getHeader() { - return header; - 
} - - public String getReadGroupID() { - return readGroupID; - } - - public int getNumContigs() { - return numContigs; - } - - public int getNumStacksPerContig() { - return numStacksPerContig; - } - - public int getMinReadsPerStack() { - return minReadsPerStack; - } - - public int getMaxReadsPerStack() { - return maxReadsPerStack; - } - - public int getMinDistanceBetweenStacks() { - return minDistanceBetweenStacks; - } - - public int getMaxDistanceBetweenStacks() { - return maxDistanceBetweenStacks; - } - - public int getMinReadLength() { - return minReadLength; - } - - public int getMaxReadLength() { - return maxReadLength; - } - - public int getNumUnmappedReads() { - return numUnmappedReads; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java deleted file mode 100644 index a4d7c5146..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.ArrayList; -import java.util.List; - -/** - * A class for analyzing and validating the read stream produced by an ArtificialSingleSampleReadStream. - * - * Collects various statistics about the stream of reads it's fed, and validates the stream - * by checking whether the collected statistics match the nominal properties of the stream. - * - * Subclasses are expected to override the validate() method in order to check whether an artificial - * read stream has been *transformed* in some way (eg., by downsampling or some other process), rather - * than merely checking whether the stream matches its original properties. 
- * - * Usage is simple: - * - * ArtificialSingleSampleReadStreamAnalyzer analyzer = new ArtificialSingleSampleReadStreamAnalyzer(originalStream); - * analyzer.analyze(originalOrTransformedStream); - * analyzer.validate(); // override this method if you want to check whether the stream has been transformed - * // in a certain way relative to the original stream - * - * @author David Roazen - */ -public class ArtificialSingleSampleReadStreamAnalyzer { - protected ArtificialSingleSampleReadStream originalStream; - protected SAMRecord lastRead; - protected int totalReads; - protected boolean allSamplesMatch; - protected int numContigs; - protected List stacksPerContig; - protected Integer minReadsPerStack; - protected Integer maxReadsPerStack; - protected Integer minDistanceBetweenStacks; - protected Integer maxDistanceBetweenStacks; - protected Integer minReadLength; - protected Integer maxReadLength; - protected int numUnmappedReads; - - protected int currentContigNumStacks; - protected int currentStackNumReads; - - /** - * Construct a new read stream analyzer, providing an ArtificialSingleSampleReadStream that will - * serve as the basis for comparison after the analysis is complete. 
- * - * @param originalStream the original ArtificialSingleSampleReadStream upon which the stream - * that will be fed to the analyzer is based - */ - public ArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream ) { - this.originalStream = originalStream; - reset(); - } - - /** - * Reset all read stream statistics collected by this analyzer to prepare for a fresh run - */ - public void reset() { - lastRead = null; - totalReads = 0; - allSamplesMatch = true; - numContigs = 0; - stacksPerContig = new ArrayList(); - minReadsPerStack = null; - maxReadsPerStack = null; - minDistanceBetweenStacks = null; - maxDistanceBetweenStacks = null; - minReadLength = null; - maxReadLength = null; - numUnmappedReads = 0; - currentContigNumStacks = 0; - currentStackNumReads = 0; - } - - /** - * Collect statistics on the stream of reads passed in - * - * @param stream the stream of reads to analyze - */ - public void analyze( Iterable stream ) { - for ( SAMRecord read : stream ) { - update(read); - } - finalizeStats(); - } - - /** - * Validate the stream by checking whether our collected statistics match the properties of the - * original stream. Throws a ReviewedStingException if the stream is invalid. - * - * Override this method if you want to check whether the stream has been transformed in some - * way relative to the original stream. - */ - public void validate() { - if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { - if ( totalReads != 0 ) { - throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); - } - return; // no further validation needed for the 0-reads case - } - else if ( totalReads == 0 ) { - throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); - } - - if ( ! 
allSamplesMatch ) { - throw new ReviewedStingException("some reads had the wrong sample"); - } - - if ( numContigs != originalStream.getNumContigs() ) { - throw new ReviewedStingException("number of contigs not correct"); - } - - if ( stacksPerContig.size() != originalStream.getNumContigs() ) { - throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", - stacksPerContig.size(), originalStream.getNumContigs())); - } - - for ( int contigStackCount : stacksPerContig ) { - if ( contigStackCount != originalStream.getNumStacksPerContig() ) { - throw new ReviewedStingException("contig had incorrect number of stacks"); - } - } - - if ( originalStream.getNumStacksPerContig() > 0 ) { - if ( minReadsPerStack < originalStream.getMinReadsPerStack() ) { - throw new ReviewedStingException("stack had fewer than the minimum number of reads"); - } - if ( maxReadsPerStack > originalStream.getMaxReadsPerStack() ) { - throw new ReviewedStingException("stack had more than the maximum number of reads"); - } - } - else if ( minReadsPerStack != null || maxReadsPerStack != null ) { - throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); - } - - if ( originalStream.getNumStacksPerContig() > 1 ) { - if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { - throw new ReviewedStingException("stacks were separated by less than the minimum distance"); - } - if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) { - throw new ReviewedStingException("stacks were separated by more than the maximum distance"); - } - } - else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { - throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); - } - - if ( minReadLength < 
originalStream.getMinReadLength() ) { - throw new ReviewedStingException("read was shorter than the minimum allowed length"); - } - if ( maxReadLength > originalStream.getMaxReadLength() ) { - throw new ReviewedStingException("read was longer than the maximum allowed length"); - } - - if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { - throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", - originalStream.getNumUnmappedReads(), numUnmappedReads)); - } - - if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && - numUnmappedReads != totalReads ) { - throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); - } - } - - public void update( SAMRecord read ) { - if ( read.getReadUnmappedFlag() ) { - numUnmappedReads++; - - if ( numUnmappedReads == 1 && lastRead != null ) { - processContigChange(); - numContigs--; - } - } - else if ( lastRead == null ) { - numContigs = 1; - currentContigNumStacks = 1; - currentStackNumReads = 1; - } - else if ( ! 
read.getReferenceIndex().equals(lastRead.getReferenceIndex()) ) { - processContigChange(); - } - else if ( read.getAlignmentStart() != lastRead.getAlignmentStart() ) { - processStackChangeWithinContig(read); - } - else { - currentStackNumReads++; - } - - updateReadLength(read.getReadLength()); - allSamplesMatch = allSamplesMatch && readHasCorrectSample(read); - totalReads++; - - lastRead = read; - } - - - private void processContigChange() { - numContigs++; - - stacksPerContig.add(currentContigNumStacks); - currentContigNumStacks = 1; - - updateReadsPerStack(currentStackNumReads); - currentStackNumReads = 1; - } - - private void processStackChangeWithinContig( SAMRecord read ) { - currentContigNumStacks++; - - updateReadsPerStack(currentStackNumReads); - currentStackNumReads = 1; - - updateDistanceBetweenStacks(read.getAlignmentStart() - lastRead.getAlignmentStart()); - } - - private void updateReadsPerStack( int stackReadCount ) { - if ( minReadsPerStack == null || stackReadCount < minReadsPerStack ) { - minReadsPerStack = stackReadCount; - } - if ( maxReadsPerStack == null || stackReadCount > maxReadsPerStack ) { - maxReadsPerStack = stackReadCount; - } - } - - private void updateDistanceBetweenStacks( int stackDistance ) { - if ( minDistanceBetweenStacks == null || stackDistance < minDistanceBetweenStacks ) { - minDistanceBetweenStacks = stackDistance; - } - if ( maxDistanceBetweenStacks == null || stackDistance > maxDistanceBetweenStacks ) { - maxDistanceBetweenStacks = stackDistance; - } - } - - private void updateReadLength( int readLength ) { - if ( minReadLength == null || readLength < minReadLength ) { - minReadLength = readLength; - } - if ( maxReadLength == null || readLength > maxReadLength ) { - maxReadLength = readLength; - } - } - - private boolean readHasCorrectSample( SAMRecord read ) { - return originalStream.getReadGroupID().equals(read.getAttribute("RG")); - } - - public void finalizeStats() { - if ( lastRead != null && ! 
lastRead.getReadUnmappedFlag() ) { - stacksPerContig.add(currentContigNumStacks); - updateReadsPerStack(currentStackNumReads); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java deleted file mode 100644 index b30198608..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ -package org.broadinstitute.sting.utils.threading; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.apache.log4j.Priority; -import org.broadinstitute.sting.utils.AutoFormattingTime; - -import java.lang.management.ManagementFactory; -import java.lang.management.ThreadInfo; -import java.lang.management.ThreadMXBean; -import java.util.ArrayList; -import java.util.EnumMap; -import java.util.List; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.ThreadFactory; -import java.util.concurrent.TimeUnit; - -/** - * Creates threads that automatically monitor their efficiency via the parent ThreadEfficiencyMonitor - * - * User: depristo - * Date: 8/14/12 - * Time: 8:47 AM - */ -@Invariant({ - "activeThreads.size() <= nThreadsToCreate", - "countDownLatch.getCount() <= nThreadsToCreate", - "nThreadsCreated <= nThreadsToCreate" -}) -public class EfficiencyMonitoringThreadFactory extends ThreadEfficiencyMonitor implements ThreadFactory { - final int nThreadsToCreate; - final List activeThreads; - - int nThreadsCreated = 0; - - /** - * Counts down the number of active activeThreads whose runtime info hasn't been incorporated into - * times. Counts down from nThreadsToCreate to 0, at which point any code waiting - * on the final times is freed to run. - */ - final CountDownLatch countDownLatch; - - /** - * Create a new factory generating threads whose runtime and contention - * behavior is tracked in this factory. 
- * - * @param nThreadsToCreate the number of threads we will create in the factory before it's considered complete - */ - public EfficiencyMonitoringThreadFactory(final int nThreadsToCreate) { - super(); - if ( nThreadsToCreate <= 0 ) throw new IllegalArgumentException("nThreadsToCreate <= 0: " + nThreadsToCreate); - - this.nThreadsToCreate = nThreadsToCreate; - activeThreads = new ArrayList(nThreadsToCreate); - countDownLatch = new CountDownLatch(nThreadsToCreate); - } - - /** - * How many threads have been created by this factory so far? - * @return - */ - @Ensures("result >= 0") - public int getNThreadsCreated() { - return nThreadsCreated; - } - - /** - * Only useful for testing, so that we can wait for all of the threads in the factory to complete running - * - * @throws InterruptedException - */ - protected void waitForAllThreadsToComplete() throws InterruptedException { - countDownLatch.await(); - } - - @Ensures({ - "activeThreads.size() <= old(activeThreads.size())", - "! activeThreads.contains(thread)", - "countDownLatch.getCount() <= old(countDownLatch.getCount())" - }) - @Override - public synchronized void threadIsDone(final Thread thread) { - nThreadsAnalyzed++; - - if ( DEBUG ) logger.warn(" Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); - - super.threadIsDone(thread); - - // remove the thread from the list of active activeThreads, if it's in there, and decrement the countdown latch - if ( activeThreads.remove(thread) ) { - // one less thread is live for those blocking on all activeThreads to be complete - countDownLatch.countDown(); - if ( DEBUG ) logger.warn(" -> Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); - } - } - - /** - * Create a new thread from this factory - * - * @param runnable - * @return - */ - @Override - @Ensures({ - "activeThreads.size() > old(activeThreads.size())", - "activeThreads.contains(result)", - "nThreadsCreated == 
old(nThreadsCreated) + 1" - }) - public synchronized Thread newThread(final Runnable runnable) { - if ( activeThreads.size() >= nThreadsToCreate) - throw new IllegalStateException("Attempting to create more activeThreads than allowed by constructor argument nThreadsToCreate " + nThreadsToCreate); - - nThreadsCreated++; - final Thread myThread = new TrackingThread(runnable); - activeThreads.add(myThread); - return myThread; - } - - /** - * A wrapper around Thread that tracks the runtime of the thread and calls threadIsDone() when complete - */ - private class TrackingThread extends Thread { - private TrackingThread(Runnable runnable) { - super(runnable); - } - - @Override - public void run() { - super.run(); - threadIsDone(this); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java deleted file mode 100644 index b25375b87..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java +++ /dev/null @@ -1,26 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import java.util.concurrent.ThreadFactory; - -/** - * Thread factor that produces threads with a given name pattern - * - * User: depristo - * Date: 9/5/12 - * Time: 9:22 PM - * - */ -public class NamedThreadFactory implements ThreadFactory { - static int id = 0; - final String format; - - public NamedThreadFactory(String format) { - this.format = format; - String.format(format, id); // test the name - } - - @Override - public Thread newThread(Runnable r) { - return new Thread(r, String.format(format, id++)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java deleted file mode 100644 index 9159f5657..000000000 --- 
a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java +++ /dev/null @@ -1,207 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.apache.log4j.Priority; -import org.broadinstitute.sting.utils.AutoFormattingTime; - -import java.lang.management.ManagementFactory; -import java.lang.management.ThreadInfo; -import java.lang.management.ThreadMXBean; -import java.util.EnumMap; -import java.util.concurrent.TimeUnit; - -/** - * Uses an MXBean to monitor thread efficiency - * - * Once the monitor is created, calls to threadIsDone() can be used to add information - * about the efficiency of the provided thread to this monitor. - * - * Provides simple print() for displaying efficiency information to a logger - * - * User: depristo - * Date: 8/22/12 - * Time: 10:48 AM - */ -@Invariant({"nThreadsAnalyzed >= 0"}) -public class ThreadEfficiencyMonitor { - protected static final boolean DEBUG = false; - protected static Logger logger = Logger.getLogger(EfficiencyMonitoringThreadFactory.class); - final EnumMap times = new EnumMap(State.class); - - /** - * The number of threads we've included in our efficiency monitoring - */ - int nThreadsAnalyzed = 0; - - /** - * The bean used to get the thread info about blocked and waiting times - */ - final ThreadMXBean bean; - - public ThreadEfficiencyMonitor() { - bean = ManagementFactory.getThreadMXBean(); - - // get the bean, and start tracking - if ( bean.isThreadContentionMonitoringSupported() ) - bean.setThreadContentionMonitoringEnabled(true); - else - logger.warn("Thread contention monitoring not supported, we cannot track GATK multi-threaded efficiency"); - //bean.setThreadCpuTimeEnabled(true); - - if ( bean.isThreadCpuTimeSupported() ) - bean.setThreadCpuTimeEnabled(true); - else - logger.warn("Thread CPU monitoring not 
supported, we cannot track GATK multi-threaded efficiency"); - - // initialize times to 0 - for ( final State state : State.values() ) - times.put(state, 0l); - } - - private static long nanoToMilli(final long timeInNano) { - return TimeUnit.NANOSECONDS.toMillis(timeInNano); - } - - /** - * Get the time spent in state across all threads created by this factory - * - * @param state to get information about - * @return the time in milliseconds - */ - @Ensures({"result >= 0"}) - public synchronized long getStateTime(final State state) { - return times.get(state); - } - - /** - * Get the total time spent in all states across all threads created by this factory - * - * @return the time in milliseconds - */ - @Ensures({"result >= 0"}) - public synchronized long getTotalTime() { - long total = 0; - for ( final long time : times.values() ) - total += time; - return total; - } - - /** - * Get the fraction of time spent in state across all threads created by this factory - * - * @return the percentage (0.0-100.0) of time spent in state over all state times of all threads - */ - @Ensures({"result >= 0.0", "result <= 100.0"}) - public synchronized double getStatePercent(final State state) { - return (100.0 * getStateTime(state)) / Math.max(getTotalTime(), 1); - } - - public int getnThreadsAnalyzed() { - return nThreadsAnalyzed; - } - - @Override - public synchronized String toString() { - final StringBuilder b = new StringBuilder(); - - b.append("total ").append(getTotalTime()).append(" "); - for ( final State state : State.values() ) { - b.append(state).append(" ").append(getStateTime(state)).append(" "); - } - - return b.toString(); - } - - /** - * Print usage information about threads from this factory to logger - * with the INFO priority - * - * @param logger - */ - public synchronized void printUsageInformation(final Logger logger) { - printUsageInformation(logger, Priority.INFO); - } - - /** - * Print usage information about threads from this factory to logger - * with 
the provided priority - * - * @param logger - */ - public synchronized void printUsageInformation(final Logger logger, final Priority priority) { - logger.debug("Number of threads monitored: " + getnThreadsAnalyzed()); - logger.debug("Total runtime " + new AutoFormattingTime(TimeUnit.MILLISECONDS.toSeconds(getTotalTime()))); - for ( final State state : State.values() ) { - logger.debug(String.format("\tPercent of time spent %s is %.2f", state.getUserFriendlyName(), getStatePercent(state))); - } - logger.log(priority, String.format("CPU efficiency : %6.2f%% of time spent %s", getStatePercent(State.USER_CPU), State.USER_CPU.getUserFriendlyName())); - logger.log(priority, String.format("Walker inefficiency : %6.2f%% of time spent %s", getStatePercent(State.BLOCKING), State.BLOCKING.getUserFriendlyName())); - logger.log(priority, String.format("I/O inefficiency : %6.2f%% of time spent %s", getStatePercent(State.WAITING_FOR_IO), State.WAITING_FOR_IO.getUserFriendlyName())); - logger.log(priority, String.format("Thread inefficiency : %6.2f%% of time spent %s", getStatePercent(State.WAITING), State.WAITING.getUserFriendlyName())); - } - - /** - * Update the information about completed thread that ran for runtime in milliseconds - * - * This method updates all of the key timing and tracking information in the factory so that - * thread can be retired. 
After this call the factory shouldn't have a pointer to the thread any longer - * - * @param thread the thread whose information we are updating - */ - @Ensures({ - "getTotalTime() >= old(getTotalTime())" - }) - public synchronized void threadIsDone(final Thread thread) { - nThreadsAnalyzed++; - - if ( DEBUG ) logger.warn("UpdateThreadInfo called"); - - final long threadID = thread.getId(); - final ThreadInfo info = bean.getThreadInfo(thread.getId()); - final long totalTimeNano = bean.getThreadCpuTime(threadID); - final long userTimeNano = bean.getThreadUserTime(threadID); - final long systemTimeNano = totalTimeNano - userTimeNano; - final long userTimeInMilliseconds = nanoToMilli(userTimeNano); - final long systemTimeInMilliseconds = nanoToMilli(systemTimeNano); - - if ( info != null ) { - if ( DEBUG ) logger.warn("Updating thread with user runtime " + userTimeInMilliseconds + " and system runtime " + systemTimeInMilliseconds + " of which blocked " + info.getBlockedTime() + " and waiting " + info.getWaitedTime()); - incTimes(State.BLOCKING, info.getBlockedTime()); - incTimes(State.WAITING, info.getWaitedTime()); - incTimes(State.USER_CPU, userTimeInMilliseconds); - incTimes(State.WAITING_FOR_IO, systemTimeInMilliseconds); - } - } - - /** - * Helper function that increments the times counter by by for state - * - * @param state - * @param by - */ - @Requires({"state != null", "by >= 0"}) - @Ensures("getTotalTime() == old(getTotalTime()) + by") - private synchronized void incTimes(final State state, final long by) { - times.put(state, times.get(state) + by); - } - - public enum State { - BLOCKING("blocking on synchronized data structures"), - WAITING("waiting on some other thread"), - USER_CPU("doing productive CPU work"), - WAITING_FOR_IO("waiting for I/O"); - - private final String userFriendlyName; - - private State(String userFriendlyName) { - this.userFriendlyName = userFriendlyName; - } - - public String getUserFriendlyName() { - return userFriendlyName; - } - 
} -} diff --git a/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java deleted file mode 100644 index 924c6ec5a..000000000 --- a/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java +++ /dev/null @@ -1,41 +0,0 @@ -package org.broadinstitute.sting.commandline; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import org.testng.annotations.Test; -import org.testng.annotations.DataProvider; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 8/31/12 - * Time: 11:03 AM - * To change this template use File | Settings | File Templates. - */ -public class InvalidArgumentIntegrationTest extends WalkerTest { - private static final String callsB36 = BaseTest.validationDataLocation + "lowpass.N3.chr1.raw.vcf"; - - private WalkerTest.WalkerTestSpec baseTest(String flag, String arg, Class exeption) { - return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 --variant:vcf " - + callsB36 + " -F POS,CHROM -R " - + b36KGReference + " -o %s " + flag + " " + arg, - 1, exeption); - - } - - @Test - public void testUnknownReadFilter() { - executeTest("UnknownReadFilter",baseTest("-rf","TestUnknownReadFilter", UserException.MalformedReadFilterException.class)); - } - - @Test - public void testMalformedWalkerArgs() { - executeTest("MalformedWalkerArgs", - new WalkerTest.WalkerTestSpec("-T UnknownWalkerName -M 10 --variant:vcf " - + callsB36 + " -F POS,CHROM -R " - + b36KGReference + " -o %s ", - 1, UserException.MalformedWalkerArgumentsException.class)); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java deleted file mode 100644 index 
2717d014c..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.downsampling; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.testng.annotations.Test; -import org.testng.annotations.DataProvider; -import org.testng.Assert; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.LinkedList; -import java.util.List; - -public class LevelingDownsamplerUnitTest extends BaseTest { - - private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider { - public enum DataStructure { LINKED_LIST, ARRAY_LIST } - - int targetSize; - int numStacks; - int stackSize; - DataStructure dataStructure; - int expectedSize; - - public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) { - super(LevelingDownsamplerUniformStacksTest.class); - - this.targetSize = targetSize; - this.numStacks = numStacks; - this.stackSize = stackSize; - this.dataStructure = dataStructure; - expectedSize = calculateExpectedDownsampledStackSize(); - - setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d", - getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize)); - } - - public Collection> createStacks() { - Collection> stacks = new ArrayList>(); - - for ( int i = 1; i <= numStacks; i++ ) { - List stack = dataStructure == DataStructure.LINKED_LIST ? 
new LinkedList() : new ArrayList(); - - for ( int j = 1; j <= stackSize; j++ ) { - stack.add(new Object()); - } - - stacks.add(stack); - } - - return stacks; - } - - private int calculateExpectedDownsampledStackSize() { - int numItemsToRemove = numStacks * stackSize - targetSize; - - if ( numStacks == 0 ) { - return 0; - } - else if ( numItemsToRemove <= 0 ) { - return stackSize; - } - - return Math.max(1, stackSize - (numItemsToRemove / numStacks)); - } - } - - @DataProvider(name = "UniformStacksDataProvider") - public Object[][] createUniformStacksTestData() { - for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) { - for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { - for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) { - for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) { - new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure); - } - } - } - } - - return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class); - } - - @Test( dataProvider = "UniformStacksDataProvider" ) - public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - - Downsampler> downsampler = new LevelingDownsampler, Object>(test.targetSize); - - downsampler.submit(test.createStacks()); - - if ( test.numStacks > 0 ) { - Assert.assertFalse(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() == null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - downsampler.signalEndOfInput(); - - if ( 
test.numStacks > 0 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - List> downsampledStacks = downsampler.consumeFinalizedItems(); - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - - Assert.assertEquals(downsampledStacks.size(), test.numStacks); - - int totalRemainingItems = 0; - for ( List stack : downsampledStacks ) { - Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1); - totalRemainingItems += stack.size(); - } - - int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); - int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; - - Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); - - downsampler.reset(); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); - - Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java deleted file mode 100644 index b9022900b..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java +++ /dev/null @@ -1,298 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * 
files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.VerifyingSamIterator; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.sam.ArtificialMultiSampleReadStream; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; -import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -public class PerSampleDownsamplingReadsIteratorUnitTest extends BaseTest { - - 
private static class PerSampleDownsamplingReadsIteratorTest extends TestDataProvider { - - // TODO: tests should distinguish between variance across samples and variance within a sample - - private enum StreamDensity { - SPARSE (MAX_READ_LENGTH, MAX_READ_LENGTH * 2), - DENSE (1, MIN_READ_LENGTH), - MIXED (1, MAX_READ_LENGTH * 2), - UNIFORM_DENSE (1, 1), - UNIFORM_SPARSE (MAX_READ_LENGTH * 2, MAX_READ_LENGTH * 2); - - int minDistanceBetweenStacks; - int maxDistanceBetweenStacks; - - StreamDensity( int minDistanceBetweenStacks, int maxDistanceBetweenStacks ) { - this.minDistanceBetweenStacks = minDistanceBetweenStacks; - this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; - } - - public String toString() { - return String.format("StreamDensity:%d-%d", minDistanceBetweenStacks, maxDistanceBetweenStacks); - } - } - - private enum StreamStackDepth { - NON_UNIFORM_LOW (1, 5), - NON_UNIFORM_HIGH (15, 20), - NON_UNIFORM_MIXED (1, 20), - UNIFORM_SINGLE (1, 1), - UNIFORM_LOW (2, 2), - UNIFORM_HIGH (20, 20), - UNIFORM_MEDIUM (10, 10); // should set target coverage to this value for testing - - int minReadsPerStack; - int maxReadsPerStack; - - StreamStackDepth( int minReadsPerStack, int maxReadsPerStack ) { - this.minReadsPerStack = minReadsPerStack; - this.maxReadsPerStack = maxReadsPerStack; - } - - public boolean isUniform() { - return minReadsPerStack == maxReadsPerStack; - } - - public String toString() { - return String.format("StreamStackDepth:%d-%d", minReadsPerStack, maxReadsPerStack); - } - } - - private enum StreamStacksPerContig { - UNIFORM(20, 20), - NON_UNIFORM(1, 30); - - int minStacksPerContig; - int maxStacksPerContig; - - StreamStacksPerContig( int minStacksPerContig, int maxStacksPerContig ) { - this.minStacksPerContig = minStacksPerContig; - this.maxStacksPerContig = maxStacksPerContig; - } - - public boolean isUniform() { - return minStacksPerContig == maxStacksPerContig; - } - - public String toString() { - return 
String.format("StreamStacksPerContig:%d-%d", minStacksPerContig, maxStacksPerContig); - } - } - - // Not interested in testing multiple ranges for the read lengths, as none of our current - // downsamplers are affected by read length - private static final int MIN_READ_LENGTH = 50; - private static final int MAX_READ_LENGTH = 150; - - private ReadsDownsamplerFactory downsamplerFactory; - private int targetCoverage; - private int numSamples; - private int minContigs; - private int maxContigs; - private StreamDensity streamDensity; - private StreamStackDepth streamStackDepth; - private StreamStacksPerContig streamStacksPerContig; - private double unmappedReadsFraction; - private int unmappedReadsCount; - private boolean verifySortedness; - - private ArtificialMultiSampleReadStream mergedReadStream; - private Map perSampleArtificialReadStreams; - private Map perSampleStreamAnalyzers; - private SAMFileHeader header; - - public PerSampleDownsamplingReadsIteratorTest( ReadsDownsamplerFactory downsamplerFactory, - int targetCoverage, - int numSamples, - int minContigs, - int maxContigs, - StreamDensity streamDensity, - StreamStackDepth streamStackDepth, - StreamStacksPerContig streamStacksPerContig, - double unmappedReadsFraction, - int unmappedReadsCount, - boolean verifySortedness ) { - super(PerSampleDownsamplingReadsIteratorTest.class); - - this.downsamplerFactory = downsamplerFactory; - this.targetCoverage = targetCoverage; - this.numSamples = numSamples; - this.minContigs = minContigs; - this.maxContigs = maxContigs; - this.streamDensity = streamDensity; - this.streamStackDepth = streamStackDepth; - this.streamStacksPerContig = streamStacksPerContig; - this.unmappedReadsFraction = unmappedReadsFraction; - this.unmappedReadsCount = unmappedReadsCount; - this.verifySortedness = verifySortedness; - - header = createHeader(); - createReadStreams(); - - setName(String.format("%s: targetCoverage=%d numSamples=%d minContigs=%d maxContigs=%d %s %s %s 
unmappedReadsFraction=%.2f unmappedReadsCount=%d verifySortedness=%b", - getClass().getSimpleName(), targetCoverage, numSamples, minContigs, maxContigs, streamDensity, streamStackDepth, streamStacksPerContig, unmappedReadsFraction, unmappedReadsCount, verifySortedness)); - } - - private SAMFileHeader createHeader() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(maxContigs, 1, (streamDensity.maxDistanceBetweenStacks + MAX_READ_LENGTH) * streamStacksPerContig.maxStacksPerContig + 100000); - List readGroups = new ArrayList(numSamples); - List sampleNames = new ArrayList(numSamples); - - for ( int i = 0; i < numSamples; i++ ) { - readGroups.add("ReadGroup" + i); - sampleNames.add("Sample" + i); - } - - return ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroups, sampleNames); - } - - private void createReadStreams() { - perSampleArtificialReadStreams = new HashMap(numSamples); - perSampleStreamAnalyzers = new HashMap(numSamples); - - for (SAMReadGroupRecord readGroup : header.getReadGroups() ) { - String readGroupID = readGroup.getReadGroupId(); - String sampleName = readGroup.getSample(); - - int thisSampleNumContigs = MathUtils.randomIntegerInRange(minContigs, maxContigs); - int thisSampleStacksPerContig = MathUtils.randomIntegerInRange(streamStacksPerContig.minStacksPerContig, streamStacksPerContig.maxStacksPerContig); - - int thisSampleNumUnmappedReads = GenomeAnalysisEngine.getRandomGenerator().nextDouble() < unmappedReadsFraction ? 
unmappedReadsCount : 0; - - ArtificialSingleSampleReadStream thisSampleStream = new ArtificialSingleSampleReadStream(header, - readGroupID, - thisSampleNumContigs, - thisSampleStacksPerContig, - streamStackDepth.minReadsPerStack, - streamStackDepth.maxReadsPerStack, - streamDensity.minDistanceBetweenStacks, - streamDensity.maxDistanceBetweenStacks, - MIN_READ_LENGTH, - MAX_READ_LENGTH, - thisSampleNumUnmappedReads); - perSampleArtificialReadStreams.put(sampleName, thisSampleStream); - perSampleStreamAnalyzers.put(sampleName, new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(thisSampleStream, targetCoverage)); - } - - mergedReadStream = new ArtificialMultiSampleReadStream(perSampleArtificialReadStreams.values()); - } - - public void run() { - StingSAMIterator downsamplingIter = new PerSampleDownsamplingReadsIterator(mergedReadStream.getStingSAMIterator(), downsamplerFactory); - - if ( verifySortedness ) { - downsamplingIter = new VerifyingSamIterator(downsamplingIter); - } - - while ( downsamplingIter.hasNext() ) { - SAMRecord read = downsamplingIter.next(); - String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; - - ArtificialSingleSampleReadStreamAnalyzer analyzer = perSampleStreamAnalyzers.get(sampleName); - if ( analyzer != null ) { - analyzer.update(read); - } - else { - throw new ReviewedStingException("bug: stream analyzer for sample " + sampleName + " not found"); - } - } - - for ( Map.Entry analyzerEntry : perSampleStreamAnalyzers.entrySet() ) { - ArtificialSingleSampleReadStreamAnalyzer analyzer = analyzerEntry.getValue(); - analyzer.finalizeStats(); - - // Validate the downsampled read stream for each sample individually - analyzer.validate(); - } - - // Allow memory used by this test to be reclaimed: - mergedReadStream = null; - perSampleArtificialReadStreams = null; - perSampleStreamAnalyzers = null; - } - } - - @DataProvider(name = "PerSampleDownsamplingReadsIteratorTestDataProvider") - public Object[][] createPerSampleDownsamplingReadsIteratorTests() { - - GenomeAnalysisEngine.resetRandomGenerator(); - - // Some values don't vary across tests - int targetCoverage = PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.UNIFORM_MEDIUM.minReadsPerStack; - ReadsDownsamplerFactory downsamplerFactory = new SimplePositionalDownsamplerFactory(targetCoverage); - int maxContigs = 3; - boolean verifySortedness = true; - - for ( int numSamples : Arrays.asList(1, 2, 10) ) { - for ( int minContigs = 1; minContigs <= maxContigs; minContigs++ ) { - for ( PerSampleDownsamplingReadsIteratorTest.StreamDensity streamDensity : PerSampleDownsamplingReadsIteratorTest.StreamDensity.values() ) { - for ( PerSampleDownsamplingReadsIteratorTest.StreamStackDepth streamStackDepth : PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.values() ) { - for (PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig streamStacksPerContig : PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig.values() ) { - for ( double unmappedReadsFraction : Arrays.asList(0.0, 1.0, 0.5) ) { - for ( int unmappedReadsCount : Arrays.asList(1, 50) ) { - new 
PerSampleDownsamplingReadsIteratorTest(downsamplerFactory, - targetCoverage, - numSamples, - minContigs, - maxContigs, - streamDensity, - streamStackDepth, - streamStacksPerContig, - unmappedReadsFraction, - unmappedReadsCount, - verifySortedness); - } - } - } - } - } - } - } - - return PerSampleDownsamplingReadsIteratorTest.getTests(PerSampleDownsamplingReadsIteratorTest.class); - } - - @Test(dataProvider = "PerSampleDownsamplingReadsIteratorTestDataProvider") - public void runPerSampleDownsamplingReadsIteratorTest( PerSampleDownsamplingReadsIteratorTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - test.run(); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java deleted file mode 100644 index 9cbd0db8a..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; -import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; - -/** - * Class for analyzing an artificial read stream that has been positionally downsampled, and verifying - * that the downsampling was done correctly without changing the stream in unexpected ways. - * - * @author David Roazen - */ -public class PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer extends ArtificialSingleSampleReadStreamAnalyzer { - private int targetCoverage; - - public PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream, int targetCoverage ) { - super(originalStream); - this.targetCoverage = targetCoverage; - } - - /** - * Overridden validate() method that checks for the effects of positional downsampling in addition to checking - * for whether the original properties of the stream not affected by downsampling have been preserved - */ - @Override - public void validate() { - if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { - if ( totalReads != 0 ) { - throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); - } - return; // no further validation needed 
for the 0-reads case - } - else if ( totalReads == 0 ) { - throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); - } - - if ( ! allSamplesMatch ) { - throw new ReviewedStingException("some reads had the wrong sample"); - } - - if ( numContigs != originalStream.getNumContigs() ) { - throw new ReviewedStingException("number of contigs not correct"); - } - - if ( stacksPerContig.size() != originalStream.getNumContigs() ) { - throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", - stacksPerContig.size(), originalStream.getNumContigs())); - } - - for ( int contigStackCount : stacksPerContig ) { - if ( contigStackCount != originalStream.getNumStacksPerContig() ) { - throw new ReviewedStingException("contig had incorrect number of stacks"); - } - } - - if ( originalStream.getNumStacksPerContig() > 0 ) { - - // Check for the effects of positional downsampling: - int stackMinimumAfterDownsampling = Math.min(targetCoverage, originalStream.getMinReadsPerStack()); - int stackMaximumAfterDownsampling = targetCoverage; - - if ( minReadsPerStack < stackMinimumAfterDownsampling ) { - throw new ReviewedStingException("stack had fewer than the minimum number of reads after downsampling"); - } - if ( maxReadsPerStack > stackMaximumAfterDownsampling ) { - throw new ReviewedStingException("stack had more than the maximum number of reads after downsampling"); - } - } - else if ( minReadsPerStack != null || maxReadsPerStack != null ) { - throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); - } - - if ( originalStream.getNumStacksPerContig() > 1 ) { - if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { - throw new ReviewedStingException("stacks were separated by less than the minimum distance"); - } - if ( maxDistanceBetweenStacks 
> originalStream.getMaxDistanceBetweenStacks() ) { - throw new ReviewedStingException("stacks were separated by more than the maximum distance"); - } - } - else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { - throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); - } - - if ( minReadLength < originalStream.getMinReadLength() ) { - throw new ReviewedStingException("read was shorter than the minimum allowed length"); - } - if ( maxReadLength > originalStream.getMaxReadLength() ) { - throw new ReviewedStingException("read was longer than the maximum allowed length"); - } - - if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { - throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", - originalStream.getNumUnmappedReads(), numUnmappedReads)); - } - - if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && - numUnmappedReads != totalReads ) { - throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java deleted file mode 100644 index 75d0448c4..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to 
permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -public class ReservoirDownsamplerUnitTest extends BaseTest { - - private static class ReservoirDownsamplerTest extends TestDataProvider { - int reservoirSize; - int totalReads; - int expectedNumReadsAfterDownsampling; - int expectedNumDiscardedItems; - - public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) { - super(ReservoirDownsamplerTest.class); - - this.reservoirSize = reservoirSize; - this.totalReads = totalReads; - - expectedNumReadsAfterDownsampling = Math.min(reservoirSize, totalReads); - expectedNumDiscardedItems = totalReads <= reservoirSize ? 
0 : totalReads - reservoirSize; - - setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d", - getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems)); - } - - public Collection createReads() { - Collection reads = new ArrayList(totalReads); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); - - return reads; - } - } - - @DataProvider(name = "ReservoirDownsamplerTestDataProvider") - public Object[][] createReservoirDownsamplerTestData() { - for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) { - new ReservoirDownsamplerTest(reservoirSize, 0); - for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) { - new ReservoirDownsamplerTest(reservoirSize, totalReads); - } - } - - return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class); - } - - @Test(dataProvider = "ReservoirDownsamplerTestDataProvider") - public void testReservoirDownsampler( ReservoirDownsamplerTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - - ReadsDownsampler downsampler = new ReservoirDownsampler(test.reservoirSize); - - downsampler.submit(test.createReads()); - - if ( test.totalReads > 0 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - downsampler.signalEndOfInput(); - - if ( test.totalReads > 0 ) { - 
Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - List downsampledReads = downsampler.consumeFinalizedItems(); - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - - Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); - Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); - - downsampler.reset(); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java deleted file mode 100644 index 5dc41b4a0..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java +++ /dev/null @@ -1,330 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above 
copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.*; - -public class SimplePositionalDownsamplerUnitTest extends BaseTest { - - private static class SimplePositionalDownsamplerTest extends TestDataProvider { - int targetCoverage; - int numStacks; - List stackSizes; - List expectedStackSizes; - boolean multipleContigs; - int totalInitialReads; - - public SimplePositionalDownsamplerTest( int targetCoverage, List stackSizes, boolean multipleContigs ) { - super(SimplePositionalDownsamplerTest.class); - - this.targetCoverage = targetCoverage; - this.numStacks = stackSizes.size(); - this.stackSizes = stackSizes; - this.multipleContigs = multipleContigs; - - calculateExpectedDownsampledStackSizes(); - - totalInitialReads = 0; - for ( Integer stackSize : stackSizes ) { - totalInitialReads += stackSize; - } - - setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b", - 
getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs)); - } - - public Collection createReads() { - Collection reads = new ArrayList(); - SAMFileHeader header = multipleContigs ? - ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) : - ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - int refIndex = 0; - int alignmentStart = 1; - int readLength = 100; - - for ( int i = 0; i < numStacks; i++ ) { - if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) { - refIndex++; - } - - reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo", - refIndex, alignmentStart, readLength)); - - alignmentStart += 10; - } - - return reads; - } - - private void calculateExpectedDownsampledStackSizes() { - expectedStackSizes = new ArrayList(numStacks); - - for ( Integer stackSize : stackSizes ) { - int expectedSize = targetCoverage >= stackSize ? stackSize : targetCoverage; - expectedStackSizes.add(expectedSize); - } - } - } - - @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider") - public Object[][] createSimplePositionalDownsamplerTestData() { - GenomeAnalysisEngine.resetRandomGenerator(); - - for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) { - for ( int contigs = 1; contigs <= 2; contigs++ ) { - for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { - List stackSizes = new ArrayList(numStacks); - for ( int stack = 1; stack <= numStacks; stack++ ) { - stackSizes.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(targetCoverage * 2) + 1); - } - new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1); - } - } - } - - return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class); - } - - @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" ) - public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) { - logger.warn("Running 
test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - - ReadsDownsampler downsampler = new SimplePositionalDownsampler(test.targetCoverage); - - downsampler.submit(test.createReads()); - - if ( test.numStacks > 1 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - } - else if ( test.numStacks == 1 ) { - Assert.assertFalse(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() == null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - downsampler.signalEndOfInput(); - - if ( test.numStacks > 0 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - List downsampledReads = downsampler.consumeFinalizedItems(); - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - - if ( test.numStacks == 0 ) { - Assert.assertTrue(downsampledReads.isEmpty()); - } - else { - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads); - - Assert.assertEquals(downsampledStackSizes.size(), test.numStacks); - Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes); - - int 
numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size(); - int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems(); - Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); - } - - downsampler.reset(); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); - } - - private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { - List stackSizes = new ArrayList(); - - if ( downsampledReads.isEmpty() ) { - return stackSizes; - } - - Iterator iter = downsampledReads.iterator(); - Assert.assertTrue(iter.hasNext()); - - SAMRecord previousRead = iter.next(); - int currentStackSize = 1; - - while ( iter.hasNext() ) { - SAMRecord currentRead = iter.next(); - - if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { - stackSizes.add(currentStackSize); - currentStackSize = 1; - } - else if ( currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { - Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); - } - else { - currentStackSize++; - } - - previousRead = currentRead; - } - - stackSizes.add(currentStackSize); - return stackSizes; - } - - @Test - public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() { - ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - Collection readStack = new ArrayList(); - readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100)); - downsampler.submit(readStack); - - Assert.assertFalse(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() == null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - - 
SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100); - downsampler.signalNoMoreReadsBefore(laterRead); - - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - - List downsampledReads = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(downsampledReads.size(), readStack.size()); - } - - @Test - public void testBasicUnmappedReadsSupport() { - ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - Collection readStack = new ArrayList(); - readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, - SAMRecord.NO_ALIGNMENT_START, 100)); - for ( SAMRecord read : readStack ) { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - - downsampler.submit(readStack); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeFinalizedItems(); - - // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler - Assert.assertEquals(downsampledReads.size(), readStack.size()); - - for ( SAMRecord read: downsampledReads ) { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - } - - @Test - public void testMixedMappedAndUnmappedReadsSupport() { - ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - Collection mappedReadStack = new ArrayList(); - mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100)); - for ( SAMRecord read : mappedReadStack ) { - Assert.assertFalse(read.getReadUnmappedFlag()); - } - - Collection unmappedReadStack = new ArrayList(); - 
unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, - SAMRecord.NO_ALIGNMENT_START, 100)); - for ( SAMRecord read : unmappedReadStack ) { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - - downsampler.submit(mappedReadStack); - downsampler.submit(unmappedReadStack); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeFinalizedItems(); - - // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler - Assert.assertEquals(downsampledReads.size(), 300); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100); - - int count = 1; - for ( SAMRecord read: downsampledReads ) { - if ( count <= 100 ) { - Assert.assertFalse(read.getReadUnmappedFlag()); - } - else { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - - count++; - } - } - - @Test - public void testGATKSAMRecordSupport() { - ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - List reads = new ArrayList(); - for ( int i = 0; i < 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - List downsampledReads = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(downsampledReads.size(), 10); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java deleted file mode 100644 index c148bcf84..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java +++ /dev/null @@ -1,546 +0,0 @@ -package org.broadinstitute.sting.gatk.iterators; - -import net.sf.samtools.SAMFileHeader; -import 
net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.util.CloseableIterator; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.ReadProperties; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; -import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * testing of the experimental version of LocusIteratorByState - */ -public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { - private static SAMFileHeader header; - private LocusIteratorByStateExperimental li; - private GenomeLocParser genomeLocParser; - - @BeforeClass - public void beforeClass() { - header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - } - - private final LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { - return new LocusIteratorByStateExperimental(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups()); - } - - private static ReadProperties createTestReadProperties() { - return createTestReadProperties(null); - 
} - - private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { - return new ReadProperties( - Collections.emptyList(), - new SAMFileHeader(), - false, - SAMFileReader.ValidationStringency.STRICT, - downsamplingMethod, - new ValidationExclusion(), - Collections.emptyList(), - Collections.emptyList(), - false, - (byte) -1 - ); - } - - private static class FakeCloseableIterator implements CloseableIterator { - Iterator iterator; - - public FakeCloseableIterator(Iterator it) { - iterator = it; - } - - @Override - public void close() { - return; - } - - @Override - public boolean hasNext() { - return iterator.hasNext(); - } - - @Override - public T next() { - return iterator.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Don't remove!"); - } - } - - @Test - public void testXandEQOperators() { - final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; - final byte[] bases2 = new byte[] {'A','A','A','C','A','A','A','A','A','C'}; - - // create a test version of the Reads object - ReadProperties readAttributes = createTestReadProperties(); - - SAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10); - r1.setReadBases(bases1); - r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); - r1.setCigarString("10M"); - - SAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10); - r2.setReadBases(bases2); - r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); - r2.setCigarString("3=1X5=1X"); - - SAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10); - r3.setReadBases(bases2); - r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); - r3.setCigarString("3=1X5M1X"); - - SAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10); - r4.setReadBases(bases2); - r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); - r4.setCigarString("10M"); - - List reads 
= Arrays.asList(r1, r2, r3, r4); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads,readAttributes); - - while (li.hasNext()) { - AlignmentContext context = li.next(); - ReadBackedPileup pileup = context.getBasePileup(); - Assert.assertEquals(pileup.depthOfCoverage(), 4); - } - } - - @Test - public void testIndelsInRegularPileup() { - final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; - final byte[] indelBases = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'}; - - // create a test version of the Reads object - ReadProperties readAttributes = createTestReadProperties(); - - SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); - before.setReadBases(bases); - before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); - before.setCigarString("10M"); - - SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10); - during.setReadBases(indelBases); - during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); - during.setCigarString("4M2I6M"); - - SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10); - after.setReadBases(bases); - after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); - after.setCigarString("10M"); - - List reads = Arrays.asList(before, during, after); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads,readAttributes); - - boolean foundIndel = false; - while (li.hasNext()) { - AlignmentContext context = li.next(); - ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10); - for (PileupElement p : pileup) { - if (p.isBeforeInsertion()) { - foundIndel = true; - Assert.assertEquals(p.getEventLength(), 2, "Wrong event length"); - Assert.assertEquals(p.getEventBases(), "CT", "Inserted bases are incorrect"); - break; - } - } - - } - - Assert.assertTrue(foundIndel,"Indel in pileup not found"); - } - 
- @Test - public void testWholeIndelReadInIsolation() { - final int firstLocus = 44367789; - - // create a test version of the Reads object - ReadProperties readAttributes = createTestReadProperties(); - - SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76); - indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); - indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76)); - indelOnlyRead.setCigarString("76I"); - - List reads = Arrays.asList(indelOnlyRead); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads, readAttributes); - - // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read - // and considers it to be an indel-containing read. - Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled"); - AlignmentContext alignmentContext = li.next(); - Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location."); - ReadBackedPileup basePileup = alignmentContext.getBasePileup(); - Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); - Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect"); - } - - /** - * Test to make sure that reads supporting only an indel (example cigar string: 76I) do - * not negatively influence the ordering of the pileup. 
- */ - @Test - public void testWholeIndelRead() { - final int firstLocus = 44367788, secondLocus = firstLocus + 1; - - SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76); - leadingRead.setReadBases(Utils.dupBytes((byte)'A',76)); - leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); - leadingRead.setCigarString("1M75I"); - - SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); - indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76)); - indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); - indelOnlyRead.setCigarString("76I"); - - SAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76); - fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76)); - fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76)); - fullMatchAfterIndel.setCigarString("75I1M"); - - List reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads, createTestReadProperties()); - int currentLocus = firstLocus; - int numAlignmentContextsFound = 0; - - while(li.hasNext()) { - AlignmentContext alignmentContext = li.next(); - Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect"); - - if(currentLocus == firstLocus) { - List readsAtLocus = alignmentContext.getBasePileup().getReads(); - Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus); - Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus); - } - else if(currentLocus == secondLocus) { - List readsAtLocus = alignmentContext.getBasePileup().getReads(); - Assert.assertEquals(readsAtLocus.size(),2,"Wrong number of reads at locus " + currentLocus); - 
Assert.assertSame(readsAtLocus.get(0),indelOnlyRead,"indelOnlyRead absent from pileup at locus " + currentLocus); - Assert.assertSame(readsAtLocus.get(1),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus); - } - - currentLocus++; - numAlignmentContextsFound++; - } - - Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts"); - } - - /** - * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly - */ - @Test - public void testWholeIndelReadRepresentedTest() { - final int firstLocus = 44367788, secondLocus = firstLocus + 1; - - SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1); - read1.setReadBases(Utils.dupBytes((byte) 'A', 1)); - read1.setBaseQualities(Utils.dupBytes((byte) '@', 1)); - read1.setCigarString("1I"); - - List reads = Arrays.asList(read1); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads, createTestReadProperties()); - - while(li.hasNext()) { - AlignmentContext alignmentContext = li.next(); - ReadBackedPileup p = alignmentContext.getBasePileup(); - Assert.assertTrue(p.getNumberOfElements() == 1); - PileupElement pe = p.iterator().next(); - Assert.assertTrue(pe.isBeforeInsertion()); - Assert.assertFalse(pe.isAfterInsertion()); - Assert.assertEquals(pe.getEventBases(), "A"); - } - - SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10); - read2.setReadBases(Utils.dupBytes((byte) 'A', 10)); - read2.setBaseQualities(Utils.dupBytes((byte) '@', 10)); - read2.setCigarString("10I"); - - reads = Arrays.asList(read2); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads, createTestReadProperties()); - - while(li.hasNext()) { - AlignmentContext alignmentContext = li.next(); - ReadBackedPileup p = alignmentContext.getBasePileup(); - Assert.assertTrue(p.getNumberOfElements() == 
1); - PileupElement pe = p.iterator().next(); - Assert.assertTrue(pe.isBeforeInsertion()); - Assert.assertFalse(pe.isAfterInsertion()); - Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA"); - } - } - - //////////////////////////////////////////// - // comprehensive LIBS/PileupElement tests // - //////////////////////////////////////////// - - private static final int IS_BEFORE_DELETED_BASE_FLAG = 1; - private static final int IS_BEFORE_DELETION_START_FLAG = 2; - private static final int IS_AFTER_DELETED_BASE_FLAG = 4; - private static final int IS_AFTER_DELETION_END_FLAG = 8; - private static final int IS_BEFORE_INSERTION_FLAG = 16; - private static final int IS_AFTER_INSERTION_FLAG = 32; - private static final int IS_NEXT_TO_SOFTCLIP_FLAG = 64; - - private static class LIBSTest { - - - final String cigar; - final int readLength; - final List offsets; - final List flags; - - private LIBSTest(final String cigar, final int readLength, final List offsets, final List flags) { - this.cigar = cigar; - this.readLength = readLength; - this.offsets = offsets; - this.flags = flags; - } - } - - @DataProvider(name = "LIBSTest") - public Object[][] createLIBSTestData() { - return new Object[][]{ - {new LIBSTest("1I", 1, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, - {new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, - {new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))}, - {new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))}, - //TODO -- uncomment these when LIBS is fixed - //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, - //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, - //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), 
Arrays.asList(IS_AFTER_INSERTION_FLAG))}, - {new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))}, - {new LIBSTest("1S1M", 2, Arrays.asList(1), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, - {new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, - {new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))} - }; - } - - @Test(dataProvider = "LIBSTest") - public void testLIBS(LIBSTest params) { - final int locus = 44367788; - - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength)); - read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength)); - read.setCigarString(params.cigar); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(Arrays.asList(read), createTestReadProperties()); - - int offset = 0; - while ( li.hasNext() ) { - AlignmentContext alignmentContext = li.next(); - ReadBackedPileup p = alignmentContext.getBasePileup(); - Assert.assertTrue(p.getNumberOfElements() == 1); - PileupElement pe = p.iterator().next(); - - final int flag = params.flags.get(offset); - Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0); - Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0); - Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0); - Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0); - Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0); - Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0); - Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0); - - 
Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue()); - - offset++; - } - } - - //////////////////////////////////////////////// - // End comprehensive LIBS/PileupElement tests // - //////////////////////////////////////////////// - - - /////////////////////////////////////// - // Read State Manager Tests // - /////////////////////////////////////// - - private class PerSampleReadStateManagerTest extends TestDataProvider { - private List readCountsPerAlignmentStart; - private List reads; - private List> recordStatesByAlignmentStart; - private int removalInterval; - - public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { - super(PerSampleReadStateManagerTest.class); - - this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; - this.removalInterval = removalInterval; - - reads = new ArrayList(); - recordStatesByAlignmentStart = new ArrayList>(); - - setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", - getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); - } - - public void run() { - LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList(), createTestReadProperties()); - LocusIteratorByStateExperimental.ReadStateManager readStateManager = - libs.new ReadStateManager(new ArrayList().iterator()); - LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = - readStateManager.new PerSampleReadStateManager(); - - makeReads(); - - for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { - perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); - } - - // read state manager should have the right number of reads - Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); - - Iterator originalReadsIterator = reads.iterator(); - Iterator recordStateIterator = perSampleReadStateManager.iterator(); - int recordStateCount = 0; - int numReadStatesRemoved = 0; - - 
// Do a first-pass validation of the record state iteration by making sure we get back everything we - // put in, in the same order, doing any requested removals of read states along the way - while ( recordStateIterator.hasNext() ) { - LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); - recordStateCount++; - SAMRecord readFromPerSampleReadStateManager = readState.getRead(); - - Assert.assertTrue(originalReadsIterator.hasNext()); - SAMRecord originalRead = originalReadsIterator.next(); - - // The read we get back should be literally the same read in memory as we put in - Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); - - // If requested, remove a read state every removalInterval states - if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { - recordStateIterator.remove(); - numReadStatesRemoved++; - } - } - - Assert.assertFalse(originalReadsIterator.hasNext()); - - // If we removed any read states, do a second pass through the read states to make sure the right - // states were removed - if ( numReadStatesRemoved > 0 ) { - Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); - - originalReadsIterator = reads.iterator(); - recordStateIterator = perSampleReadStateManager.iterator(); - int readCount = 0; - int readStateCount = 0; - - // Match record states with the reads that should remain after removal - while ( recordStateIterator.hasNext() ) { - LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); - readStateCount++; - SAMRecord readFromPerSampleReadStateManager = readState.getRead(); - - Assert.assertTrue(originalReadsIterator.hasNext()); - - SAMRecord originalRead = originalReadsIterator.next(); - readCount++; - - if ( readCount % removalInterval == 0 ) { - originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded - readCount++; - } - - // The read we 
get back should be literally the same read in memory as we put in (after accounting for removals) - Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); - } - - Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); - } - - // Allow memory used by this test to be reclaimed - readCountsPerAlignmentStart = null; - reads = null; - recordStatesByAlignmentStart = null; - } - - private void makeReads() { - int alignmentStart = 1; - - for ( int readsThisStack : readCountsPerAlignmentStart ) { - ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); - ArrayList stackRecordStates = new ArrayList(); - - for ( SAMRecord read : stackReads ) { - stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read)); - } - - reads.addAll(stackReads); - recordStatesByAlignmentStart.add(stackRecordStates); - } - } - } - - @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") - public Object[][] createPerSampleReadStateManagerTests() { - for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), - Arrays.asList(2), - Arrays.asList(10), - Arrays.asList(1, 1), - Arrays.asList(2, 2), - Arrays.asList(10, 10), - Arrays.asList(1, 10), - Arrays.asList(10, 1), - Arrays.asList(1, 1, 1), - Arrays.asList(2, 2, 2), - Arrays.asList(10, 10, 10), - Arrays.asList(1, 1, 1, 1, 1, 1), - Arrays.asList(10, 10, 10, 10, 10, 10), - Arrays.asList(1, 2, 10, 1, 2, 10) - ) ) { - - for ( int removalInterval : Arrays.asList(0, 2, 3) ) { - new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); - } - } - - return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); - } - - @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") - public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { - logger.warn("Running test: " + test); - - 
test.run(); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java deleted file mode 100644 index 5b052454a..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java +++ /dev/null @@ -1,166 +0,0 @@ -package org.broadinstitute.sting.utils; - -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.Test; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileHeader; - -import java.util.*; - -/** - * Basic tests to prove the integrity of the reservoir downsampler. - * At the moment, always run tests on SAM records as that's the task - * for which the downsampler was conceived. - * - * @author mhanna - * @version 0.1 - */ -public class LegacyReservoirDownsamplerUnitTest { - private static final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,200); - - - @Test - public void testEmptyIterator() { - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); - Assert.assertTrue(downsampler.isEmpty(),"Downsampler is not empty but should be."); - } - - @Test - public void testOneElementWithPoolSizeOne() { - List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); - downsampler.addAll(reads); - - Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); - Collection batchedReads = downsampler.getDownsampledContents(); - Assert.assertEquals(batchedReads.size(), 1, "Downsampler is returning the wrong number of reads"); - Assert.assertSame(batchedReads.iterator().next(), reads.get(0), "Downsampler is returning an incorrect read"); - } - - @Test - public void 
testOneElementWithPoolSizeGreaterThanOne() { - List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); - downsampler.addAll(reads); - - Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); - Collection batchedReads = downsampler.getDownsampledContents(); - Assert.assertEquals(batchedReads.size(), 1, "Downsampler is returning the wrong number of reads"); - Assert.assertSame(batchedReads.iterator().next(), reads.get(0), "Downsampler is returning an incorrect read"); - - } - - @Test - public void testPoolFilledPartially() { - List reads = new ArrayList(); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); - downsampler.addAll(reads); - - Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); - List batchedReads = new ArrayList(downsampler.getDownsampledContents()); - Assert.assertEquals(batchedReads.size(), 3, "Downsampler is returning the wrong number of reads"); - - Assert.assertSame(batchedReads.get(0), reads.get(0), "Downsampler read 1 is incorrect"); - Assert.assertSame(batchedReads.get(1), reads.get(1), "Downsampler read 2 is incorrect"); - Assert.assertSame(batchedReads.get(2), reads.get(2), "Downsampler read 3 is incorrect"); - } - - @Test - public void testPoolFilledExactly() { - List reads = new ArrayList(); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); - 
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); - downsampler.addAll(reads); - - Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); - List batchedReads = new ArrayList(downsampler.getDownsampledContents()); - Assert.assertEquals(batchedReads.size(), 5, "Downsampler is returning the wrong number of reads"); - Assert.assertSame(batchedReads.iterator().next(), reads.get(0), "Downsampler is returning an incorrect read"); - - Assert.assertSame(batchedReads.get(0), reads.get(0), "Downsampler read 1 is incorrect"); - Assert.assertSame(batchedReads.get(1), reads.get(1), "Downsampler read 2 is incorrect"); - Assert.assertSame(batchedReads.get(2), reads.get(2), "Downsampler read 3 is incorrect"); - Assert.assertSame(batchedReads.get(3), reads.get(3), "Downsampler read 4 is incorrect"); - Assert.assertSame(batchedReads.get(4), reads.get(4), "Downsampler read 5 is incorrect"); - } - - @Test - public void testLargerPileWithZeroElementPool() { - List reads = new ArrayList(); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(0); - downsampler.addAll(reads); - - Assert.assertTrue(downsampler.isEmpty(),"Downsampler isn't empty but should be"); - List batchedReads = new ArrayList(downsampler.getDownsampledContents()); - Assert.assertEquals(batchedReads.size(), 0, "Downsampler is returning the wrong number of reads"); - } - - @Test - public void testLargerPileWithSingleElementPool() { - List reads = new ArrayList(); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); - 
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); - downsampler.addAll(reads); - - Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); - List batchedReads = new ArrayList(downsampler.getDownsampledContents()); - Assert.assertEquals(batchedReads.size(), 1, "Downsampler is returning the wrong number of reads"); - Assert.assertTrue(reads.contains(batchedReads.get(0)),"Downsampler is returning a bad read."); - } - - @Test - public void testFillingAcrossLoci() { - List reads = new ArrayList(); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); - downsampler.addAll(reads); - - Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); - List batchedReads = new ArrayList(downsampler.getDownsampledContents()); - Assert.assertEquals(batchedReads.size(), 1, "Downsampler is returning the wrong number of reads"); - Assert.assertEquals(batchedReads.get(0), reads.get(0), "Downsampler is returning an incorrect read."); - - reads.clear(); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,2,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,2,76)); - - downsampler.clear(); - downsampler.addAll(reads); - - Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); - batchedReads = new ArrayList(downsampler.getDownsampledContents()); - Assert.assertEquals(batchedReads.size(), 2, "Downsampler is returning the wrong number of reads"); - Assert.assertEquals(batchedReads.get(0), reads.get(0), "Downsampler is returning an incorrect read."); - Assert.assertEquals(batchedReads.get(1), reads.get(1), "Downsampler is returning an incorrect 
read."); - - reads.clear(); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,3,76)); - reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,3,76)); - - downsampler.clear(); - downsampler.addAll(reads); - - Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); - batchedReads = new ArrayList(downsampler.getDownsampledContents()); - Assert.assertEquals(batchedReads.size(), 2, "Downsampler is returning the wrong number of reads"); - Assert.assertEquals(batchedReads.get(0), reads.get(0), "Downsampler is returning an incorrect read."); - Assert.assertEquals(batchedReads.get(1), reads.get(1), "Downsampler is returning an incorrect read."); - } - -} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java deleted file mode 100644 index b3365c13c..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java +++ /dev/null @@ -1,71 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.LinkedBlockingDeque; - -/** - * UnitTests for the InputProducer - * - * User: depristo - * Date: 8/24/12 - * Time: 11:25 AM - * To change this template use File | Settings | File Templates. 
- */ -public class InputProducerUnitTest extends BaseTest { - @DataProvider(name = "InputProducerTest") - public Object[][] createInputProducerTest() { - List tests = new ArrayList(); - - for ( final int nElements : Arrays.asList(0, 1, 10, 100, 1000, 10000, 100000) ) { - for ( final int queueSize : Arrays.asList(1, 10, 100) ) { - tests.add(new Object[]{ nElements, queueSize }); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "InputProducerTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) - public void testInputProducer(final int nElements, final int queueSize) throws InterruptedException { - final List elements = new ArrayList(nElements); - for ( int i = 0; i < nElements; i++ ) elements.add(i); - - final LinkedBlockingDeque.InputValue> readQueue = - new LinkedBlockingDeque.InputValue>(queueSize); - - final InputProducer ip = new InputProducer(elements.iterator(), null, readQueue); - - final ExecutorService es = Executors.newSingleThreadExecutor(); - es.submit(ip); - - int lastValue = -1; - int nRead = 0; - while ( true ) { - final int observedQueueSize = readQueue.size(); - Assert.assertTrue(observedQueueSize <= queueSize, - "Reader is enqueuing more elements " + observedQueueSize + " than allowed " + queueSize); - - final InputProducer.InputValue value = readQueue.take(); - if ( value.isLast() ) { - Assert.assertEquals(nRead, nElements, "Number of input values " + nRead + " not all that are expected " + nElements); - Assert.assertEquals(readQueue.size(), 0, "Last queue element found but queue contains more values!"); - break; - } else { - Assert.assertTrue(lastValue < value.getValue(), "Read values coming out of order!"); - final int expected = lastValue + 1; - Assert.assertEquals((int)value.getValue(), expected, "Value observed " + value.getValue() + " not equal to the expected value " + expected); - nRead++; - lastValue = value.getValue(); - } - } - } -} diff --git 
a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java deleted file mode 100644 index 47dcc1d5e..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ /dev/null @@ -1,182 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -import org.apache.log4j.BasicConfigurator; -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; - -/** - * UnitTests for the NanoScheduler - * - * User: depristo - * Date: 8/24/12 - * Time: 11:25 AM - * To change this template use File | Settings | File Templates. - */ -public class NanoSchedulerUnitTest extends BaseTest { - public static final int NANO_SCHEDULE_MAX_RUNTIME = 60000; - - private static class Map2x implements NSMapFunction { - @Override public Integer apply(Integer input) { return input * 2; } - } - - private static class ReduceSum implements NSReduceFunction { - int prevOne = Integer.MIN_VALUE; - - @Override public Integer apply(Integer one, Integer sum) { - Assert.assertTrue(prevOne < one, "Reduce came in out of order. 
Prev " + prevOne + " cur " + one); - return one + sum; - } - } - - private static class ProgressCallback implements NSProgressFunction { - int callBacks = 0; - - @Override - public void progress(Integer lastMapInput) { - callBacks++; - } - } - - - private static int sum2x(final int start, final int end) { - int sum = 0; - for ( int i = start; i < end; i++ ) - sum += 2 * i; - return sum; - } - - private static class NanoSchedulerBasicTest extends TestDataProvider { - final int bufferSize, nThreads, start, end, expectedResult; - - public NanoSchedulerBasicTest(final int bufferSize, final int nThreads, final int start, final int end) { - super(NanoSchedulerBasicTest.class); - this.bufferSize = bufferSize; - this.nThreads = nThreads; - this.start = start; - this.end = end; - this.expectedResult = sum2x(start, end); - setName(String.format("%s nt=%d buf=%d start=%d end=%d sum=%d", - getClass().getSimpleName(), nThreads, bufferSize, start, end, expectedResult)); - } - - public Iterator makeReader() { - final List ints = new ArrayList(); - for ( int i = start; i < end; i++ ) - ints.add(i); - return ints.iterator(); - } - - public int nExpectedCallbacks() { - int nElements = Math.max(end - start, 0); - return nElements / bufferSize; - } - - public Map2x makeMap() { return new Map2x(); } - public Integer initReduce() { return 0; } - public ReduceSum makeReduce() { return new ReduceSum(); } - } - - static NanoSchedulerBasicTest exampleTest = null; - @DataProvider(name = "NanoSchedulerBasicTest") - public Object[][] createNanoSchedulerBasicTest() { - for ( final int bufferSize : Arrays.asList(1, 10, 1000, 1000000) ) { - for ( final int nt : Arrays.asList(1, 2, 4) ) { - for ( final int start : Arrays.asList(0) ) { - for ( final int end : Arrays.asList(0, 1, 2, 11, 10000, 100000) ) { - exampleTest = new NanoSchedulerBasicTest(bufferSize, nt, start, end); - } - } - } - } - - return NanoSchedulerBasicTest.getTests(NanoSchedulerBasicTest.class); - } - - @Test(enabled = true, 
dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME) - public void testSingleThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { - logger.warn("Running " + test); - if ( test.nThreads == 1 ) - testNanoScheduler(test); - } - - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME, dependsOnMethods = "testSingleThreadedNanoScheduler") - public void testMultiThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { - logger.warn("Running " + test); - if ( test.nThreads >= 1 ) - testNanoScheduler(test); - } - - private void testNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { - final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.nThreads); - - final ProgressCallback callback = new ProgressCallback(); - nanoScheduler.setProgressFunction(callback); - - Assert.assertEquals(nanoScheduler.getInputBufferSize(), test.bufferSize, "inputBufferSize argument"); - Assert.assertEquals(nanoScheduler.getnThreads(), test.nThreads, "nThreads argument"); - - final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); - Assert.assertNotNull(sum); - Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); - - Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected. 
Expected at least " + test.nExpectedCallbacks() + " but saw only " + callback.callBacks); - nanoScheduler.shutdown(); - } - - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) - public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { - if ( test.bufferSize > 1) { - logger.warn("Running " + test); - - final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.nThreads); - - // test reusing the scheduler - for ( int i = 0; i < 10; i++ ) { - final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); - Assert.assertNotNull(sum); - Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); - } - - nanoScheduler.shutdown(); - } - } - - @Test(timeOut = NANO_SCHEDULE_MAX_RUNTIME) - public void testShutdown() throws InterruptedException { - final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); - Assert.assertFalse(nanoScheduler.isShutdown(), "scheduler should be alive"); - nanoScheduler.shutdown(); - Assert.assertTrue(nanoScheduler.isShutdown(), "scheduler should be dead"); - } - - @Test(expectedExceptions = IllegalStateException.class, timeOut = NANO_SCHEDULE_MAX_RUNTIME) - public void testShutdownExecuteFailure() throws InterruptedException { - final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); - nanoScheduler.shutdown(); - nanoScheduler.execute(exampleTest.makeReader(), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); - } - - public static void main(String [ ] args) { - org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); - BasicConfigurator.configure(); - logger.setLevel(org.apache.log4j.Level.DEBUG); - - final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); - final 
NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.nThreads); - nanoScheduler.setDebug(true); - - final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); - System.out.printf("Sum = %d, expected =%d%n", sum, test.expectedResult); - nanoScheduler.shutdown(); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java deleted file mode 100644 index 61d1330bc..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java +++ /dev/null @@ -1,94 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.*; - -/** - * UnitTests for the InputProducer - * - * User: depristo - * Date: 8/24/12 - * Time: 11:25 AM - * To change this template use File | Settings | File Templates. 
- */ -public class ReducerThreadUnitTest extends BaseTest { - @DataProvider(name = "ReducerThreadTest") - public Object[][] createReducerThreadTest() { - List tests = new ArrayList(); - - for ( final int nElements : Arrays.asList(0, 1, 10, 100, 1000, 10000, 100000) ) { - tests.add(new Object[]{ nElements }); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "ReducerThreadTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) - public void testReducerThreadTest(final int nElements) throws Exception { - List values = new ArrayList(nElements); - List jobIDs = new ArrayList(nElements); - for ( int i = 0; i < nElements; i++ ) { - values.add(i); - jobIDs.add(i); - } - - runTests(values, jobIDs); - } - - @Test(enabled = true, timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME, expectedExceptions = ExecutionException.class) - public void testReducerThreadTestByJobOrder() throws Exception { - runTests(Arrays.asList(0, 1, 2), Arrays.asList(1, 3, 2)); - } - - private void runTests( final List mapValues, final List jobIDs) throws Exception { - final LinkedBlockingDeque>> mapResultsQueue = - new LinkedBlockingDeque>>(mapValues.size()+1); - - for ( int i = 0; i < mapValues.size(); i++ ) { - final int value = mapValues.get(i); - final int jobID = jobIDs.get(i); - final MapResult mapResult = new MapResult(value, jobID); - mapResultsQueue.add(new FutureValue>(mapResult)); - } - mapResultsQueue.add(new FutureValue>(new MapResult())); - - final ReduceSumTest reduce = new ReduceSumTest(mapResultsQueue); - final ReducerThread thread - = new ReducerThread(reduce, null, 0, mapResultsQueue); - - final ExecutorService es = Executors.newSingleThreadExecutor(); - final Future value = es.submit(thread); - value.get(); - - Assert.assertEquals(reduce.nRead, mapValues.size()); - } - - public class ReduceSumTest implements NSReduceFunction { - final LinkedBlockingDeque>> mapResultsQueue; - int nRead = 0; - int lastValue = -1; - - public 
ReduceSumTest(LinkedBlockingDeque>> mapResultsQueue) { - this.mapResultsQueue = mapResultsQueue; - } - - @Override public Integer apply(Integer one, Integer sum) { - Assert.assertTrue(lastValue < one, "Reduce came in out of order. Prev " + lastValue + " cur " + one); - - Assert.assertTrue(lastValue < one, "Read values coming out of order!"); - final int expected = lastValue + 1; - Assert.assertEquals((int)one, expected, "Value observed " + one + " not equal to the expected value " + expected); - nRead++; - lastValue = expected; - - return one + sum; - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java deleted file mode 100644 index 74626d031..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java +++ /dev/null @@ -1,161 +0,0 @@ -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.testng.annotations.Test; -import org.testng.annotations.DataProvider; - -import org.broadinstitute.sting.BaseTest; - -public class ArtificialSingleSampleReadStreamUnitTest extends BaseTest { - - private static class ArtificialSingleSampleReadStreamTest extends TestDataProvider { - private ArtificialSingleSampleReadStream stream; - private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; - - public ArtificialSingleSampleReadStreamTest( ArtificialSingleSampleReadStream stream ) { - super(ArtificialSingleSampleReadStreamTest.class); - - this.stream = stream; - - setName(String.format("%s: numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", - getClass().getSimpleName(), - 
stream.getNumContigs(), - stream.getNumStacksPerContig(), - stream.getMinReadsPerStack(), - stream.getMaxReadsPerStack(), - stream.getMinDistanceBetweenStacks(), - stream.getMaxDistanceBetweenStacks(), - stream.getMinReadLength(), - stream.getMaxReadLength(), - stream.getNumUnmappedReads())); - } - - public void run() { - streamAnalyzer= new ArtificialSingleSampleReadStreamAnalyzer(stream); - - streamAnalyzer.analyze(stream); - - // Check whether the observed properties of the stream match its nominal properties - streamAnalyzer.validate(); - } - } - - @DataProvider(name = "ArtificialSingleSampleReadStreamTestDataProvider") - public Object[][] createArtificialSingleSampleReadStreamTests() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); - String readGroupID = "testReadGroup"; - SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); - readGroup.setSample("testSample"); - header.addReadGroup(readGroup); - - GenomeAnalysisEngine.resetRandomGenerator(); - - // brute force testing! 
- for ( int numContigs = 0; numContigs <= 2; numContigs++ ) { - for ( int stacksPerContig = 0; stacksPerContig <= 2; stacksPerContig++ ) { - for ( int minReadsPerStack = 1; minReadsPerStack <= 2; minReadsPerStack++ ) { - for ( int maxReadsPerStack = 1; maxReadsPerStack <= 3; maxReadsPerStack++ ) { - for ( int minDistanceBetweenStacks = 1; minDistanceBetweenStacks <= 2; minDistanceBetweenStacks++ ) { - for ( int maxDistanceBetweenStacks = 1; maxDistanceBetweenStacks <= 3; maxDistanceBetweenStacks++ ) { - for ( int minReadLength = 1; minReadLength <= 2; minReadLength++ ) { - for ( int maxReadLength = 1; maxReadLength <= 3; maxReadLength++ ) { - for ( int numUnmappedReads = 0; numUnmappedReads <= 2; numUnmappedReads++ ) { - // Only test sane combinations here - if ( minReadsPerStack <= maxReadsPerStack && - minDistanceBetweenStacks <= maxDistanceBetweenStacks && - minReadLength <= maxReadLength && - ((numContigs > 0 && stacksPerContig > 0) || (numContigs == 0 && stacksPerContig == 0)) ) { - - new ArtificialSingleSampleReadStreamTest(new ArtificialSingleSampleReadStream(header, - readGroupID, - numContigs, - stacksPerContig, - minReadsPerStack, - maxReadsPerStack, - minDistanceBetweenStacks, - maxDistanceBetweenStacks, - minReadLength, - maxReadLength, - numUnmappedReads)); - } - } - } - } - } - } - } - } - } - } - - return ArtificialSingleSampleReadStreamTest.getTests(ArtificialSingleSampleReadStreamTest.class); - } - - @Test(dataProvider = "ArtificialSingleSampleReadStreamTestDataProvider") - public void testArtificialSingleSampleReadStream( ArtificialSingleSampleReadStreamTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - test.run(); - } - - @DataProvider(name = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider") - public Object[][] createInvalidArgumentsTests() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); - String readGroupID = "testReadGroup"; - 
header.addReadGroup(new SAMReadGroupRecord(readGroupID)); - - return new Object[][] { - {"testNullHeader", null, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, 0}, - {"testNullReadGroup", header, null, 1, 1, 1, 2, 1, 2, 1, 2, 0}, - {"testInvalidReadGroup", header, "foo", 1, 1, 1, 2, 1, 2, 1, 2, 0}, - {"testInvalidNumContigs", header, readGroupID, -1, 1, 1, 2, 1, 2, 1, 2, 0}, - {"testInvalidNumStacksPerContig", header, readGroupID, 1, -1, 1, 2, 1, 2, 1, 2, 0}, - {"test0ContigsNon0StacksPerContig", header, readGroupID, 0, 1, 1, 2, 1, 2, 1, 2, 0}, - {"testNon0Contigs0StacksPerContig", header, readGroupID, 1, 0, 1, 2, 1, 2, 1, 2, 0}, - {"testInvalidMinReadsPerStack", header, readGroupID, 1, 1, -1, 2, 1, 2, 1, 2, 0}, - {"testInvalidMaxReadsPerStack", header, readGroupID, 1, 1, 1, -2, 1, 2, 1, 2, 0}, - {"testInvalidMinDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, -1, 2, 1, 2, 0}, - {"testInvalidMaxDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, 1, -2, 1, 2, 0}, - {"testInvalidMinReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, -1, 2, 0}, - {"testInvalidMaxReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, -2, 0}, - {"testInvalidReadsPerStackRange", header, readGroupID, 1, 1, 2, 1, 1, 2, 1, 2, 0}, - {"testInvalidDistanceBetweenStacksRange", header, readGroupID, 1, 1, 1, 2, 2, 1, 1, 2, 0}, - {"testInvalidReadLengthRange", header, readGroupID, 1, 1, 1, 2, 1, 2, 2, 1, 0}, - {"testInvalidNumUnmappedReads", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, -1}, - }; - } - - @Test(dataProvider = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider", - expectedExceptions = ReviewedStingException.class) - public void testInvalidArguments( String testName, - SAMFileHeader header, - String readGroupID, - int numContigs, - int numStacksPerContig, - int minReadsPerStack, - int maxReadsPerStack, - int minDistanceBetweenStacks, - int maxDistanceBetweenStacks, - int minReadLength, - int maxReadLength, - int numUnmappedReads ) { - - logger.warn("Running test: " + 
testName); - - ArtificialSingleSampleReadStream stream = new ArtificialSingleSampleReadStream(header, - readGroupID, - numContigs, - numStacksPerContig, - minReadsPerStack, - maxReadsPerStack, - minDistanceBetweenStacks, - maxDistanceBetweenStacks, - minReadLength, - maxReadLength, - numUnmappedReads); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java deleted file mode 100755 index 7381bebc4..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ -package org.broadinstitute.sting.utils.threading; - -import org.apache.log4j.Priority; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; - -/** - * Tests for the state monitoring thread factory. - */ -public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { - // the duration of the tests -- 100 ms is tolerable given the number of tests we are doing - private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 100000; - private final static int MAX_THREADS = 4; - final static Object GLOBAL_LOCK = new Object(); - - private class StateTest extends TestDataProvider { - private final double TOLERANCE = 0.1; // willing to tolerate a 10% error - - final List statesForThreads; - - public StateTest(final List statesForThreads) { - super(StateTest.class); - this.statesForThreads = statesForThreads; - setName("StateTest " + Utils.join(",", statesForThreads)); - } - - public List getStatesForThreads() { - return statesForThreads; - } - - public int getNStates() { return statesForThreads.size(); } - - public double maxStatePercent(final EfficiencyMonitoringThreadFactory.State state) { return 100*(fraction(state) + TOLERANCE); } - public double minStatePercent(final EfficiencyMonitoringThreadFactory.State state) { return 100*(fraction(state) - TOLERANCE); } - - private double fraction(final EfficiencyMonitoringThreadFactory.State state) { - return Collections.frequency(statesForThreads, state) / (1.0 * statesForThreads.size()); - } - } - - /** - * Test helper threading class that puts the 
thread into RUNNING, BLOCKED, or WAITING state as - * requested for input argument - */ - private static class StateTestThread implements Callable { - private final EfficiencyMonitoringThreadFactory.State stateToImplement; - - private StateTestThread(final EfficiencyMonitoringThreadFactory.State stateToImplement) { - this.stateToImplement = stateToImplement; - } - - @Override - public Double call() throws Exception { - switch ( stateToImplement ) { - case USER_CPU: - // do some work until we get to THREAD_TARGET_DURATION_IN_MILLISECOND - double sum = 0.0; - final long startTime = System.currentTimeMillis(); - for ( int i = 1; System.currentTimeMillis() - startTime < (THREAD_TARGET_DURATION_IN_MILLISECOND - 1); i++ ) { - sum += Math.log10(i); - } - return sum; - case WAITING: - Thread.currentThread().sleep(THREAD_TARGET_DURATION_IN_MILLISECOND); - return 0.0; - case BLOCKING: - if ( EfficiencyMonitoringThreadFactory.DEBUG ) logger.warn("Blocking..."); - synchronized (GLOBAL_LOCK) { - // the GLOBAL_LOCK must be held by the unit test itself for this to properly block - if ( EfficiencyMonitoringThreadFactory.DEBUG ) logger.warn(" ... 
done blocking"); - } - return 0.0; - case WAITING_FOR_IO: - // TODO -- implement me - // shouldn't ever get here, throw an exception - throw new ReviewedStingException("WAITING_FOR_IO testing currently not implemented, until we figure out how to force a system call block"); - default: - throw new ReviewedStingException("Unexpected thread test state " + stateToImplement); - } - } - } - - @DataProvider(name = "StateTest") - public Object[][] createStateTest() { - for ( final int nThreads : Arrays.asList(3) ) { - //final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.WAITING_FOR_IO); - final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.USER_CPU, EfficiencyMonitoringThreadFactory.State.WAITING, EfficiencyMonitoringThreadFactory.State.BLOCKING); - //final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.values()); - for (final List states : Utils.makePermutations(allStates, nThreads, true) ) { - //if ( Collections.frequency(states, Thread.State.BLOCKED) > 0) - new StateTest(states); - } - } - - return StateTest.getTests(StateTest.class); - } - - @Test(enabled = true, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) - public void testStateTest(final StateTest test) throws InterruptedException { - // allows us to test blocking - final EfficiencyMonitoringThreadFactory factory = new EfficiencyMonitoringThreadFactory(test.getNStates()); - final ExecutorService threadPool = Executors.newFixedThreadPool(test.getNStates(), factory); - - logger.warn("Running " + test); - synchronized (GLOBAL_LOCK) { - //logger.warn(" Have lock"); - for ( final EfficiencyMonitoringThreadFactory.State threadToRunState : test.getStatesForThreads() ) - threadPool.submit(new StateTestThread(threadToRunState)); - - // lock has to be here for the whole running of the activeThreads but end before the sleep so the blocked activeThreads - // can block for their allotted time - 
threadPool.shutdown(); - Thread.sleep(THREAD_TARGET_DURATION_IN_MILLISECOND); - } - //logger.warn(" Releasing lock"); - threadPool.awaitTermination(10, TimeUnit.SECONDS); - //logger.warn(" done awaiting termination"); - //logger.warn(" waiting for all activeThreads to complete"); - factory.waitForAllThreadsToComplete(); - //logger.warn(" done waiting for activeThreads"); - - // make sure we counted everything properly - final long totalTime = factory.getTotalTime(); - final long minTime = (long)(THREAD_TARGET_DURATION_IN_MILLISECOND * 0.5) * test.getNStates(); - final long maxTime = (long)(THREAD_TARGET_DURATION_IN_MILLISECOND * 1.5) * test.getNStates(); - //logger.warn("Testing total time"); - Assert.assertTrue(totalTime >= minTime, "Factory results not properly accumulated: totalTime = " + totalTime + " < minTime = " + minTime); - Assert.assertTrue(totalTime <= maxTime, "Factory results not properly accumulated: totalTime = " + totalTime + " > maxTime = " + maxTime); - - for (final EfficiencyMonitoringThreadFactory.State state : EfficiencyMonitoringThreadFactory.State.values() ) { - final double min = test.minStatePercent(state); - final double max = test.maxStatePercent(state); - final double obs = factory.getStatePercent(state); -// logger.warn(" Checking " + state -// + " min " + String.format("%.2f", min) -// + " max " + String.format("%.2f", max) -// + " obs " + String.format("%.2f", obs) -// + " factor = " + factory); - Assert.assertTrue(obs >= min, "Too little time spent in state " + state + " obs " + obs + " min " + min); - Assert.assertTrue(obs <= max, "Too much time spent in state " + state + " obs " + obs + " max " + min); - } - - // we actually ran the expected number of activeThreads - Assert.assertEquals(factory.getNThreadsCreated(), test.getNStates()); - - // should be called to ensure we don't format / NPE on output - factory.printUsageInformation(logger, Priority.WARN); - } -} \ No newline at end of file From 
d2f3d6d22ff72ef29c98ce0591092469c33ca42e Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 10 Sep 2012 15:52:39 -0400 Subject: [PATCH 05/19] Revert "Separated out the DoC calculations from the XHMM pipeline, so that CalcDepthOfCoverage can be used for calculating joint coverage on a per-base accounting over multiple samples (e.g., family samples)" This reverts commit 075c56060e0ffcce39631693ef39cf5f8c3a4d5a. --- .../IntervalOverlappingRODsFromStream.java | 143 ++++ .../gatk/downsampling/DownsampleType.java | 14 + .../gatk/downsampling/DownsamplingMethod.java | 153 +++++ .../FractionalDownsamplerFactory.java | 45 ++ .../downsampling/LevelingDownsampler.java | 212 ++++++ .../PerSampleDownsamplingReadsIterator.java | 202 ++++++ .../downsampling/ReadsDownsamplerFactory.java | 37 + .../ReservoirDownsamplerFactory.java | 45 ++ .../SimplePositionalDownsampler.java | 169 +++++ .../SimplePositionalDownsamplerFactory.java | 45 ++ .../iterators/LegacyDownsampleIterator.java | 52 ++ .../LocusIteratorByStateExperimental.java | 649 ++++++++++++++++++ .../sting/gatk/iterators/ReadTransformer.java | 144 ++++ .../gatk/iterators/ReadTransformersMode.java | 28 + .../sting/gatk/samples/Trio.java | 45 ++ .../gatk/traversals/TraverseLociBase.java | 103 +++ .../gatk/traversals/TraverseLociLinear.java | 47 ++ .../gatk/traversals/TraverseLociNano.java | 205 ++++++ .../gatk/traversals/TraverseReadsNano.java | 234 +++++++ .../sting/gatk/walkers/NanoSchedulable.java | 31 + .../fasta/FastaAlternateReferenceMaker.java | 133 ++++ .../walkers/fasta/FastaReferenceMaker.java | 127 ++++ .../sting/utils/baq/BAQReadTransformer.java | 49 ++ .../utils/baq/ReadTransformingIterator.java | 44 ++ .../nanoScheduler/BlockingQueueValue.java | 82 +++ .../utils/nanoScheduler/FutureValue.java | 45 ++ .../utils/nanoScheduler/InputProducer.java | 62 ++ .../sting/utils/nanoScheduler/MapResult.java | 36 + .../utils/nanoScheduler/NSMapFunction.java | 19 + .../nanoScheduler/NSProgressFunction.java | 12 + 
.../utils/nanoScheduler/NSReduceFunction.java | 18 + .../utils/nanoScheduler/NanoScheduler.java | 392 +++++++++++ .../utils/nanoScheduler/ReducerThread.java | 65 ++ .../sting/utils/recalibration/BQSRMode.java | 30 + .../recalibration/BQSRReadTransformer.java | 40 ++ .../sam/ArtificialMultiSampleReadStream.java | 86 +++ .../sam/ArtificialSingleSampleReadStream.java | 212 ++++++ ...ificialSingleSampleReadStreamAnalyzer.java | 281 ++++++++ .../EfficiencyMonitoringThreadFactory.java | 158 +++++ .../utils/threading/NamedThreadFactory.java | 26 + .../threading/ThreadEfficiencyMonitor.java | 207 ++++++ .../InvalidArgumentIntegrationTest.java | 41 ++ .../LevelingDownsamplerUnitTest.java | 163 +++++ ...mpleDownsamplingReadsIteratorUnitTest.java | 298 ++++++++ ...ificialSingleSampleReadStreamAnalyzer.java | 126 ++++ .../ReservoirDownsamplerUnitTest.java | 129 ++++ .../SimplePositionalDownsamplerUnitTest.java | 330 +++++++++ ...usIteratorByStateExperimentalUnitTest.java | 546 +++++++++++++++ .../LegacyReservoirDownsamplerUnitTest.java | 166 +++++ .../nanoScheduler/InputProducerUnitTest.java | 71 ++ .../nanoScheduler/NanoSchedulerUnitTest.java | 182 +++++ .../nanoScheduler/ReducerThreadUnitTest.java | 94 +++ ...ificialSingleSampleReadStreamUnitTest.java | 161 +++++ ...ciencyMonitoringThreadFactoryUnitTest.java | 184 +++++ 54 files changed, 7218 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java create mode 100644 
public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java create mode 100644 
public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java create mode 100644 public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java create mode 100644 
public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java new file mode 100644 index 000000000..1e39d6836 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java @@ -0,0 +1,143 @@ +package org.broadinstitute.sting.gatk.datasources.providers; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; +import 
org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.Collection; +import java.util.LinkedList; +import java.util.ListIterator; + +/** + * Key algorithmic helper for ReadBasedReferenceOrderedData + * + * Takes a single iterator of features, and provides a single capability that returns + * the list of RODs that overlap an interval. Allows sequential getOverlapping calls + * from intervals provided that these intervals always have increasing getStart() values. + * + */ +class IntervalOverlappingRODsFromStream { + /** + * Only held for QC purposes + */ + GenomeLoc lastQuery = null; + + private final String name; + private final LinkedList currentFeatures = new LinkedList(); + private final PeekableIterator futureFeatures; + + /** + * Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and + * returns RODRecordLists having name + * + * @param name + * @param futureFeatures + */ + IntervalOverlappingRODsFromStream(final String name, final PeekableIterator futureFeatures) { + if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null"); + + this.name = name; + this.futureFeatures = futureFeatures; + } + + /** + * Get the list of RODs overlapping loc from this stream of RODs. + * + * Sequential calls to this function must obey the rule that loc2.getStart >= loc1.getStart + * + * @param loc the interval to query + * @return a non-null RODRecordList containing the overlapping RODs, which may be empty + */ + @Ensures({"overlaps(loc, result)", + "! 
futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)", + "result != null"}) + public RODRecordList getOverlapping(final GenomeLoc loc) { + if ( lastQuery != null && loc.getStart() < lastQuery.getStart() ) + throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery)); + + trimCurrentFeaturesToLoc(loc); + readOverlappingFutureFeatures(loc); + return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc); + } + + + /** + * For contract assurance. Checks that all bindings in loc overlap + * + * @param loc + * @param bindings + * @return + */ + @Requires({"loc != null", "bindings != null"}) + private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) { + for ( final GATKFeature feature : bindings ) + if ( ! feature.getLocation().overlapsP(loc) ) + return false; + return true; + } + + /** + * Subset the features in all to those that overlap with loc + * + * The current features list contains everything read that cannot be thrown away yet, but not + * everything in there necessarily overlaps with loc. Subset to just those that do overlap + * + * @param loc the location that features must overlap + * @param all the list of all features + * @return a subset of all that overlaps with loc + */ + @Requires({"loc != null", "all != null"}) + @Ensures("result.size() <= all.size()") + private Collection subsetToOverlapping(final GenomeLoc loc, final Collection all) { + final LinkedList overlapping = new LinkedList(); + for ( final GATKFeature feature : all ) + if ( feature.getLocation().overlapsP(loc) ) + overlapping.add(feature); + return overlapping; + } + + /** + * Update function. 
Remove all elements of currentFeatures that end before loc + * + * @param loc the location to use + */ + @Requires("loc != null") + @Ensures("currentFeatures.size() <= old(currentFeatures.size())") + private void trimCurrentFeaturesToLoc(final GenomeLoc loc) { + final ListIterator it = currentFeatures.listIterator(); + while ( it.hasNext() ) { + final GATKFeature feature = it.next(); + if ( feature.getLocation().isBefore(loc) ) + it.remove(); + } + } + + /** + * Update function: Read all elements from futureFeatures that overlap with loc + * + * Stops at the first element that starts before the end of loc, or the stream empties + * + * @param loc + */ + @Requires("loc != null") + @Ensures("currentFeatures.size() >= old(currentFeatures.size())") + private void readOverlappingFutureFeatures(final GenomeLoc loc) { + while ( futureFeatures.hasNext() ) { + final GenomeLoc nextLoc = futureFeatures.peek().getLocation(); + if ( nextLoc.isBefore(loc) ) { + futureFeatures.next(); // next rod element is before loc, throw it away and keep looking + } else if ( nextLoc.isPast(loc) ) { + break; // next element is past loc, stop looking but don't pop it + } else if ( nextLoc.overlapsP(loc) ) { + // add overlapping elements to our current features, removing from stream + for ( final GATKFeature feature : futureFeatures.next() ) { + currentFeatures.add(feature); + } + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java new file mode 100644 index 000000000..c3d17436a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java @@ -0,0 +1,14 @@ +package org.broadinstitute.sting.gatk.downsampling; + +/** + * Type of downsampling method to invoke. 
+ * + * @author hanna + * @version 0.1 + */ + +public enum DownsampleType { + NONE, + ALL_READS, + BY_SAMPLE +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java new file mode 100644 index 000000000..ae1d98ce0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.exceptions.UserException; + +/** + * Describes the method for downsampling reads at a given locus. 
+ */ + +public class DownsamplingMethod { + /** + * Type of downsampling to perform. + */ + public final DownsampleType type; + + /** + * Actual downsampling target is specified as an integer number of reads. + */ + public final Integer toCoverage; + + /** + * Actual downsampling target is specified as a fraction of total available reads. + */ + public final Double toFraction; + + /** + * Use the new experimental downsampling? + */ + public final boolean useExperimentalDownsampling; + + /** + * Expresses no downsampling applied at all. + */ + public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null,false); + + /** + * Default type to use if no type is specified + */ + public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; + + /** + * Default target coverage for locus-based traversals + */ + public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000; + + public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) { + this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE; + this.toCoverage = toCoverage; + this.toFraction = toFraction; + this.useExperimentalDownsampling = useExperimentalDownsampling; + + if ( type == DownsampleType.NONE ) { + toCoverage = null; + toFraction = null; + } + + validate(); + } + + private void validate() { + // Can't leave toFraction and toCoverage null unless type is NONE + if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null ) + throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling."); + + // Fraction and coverage cannot both be specified. + if ( toFraction != null && toCoverage != null ) + throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. 
Please choose only one."); + + // toCoverage must be > 0 when specified + if ( toCoverage != null && toCoverage <= 0 ) { + throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage"); + } + + // toFraction must be >= 0.0 and <= 1.0 when specified + if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { + throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); + } + + // Some restrictions only exist for the old downsampling implementation: + if ( ! useExperimentalDownsampling ) { + // By sample downsampling does not work with a fraction of reads in the old downsampling implementation + if( type == DownsampleType.BY_SAMPLE && toFraction != null ) + throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method"); + } + + // Some restrictions only exist for the new downsampling implementation: + if ( useExperimentalDownsampling ) { + if ( type == DownsampleType.ALL_READS && toCoverage != null ) { + throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation"); + } + } + } + + public String toString() { + StringBuilder builder = new StringBuilder("Downsampling Settings: "); + + if ( type == DownsampleType.NONE ) { + builder.append("No downsampling"); + } + else { + builder.append(String.format("Method: %s ", type)); + + if ( toCoverage != null ) { + builder.append(String.format("Target Coverage: %d ", toCoverage)); + } + else { + builder.append(String.format("Target Fraction: %.2f ", toFraction)); + } + + if ( useExperimentalDownsampling ) { + builder.append("Using Experimental Downsampling"); + } + } + + return builder.toString(); + } + + public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) { + if ( walker instanceof LocusWalker || walker instanceof 
ActiveRegionWalker ) { + return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE, + null, useExperimentalDownsampling); + } + else { + return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java new file mode 100644 index 000000000..7a7c9e91e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating FractionalDownsamplers on demand + * + * @author David Roazen + */ +public class FractionalDownsamplerFactory implements ReadsDownsamplerFactory { + + private double fraction; + + public FractionalDownsamplerFactory( double fraction ) { + this.fraction = fraction; + } + + public ReadsDownsampler newInstance() { + return new FractionalDownsampler(fraction); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java new file mode 100644 index 000000000..73d69140d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.*; + +/** + * Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from + * the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling + * does not occur until all Lists have been submitted and signalEndOfInput() is called. + * + * The Lists should be LinkedLists for maximum efficiency during item removal, however other + * kinds of Lists are also accepted (albeit at a slight performance penalty). + * + * Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface, + * the Lists need not contain reads. However this downsampler may not be wrapped within one of the + * DownsamplingReadsIterators + * + * @param the List type representing the stacks to be leveled + * @param the type of the elements of each List + * + * @author David Roazen + */ +public class LevelingDownsampler, E> implements Downsampler { + + private int targetSize; + + private List groups; + + private boolean groupsAreFinalized; + + private int numDiscardedItems; + + /** + * Construct a LevelingDownsampler + * + * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed + * this value -- if it does, items are removed from Lists evenly until the total size + * is <= this value + */ + public LevelingDownsampler( int targetSize ) { + this.targetSize = targetSize; + clear(); + reset(); + } + + public void submit( T item ) { + groups.add(item); + } + + public void submit( Collection items ){ + groups.addAll(items); + } + + public boolean hasFinalizedItems() { + return groupsAreFinalized && groups.size() > 0; + } + + public List consumeFinalizedItems() { + if ( ! 
hasFinalizedItems() ) { + return new ArrayList(); + } + + // pass by reference rather than make a copy, for speed + List toReturn = groups; + clear(); + return toReturn; + } + + public boolean hasPendingItems() { + return ! groupsAreFinalized && groups.size() > 0; + } + + public T peekFinalized() { + return hasFinalizedItems() ? groups.get(0) : null; + } + + public T peekPending() { + return hasPendingItems() ? groups.get(0) : null; + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + + public void signalEndOfInput() { + levelGroups(); + groupsAreFinalized = true; + } + + public void clear() { + groups = new ArrayList(); + groupsAreFinalized = false; + } + + public void reset() { + numDiscardedItems = 0; + } + + private void levelGroups() { + int totalSize = 0; + int[] groupSizes = new int[groups.size()]; + int currentGroupIndex = 0; + + for ( T group : groups ) { + groupSizes[currentGroupIndex] = group.size(); + totalSize += groupSizes[currentGroupIndex]; + currentGroupIndex++; + } + + if ( totalSize <= targetSize ) { + return; // no need to eliminate any items + } + + // We will try to remove exactly this many items, however we will refuse to allow any + // one group to fall below size 1, and so might end up removing fewer items than this + int numItemsToRemove = totalSize - targetSize; + + currentGroupIndex = 0; + int numConsecutiveUmodifiableGroups = 0; + + // Continue until we've either removed all the items we wanted to, or we can't + // remove any more items without violating the constraint that all groups must + // be left with at least one item + while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { + if ( groupSizes[currentGroupIndex] > 1 ) { + groupSizes[currentGroupIndex]--; + numItemsToRemove--; + numConsecutiveUmodifiableGroups = 0; + } + else { + numConsecutiveUmodifiableGroups++; + } + + currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length; + } + + // Now we actually go 
through and reduce each group to its new count as specified in groupSizes + currentGroupIndex = 0; + for ( T group : groups ) { + downsampleOneGroup(group, groupSizes[currentGroupIndex]); + currentGroupIndex++; + } + } + + private void downsampleOneGroup( T group, int numItemsToKeep ) { + if ( numItemsToKeep >= group.size() ) { + return; + } + + numDiscardedItems += group.size() - numItemsToKeep; + + BitSet itemsToKeep = new BitSet(group.size()); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) { + itemsToKeep.set(selectedIndex); + } + + int currentIndex = 0; + + // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator + if ( group instanceof LinkedList ) { + Iterator iter = group.iterator(); + while ( iter.hasNext() ) { + iter.next(); + + if ( ! itemsToKeep.get(currentIndex) ) { + iter.remove(); + } + + currentIndex++; + } + } + // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather + // than suffer O(n^2) of item shifting + else { + List keptItems = new ArrayList(numItemsToKeep); + + for ( E item : group ) { + if ( itemsToKeep.get(currentIndex) ) { + keptItems.add(item); + } + currentIndex++; + } + group.clear(); + group.addAll(keptItems); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java new file mode 100644 index 000000000..8b2034460 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including 
without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMRecordComparator; +import net.sf.samtools.SAMRecordCoordinateComparator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; + +import java.util.*; + + +/** + * StingSAMIterator wrapper around our generic reads downsampler interface + * that downsamples reads for each sample independently, and then re-assembles + * the reads back into a single merged stream. 
+ * + * @author David Roazen + */ +public class PerSampleDownsamplingReadsIterator implements StingSAMIterator { + + private StingSAMIterator nestedSAMIterator; + private ReadsDownsamplerFactory downsamplerFactory; + private Map> perSampleDownsamplers; + private PriorityQueue orderedDownsampledReadsCache; + private SAMRecord nextRead = null; + private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator(); + private SAMRecord earliestPendingRead = null; + private ReadsDownsampler earliestPendingDownsampler = null; + + // Initial size of our cache of finalized reads + private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096; + + // The number of positional changes that can occur in the read stream before all downsamplers + // should be informed of the current position (guards against samples with relatively sparse reads + // getting stuck in a pending state): + private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3; // TODO: experiment with this value + + /** + * @param iter wrapped iterator from which this iterator will pull reads + * @param downsamplerFactory factory used to create new downsamplers as needed + */ + public PerSampleDownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsamplerFactory downsamplerFactory ) { + nestedSAMIterator = iter; + this.downsamplerFactory = downsamplerFactory; + perSampleDownsamplers = new HashMap>(); + orderedDownsampledReadsCache = new PriorityQueue(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator); + + advanceToNextRead(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if ( nextRead == null ) { + throw new NoSuchElementException("next() called when there are no more items"); + } + + SAMRecord toReturn = nextRead; + advanceToNextRead(); + + return toReturn; + } + + private void advanceToNextRead() { + if ( ! readyToReleaseReads() && ! 
fillDownsampledReadsCache() ) { + nextRead = null; + } + else { + nextRead = orderedDownsampledReadsCache.poll(); + } + } + + private boolean readyToReleaseReads() { + if ( orderedDownsampledReadsCache.isEmpty() ) { + return false; + } + + return earliestPendingRead == null || + readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0; + } + + private void updateEarliestPendingRead( ReadsDownsampler currentDownsampler ) { + // If there is no recorded earliest pending read and this downsampler has pending items, + // then this downsampler's first pending item becomes the new earliest pending read: + if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) { + earliestPendingRead = currentDownsampler.peekPending(); + earliestPendingDownsampler = currentDownsampler; + } + // In all other cases, we only need to update the earliest pending read when the downsampler + // associated with it experiences a change in its pending reads, since by assuming a sorted + // read stream we're assured that each downsampler's earliest pending read will only increase + // in genomic position over time. + // + // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers + // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))), + // TODO: but need to verify this empirically. + else if ( currentDownsampler == earliestPendingDownsampler && + (! 
currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) { + + earliestPendingRead = null; + earliestPendingDownsampler = null; + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + if ( perSampleDownsampler.hasPendingItems() && + (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) { + + earliestPendingRead = perSampleDownsampler.peekPending(); + earliestPendingDownsampler = perSampleDownsampler; + } + } + } + } + + private boolean fillDownsampledReadsCache() { + SAMRecord prevRead = null; + int numPositionalChanges = 0; + + // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue + // can be released without violating global sort order + while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) { + SAMRecord read = nestedSAMIterator.next(); + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + + ReadsDownsampler thisSampleDownsampler = perSampleDownsamplers.get(sampleName); + if ( thisSampleDownsampler == null ) { + thisSampleDownsampler = downsamplerFactory.newInstance(); + perSampleDownsamplers.put(sampleName, thisSampleDownsampler); + } + + thisSampleDownsampler.submit(read); + updateEarliestPendingRead(thisSampleDownsampler); + + if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) { + numPositionalChanges++; + } + + // If the number of times we've changed position exceeds a certain threshold, inform all + // downsamplers of the current position in the read stream. This is to prevent downsamplers + // for samples with sparser reads than others from getting stuck too long in a pending state. 
+ if ( numPositionalChanges > DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL ) { + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + perSampleDownsampler.signalNoMoreReadsBefore(read); + updateEarliestPendingRead(perSampleDownsampler); + } + } + + prevRead = read; + } + + if ( ! nestedSAMIterator.hasNext() ) { + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + perSampleDownsampler.signalEndOfInput(); + } + earliestPendingRead = null; + earliestPendingDownsampler = null; + } + + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + if ( perSampleDownsampler.hasFinalizedItems() ) { + orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems()); + } + } + + return readyToReleaseReads(); + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + public void close() { + nestedSAMIterator.close(); + } + + public Iterator iterator() { + return this; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java new file mode 100644 index 000000000..2fa32497b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice 
shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular + * downsampler, all sharing the same construction parameters. + * + * @author David Roazen + */ +public interface ReadsDownsamplerFactory { + public ReadsDownsampler newInstance(); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java new file mode 100644 index 000000000..040f0c788 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating ReservoirDownsamplers on demand + * + * @author David Roazen + */ +public class ReservoirDownsamplerFactory implements ReadsDownsamplerFactory { + + private int targetSampleSize; + + public ReservoirDownsamplerFactory( int targetSampleSize ) { + this.targetSampleSize = targetSampleSize; + } + + public ReadsDownsampler newInstance() { + return new ReservoirDownsampler(targetSampleSize); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java new file mode 100644 index 000000000..30affc2b3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in 
all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +import java.util.*; + +/** + * Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage + * using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time. + * + * @author David Roazen + */ +public class SimplePositionalDownsampler implements ReadsDownsampler { + + private int targetCoverage; + + private ReservoirDownsampler reservoir; + + private int currentContigIndex; + + private int currentAlignmentStart; + + private boolean positionEstablished; + + private boolean unmappedReadsReached; + + private ArrayList finalizedReads; + + private int numDiscardedItems; + + /** + * Construct a SimplePositionalDownsampler + * + * @param targetCoverage Maximum number of reads that may share any given alignment start position + */ + public SimplePositionalDownsampler( int targetCoverage ) { + this.targetCoverage = targetCoverage; + reservoir = new ReservoirDownsampler(targetCoverage); + finalizedReads = new ArrayList(); + clear(); + reset(); + } + + public void submit( T newRead ) { + updatePositionalState(newRead); + + if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream + finalizedReads.add(newRead); + } + else { + int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems(); + 
reservoir.submit(newRead); + numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems; + } + } + + public void submit( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasFinalizedItems() { + return finalizedReads.size() > 0; + } + + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed + List toReturn = finalizedReads; + finalizedReads = new ArrayList(); + return toReturn; + } + + public boolean hasPendingItems() { + return reservoir.hasFinalizedItems(); + } + + public T peekFinalized() { + return finalizedReads.isEmpty() ? null : finalizedReads.get(0); + } + + public T peekPending() { + return reservoir.peekFinalized(); + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + + public void signalEndOfInput() { + finalizeReservoir(); + } + + public void clear() { + reservoir.clear(); + reservoir.reset(); + finalizedReads.clear(); + positionEstablished = false; + unmappedReadsReached = false; + } + + public void reset() { + numDiscardedItems = 0; + } + + public boolean requiresCoordinateSortOrder() { + return true; + } + + public void signalNoMoreReadsBefore( T read ) { + updatePositionalState(read); + } + + private void updatePositionalState( T newRead ) { + if ( readIsPastCurrentPosition(newRead) ) { + if ( reservoir.hasFinalizedItems() ) { + finalizeReservoir(); + } + + setCurrentPosition(newRead); + + if ( newRead.getReadUnmappedFlag() ) { + unmappedReadsReached = true; + } + } + } + + private void setCurrentPosition( T read ) { + currentContigIndex = read.getReferenceIndex(); + currentAlignmentStart = read.getAlignmentStart(); + positionEstablished = true; + } + + private boolean readIsPastCurrentPosition( T read ) { + return ! positionEstablished || + read.getReferenceIndex() > currentContigIndex || + read.getAlignmentStart() > currentAlignmentStart || + (read.getReadUnmappedFlag() && ! 
unmappedReadsReached); + } + + private void finalizeReservoir() { + finalizedReads.addAll(reservoir.consumeFinalizedItems()); + reservoir.reset(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java new file mode 100644 index 000000000..fcc18b16b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating SimplePositionalDownsamplers on demand + * + * @author David Roazen + */ +public class SimplePositionalDownsamplerFactory implements ReadsDownsamplerFactory { + + private int targetCoverage; + + public SimplePositionalDownsamplerFactory( int targetCoverage ) { + this.targetCoverage = targetCoverage; + } + + public ReadsDownsampler newInstance() { + return new SimplePositionalDownsampler(targetCoverage); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java new file mode 100755 index 000000000..c0de06b49 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java @@ -0,0 +1,52 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; + +import java.util.Iterator; + + +public class LegacyDownsampleIterator implements StingSAMIterator { + + StingSAMIterator it; + int cutoff; + SAMRecord next; + + public LegacyDownsampleIterator(StingSAMIterator it, double fraction) { + this.it = it; + cutoff = (int)(fraction * 10000); + next = getNextRecord(); + } + + public boolean hasNext() { + return next != null; + } + + public SAMRecord next() { + SAMRecord result = next; + next = getNextRecord(); + return result; + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + private SAMRecord getNextRecord() { + while ( true ) { + if ( !it.hasNext() ) + return null; + SAMRecord rec = it.next(); + if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoff ) + return rec; + } + } + + public void close() { + it.close(); + } + + public Iterator iterator() { + return 
this; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java new file mode 100755 index 000000000..557cbd009 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java @@ -0,0 +1,649 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.Downsampler; +import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + */ +public class LocusIteratorByStateExperimental extends LocusIterator { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + + // ----------------------------------------------------------------------------------------------------------------- + // + // member fields + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Used to create new GenomeLocs. 
+ */ + private final GenomeLocParser genomeLocParser; + private final ArrayList samples; + private final ReadStateManager readStates; + + protected static class SAMRecordState { + SAMRecord read; + int readOffset = -1; // how far are we offset from the start of the read bases? + int genomeOffset = -1; // how far are we offset from the alignment start on the genome? + + Cigar cigar = null; + int cigarOffset = -1; + CigarElement curElement = null; + int nCigarElements = 0; + + int cigarElementCounter = -1; // how far are we into a single cigarElement + + // The logical model for generating extended events is as follows: the "record state" implements the traversal + // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This + // can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the + // deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or + // if the deletion just started *right before* the current reference base the record state is pointing to upon the return from + // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended + // events immediately preceding the current reference base). + + public SAMRecordState(SAMRecord read) { + this.read = read; + cigar = read.getCigar(); + nCigarElements = cigar.numCigarElements(); + + //System.out.printf("Creating a SAMRecordState: %s%n", this); + } + + public SAMRecord getRead() { + return read; + } + + /** + * What is our current offset in the read's bases that aligns us with the reference genome? + * + * @return + */ + public int getReadOffset() { + return readOffset; + } + + /** + * What is the current offset w.r.t. the alignment state that aligns us to the readOffset? 
+ * + * @return + */ + public int getGenomeOffset() { + return genomeOffset; + } + + public int getGenomePosition() { + return read.getAlignmentStart() + getGenomeOffset(); + } + + public GenomeLoc getLocation(GenomeLocParser genomeLocParser) { + return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition()); + } + + public CigarOperator getCurrentCigarOperator() { + return curElement.getOperator(); + } + + public String toString() { + return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement); + } + + public CigarElement peekForwardOnGenome() { + return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement ); + } + + public CigarElement peekBackwardOnGenome() { + return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement ); + } + + + public CigarOperator stepForwardOnGenome() { + // we enter this method with readOffset = index of the last processed base on the read + // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion + + + if (curElement == null || ++cigarElementCounter > curElement.getLength()) { + cigarOffset++; + if (cigarOffset < nCigarElements) { + curElement = cigar.getCigarElement(cigarOffset); + cigarElementCounter = 0; + // next line: guards against cigar elements of length 0; when new cigar element is retrieved, + // we reenter in order to re-check cigarElementCounter against curElement's length + return stepForwardOnGenome(); + } else { + if (curElement != null && curElement.getOperator() == CigarOperator.D) + throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. 
If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); + + // Reads that contain indels model the genomeOffset as the following base in the reference. Because + // we fall into this else block only when indels end the read, increment genomeOffset such that the + // current offset of this read is the next ref base after the end of the indel. This position will + // model a point on the reference somewhere after the end of the read. + genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here: + // we do step forward on the ref, and by returning null we also indicate that we are past the read end. + + return null; + } + } + + boolean done = false; + switch (curElement.getOperator()) { + case H: // ignore hard clips + case P: // ignore pads + cigarElementCounter = curElement.getLength(); + break; + case I: // insertion w.r.t. the reference + case S: // soft clip + cigarElementCounter = curElement.getLength(); + readOffset += curElement.getLength(); + break; + case D: // deletion w.r.t. the reference + if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string + throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); + // should be the same as N case + genomeOffset++; + done = true; + break; + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + genomeOffset++; + done = true; + break; + case M: + case EQ: + case X: + readOffset++; + genomeOffset++; + done = true; + break; + default: + throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator()); + } + + return done ? 
curElement.getOperator() : stepForwardOnGenome(); + } + } + + //final boolean DEBUG = false; + //final boolean DEBUG2 = false && DEBUG; + private ReadProperties readInfo; + private AlignmentContext nextAlignmentContext; + private boolean performLevelingDownsampling; + + // ----------------------------------------------------------------------------------------------------------------- + // + // constructors and other basic operations + // + // ----------------------------------------------------------------------------------------------------------------- + + public LocusIteratorByStateExperimental(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { + this.readInfo = readInformation; + this.genomeLocParser = genomeLocParser; + this.samples = new ArrayList(samples); + this.readStates = new ReadStateManager(samIterator); + + this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null && + readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readInfo.getDownsamplingMethod().toCoverage != null; + + // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when + // there's no read data. So we need to throw this error only when samIterator.hasNext() is true + if (this.samples.isEmpty() && samIterator.hasNext()) { + throw new IllegalArgumentException("samples list must not be empty"); + } + } + + /** + * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list + * for the system. 
+ */ + public final static Collection sampleListForSAMWithoutReadGroups() { + List samples = new ArrayList(); + samples.add(null); + return samples; + } + + public Iterator iterator() { + return this; + } + + public void close() { + //this.it.close(); + } + + public boolean hasNext() { + lazyLoadNextAlignmentContext(); + return (nextAlignmentContext != null); + //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r); + } + + private GenomeLoc getLocation() { + return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // next() routine and associated collection operations + // + // ----------------------------------------------------------------------------------------------------------------- + public AlignmentContext next() { + lazyLoadNextAlignmentContext(); + if (!hasNext()) + throw new NoSuchElementException("LocusIteratorByState: out of elements."); + AlignmentContext currentAlignmentContext = nextAlignmentContext; + nextAlignmentContext = null; + return currentAlignmentContext; + } + + /** + * Creates the next alignment context from the given state. Note that this is implemented as a lazy load method. + * nextAlignmentContext MUST BE null in order for this method to advance to the next entry. + */ + private void lazyLoadNextAlignmentContext() { + while (nextAlignmentContext == null && readStates.hasNext()) { + readStates.collectPendingReads(); + + final GenomeLoc location = getLocation(); + final Map fullPileup = new HashMap(); + + // TODO: How can you determine here whether the current pileup has been downsampled? 
+ boolean hasBeenSampled = false; + + for (final String sample : samples) { + final Iterator iterator = readStates.iterator(sample); + final List pile = new ArrayList(readStates.size(sample)); + + int size = 0; // number of elements in this sample's pileup + int nDeletions = 0; // number of deletions in this sample's pileup + int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0) + + while (iterator.hasNext()) { + final SAMRecordState state = iterator.next(); // state object with the read/offset information + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element + final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element + final boolean isSingleElementCigar = nextElement == lastElement; + final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator + final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator + int readOffset = state.getReadOffset(); // the base offset on this read + + final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; + final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; + final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION; + final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar; + final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()); + + int nextElementLength = nextElement.getLength(); + + if (op == CigarOperator.N) // N's are never added to any pileup + continue; + + if (op == CigarOperator.D) { + // TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix + if 
(readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so + pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1)); + size++; + nDeletions++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; + } + } + else { + if (!filterBaseInRead(read, location.getStart())) { + String insertedBaseString = null; + if (nextOp == CigarOperator.I) { + final int insertionOffset = isSingleElementCigar ? 0 : 1; + // TODO -- someone please implement a better fix for the single element insertion CIGAR! + if (isSingleElementCigar) + readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases! + insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength())); + } + + pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength)); + size++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; + } + } + } + + if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup + fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads)); + } + + updateReadStates(); // critical - must be called after we get the current state offsets and location + if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done + nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); + } + } + + // fast testing of position + private boolean readIsPastCurrentPosition(SAMRecord read) { + if (readStates.isEmpty()) + return false; + else { + SAMRecordState state = readStates.getFirst(); + SAMRecord ourRead = state.getRead(); + 
+ return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); + } + } + + /** + * Generic place to put per-base filters appropriate to LocusIteratorByState + * + * @param rec + * @param pos + * @return + */ + private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) { + return ReadUtils.isBaseInsideAdaptor(rec, pos); + } + + private void updateReadStates() { + for (final String sample : samples) { + Iterator it = readStates.iterator(sample); + while (it.hasNext()) { + SAMRecordState state = it.next(); + CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + it.remove(); // we've stepped off the end of the object + } + } + } + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + protected class ReadStateManager { + private final PeekableIterator iterator; + private final SamplePartitioner samplePartitioner; + private final Map readStatesBySample = new HashMap(); + private int totalReadStates = 0; + + public ReadStateManager(Iterator source) { + this.iterator = new PeekableIterator(source); + + for (final String sample : samples) { + readStatesBySample.put(sample, new PerSampleReadStateManager()); + } + + samplePartitioner = new SamplePartitioner(); + } + + /** + * Returns an iterator over all the reads associated with the given sample. Note that remove() is implemented + * for this iterator; if present, total read states will be decremented. + * + * @param sample The sample. + * @return Iterator over the reads associated with that sample. 
+ */ + public Iterator iterator(final String sample) { + return new Iterator() { + private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); + + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public SAMRecordState next() { + return wrappedIterator.next(); + } + + public void remove() { + wrappedIterator.remove(); + } + }; + } + + public boolean isEmpty() { + return totalReadStates == 0; + } + + /** + * Retrieves the total number of reads in the manager across all samples. + * + * @return Total number of reads over all samples. + */ + public int size() { + return totalReadStates; + } + + /** + * Retrieves the total number of reads in the manager in the given sample. + * + * @param sample The sample. + * @return Total number of reads in the given sample. + */ + public int size(final String sample) { + return readStatesBySample.get(sample).size(); + } + + public SAMRecordState getFirst() { + for (final String sample : samples) { + PerSampleReadStateManager reads = readStatesBySample.get(sample); + if (!reads.isEmpty()) + return reads.peek(); + } + return null; + } + + public boolean hasNext() { + return totalReadStates > 0 || iterator.hasNext(); + } + + public void collectPendingReads() { + if (!iterator.hasNext()) + return; + + if (readStates.size() == 0) { + int firstContigIndex = iterator.peek().getReferenceIndex(); + int firstAlignmentStart = iterator.peek().getAlignmentStart(); + while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { + samplePartitioner.submitRead(iterator.next()); + } + } else { + // Fast fail in the case that the read is past the current position. 
+ if (readIsPastCurrentPosition(iterator.peek())) + return; + + while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { + samplePartitioner.submitRead(iterator.next()); + } + } + + for (final String sample : samples) { + Collection newReads = samplePartitioner.getReadsForSample(sample); + PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); + addReadsToSample(statesBySample, newReads); + } + + samplePartitioner.reset(); + } + + /** + * Add reads with the given sample name to the given hanger entry. + * + * @param readStates The list of read states to add this collection of reads. + * @param reads Reads to add. Selected reads will be pulled from this source. + */ + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { + if (reads.isEmpty()) + return; + + Collection newReadStates = new LinkedList(); + + for (SAMRecord read : reads) { + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); + } + + readStates.addStatesAtNextAlignmentStart(newReadStates); + } + + protected class PerSampleReadStateManager implements Iterable { + private List> readStatesByAlignmentStart = new LinkedList>(); + private int thisSampleReadStates = 0; + private Downsampler> levelingDownsampler = + performLevelingDownsampling ? 
+ new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : + null; + + public void addStatesAtNextAlignmentStart(Collection states) { + if ( states.isEmpty() ) { + return; + } + + readStatesByAlignmentStart.add(new LinkedList(states)); + thisSampleReadStates += states.size(); + totalReadStates += states.size(); + + if ( levelingDownsampler != null ) { + levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.signalEndOfInput(); + + thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + levelingDownsampler.reset(); + } + } + + public boolean isEmpty() { + return readStatesByAlignmentStart.isEmpty(); + } + + public SAMRecordState peek() { + return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); + } + + public int size() { + return thisSampleReadStates; + } + + public Iterator iterator() { + return new Iterator() { + private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates = null; + private Iterator currentPositionReadStatesIterator = null; + + public boolean hasNext() { + return alignmentStartIterator.hasNext() || + (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); + } + + public SAMRecordState next() { + if ( currentPositionReadStatesIterator == null || ! 
currentPositionReadStatesIterator.hasNext() ) { + currentPositionReadStates = alignmentStartIterator.next(); + currentPositionReadStatesIterator = currentPositionReadStates.iterator(); + } + + return currentPositionReadStatesIterator.next(); + } + + public void remove() { + currentPositionReadStatesIterator.remove(); + thisSampleReadStates--; + totalReadStates--; + + if ( currentPositionReadStates.isEmpty() ) { + alignmentStartIterator.remove(); + } + } + }; + } + } + } + + /** + * Note: stores reads by sample ID string, not by sample object + */ + private class SamplePartitioner { + private Map> readsBySample; + private long readsSeen = 0; + + public SamplePartitioner() { + readsBySample = new HashMap>(); + + for ( String sample : samples ) { + readsBySample.put(sample, new ArrayList()); + } + } + + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).add(read); + readsSeen++; + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public Collection getReadsForSample(String sampleName) { + if ( ! 
readsBySample.containsKey(sampleName) ) + throw new NoSuchElementException("Sample name not found"); + return readsBySample.get(sampleName); + } + + public void reset() { + for ( Collection perSampleReads : readsBySample.values() ) + perSampleReads.clear(); + readsSeen = 0; + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java new file mode 100644 index 000000000..28348ecc2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java @@ -0,0 +1,144 @@ +package org.broadinstitute.sting.gatk.iterators; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * Base class used to describe a read transformer like BAQ and BQSR + * + * Read transformers are pluggable infrastructure that modify read state + * either on input, on output, or within walkers themselves. + * + * The function apply() is called on each read seen by the GATK (after passing + * all ReadFilters) and it can do as it sees fit (without modifying the alignment) + * to the read to change qualities, add tags, etc. + * + * Initialize is called once right before the GATK traversal begins providing + * the ReadTransformer with the ability to collect and initialize data from the + * engine. + * + * Note that all ReadTransformers within the classpath are created and initialized. If one + * shouldn't be run it should look at the command line options of the engine and override + * the enabled() method. + * + * @since 8/31/12 + * @author depristo + */ +abstract public class ReadTransformer { + /** + * When should this read transform be applied? 
+ */ + private ApplicationTime applicationTime; + + /** + * Keep track of whether we've been initialized already, and ensure it's not called more than once. + */ + private boolean initialized = false; + + protected ReadTransformer() {} + + /** + * Master initialization routine. Called to setup a ReadTransform, using its overloaded initializeSub routine. + * + * @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself + * @param engine the engine, for initializing values + * @param walker the walker we intend to run + */ + @Requires({"initialized == false", "engine != null", "walker != null"}) + @Ensures("initialized == true") + public final void initialize(final ApplicationTime overrideTime, final GenomeAnalysisEngine engine, final Walker walker) { + if ( engine == null ) throw new IllegalArgumentException("engine cannot be null"); + if ( walker == null ) throw new IllegalArgumentException("walker cannot be null"); + + this.applicationTime = initializeSub(engine, walker); + if ( overrideTime != null ) this.applicationTime = overrideTime; + initialized = true; + } + + /** + * Subclasses must override this to initialize themselves + * + * @param engine the engine, for initializing values + * @param walker the walker we intend to run + * @return the point of time we'd like this read transform to be run + */ + @Requires({"engine != null", "walker != null"}) + @Ensures("result != null") + protected abstract ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker); + + /** + * Should this ReadTransformer be activated? Called after initialize, which allows this + * read transformer to look at its arguments and decide if it should be active. All + * ReadTransformers must override this, as by default they are not enabled. 
+ * + * @return true if this ReadTransformer should be used on the read stream + */ + public boolean enabled() { + return false; + } + + /** + * Has this transformer been initialized? + * + * @return true if it has + */ + public final boolean isInitialized() { + return initialized; + } + + /** + * When should we apply this read transformer? + * + * @return true if yes + */ + public final ApplicationTime getApplicationTime() { + return applicationTime; + } + + /** + * Primary interface function for a read transform to actually do some work + * + * The function apply() is called on each read seen by the GATK (after passing + * all ReadFilters) and it can do as it sees fit (without modifying the alignment) + * to the read to change qualities, add tags, etc. + * + * @param read the read to transform + * @return the transformed read + */ + @Requires("read != null") + @Ensures("result != null") + abstract public GATKSAMRecord apply(final GATKSAMRecord read); + + @Override + public String toString() { + return getClass().getSimpleName(); + } + + /** + * When should a read transformer be applied? 
+ */ + public static enum ApplicationTime { + /** + * Walker does not tolerate this read transformer + */ + FORBIDDEN, + + /** + * apply the transformation to the incoming reads, the default + */ + ON_INPUT, + + /** + * apply the transformation to the outgoing read stream + */ + ON_OUTPUT, + + /** + * the walker will deal with the calculation itself + */ + HANDLED_IN_WALKER + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java new file mode 100644 index 000000000..be227619f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.gatk.iterators; + +import java.lang.annotation.*; + +/** + * User: hanna + * Date: May 14, 2009 + * Time: 1:51:22 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Allows the walker to indicate what type of data it wants to consume. 
+ */ + +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface ReadTransformersMode { + public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java new file mode 100644 index 000000000..314baad3d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.gatk.samples; + +/** + * A class for imposing a trio structure on three samples; a common paradigm + * + * todo -- there should probably be an interface or abstract class "Pedigree" that generalizes the notion of + * -- imposing structure on samples. But given how complex pedigrees can quickly become, it's not + * -- clear the best way to do this. + */ +public class Trio { + private Sample mother; + private Sample father; + private Sample child; + + public Trio(Sample mom, Sample dad, Sample spawn) { + assert mom.getID().equals(spawn.getMaternalID()) && dad.getID().equals(spawn.getPaternalID()) : "Samples passed to trio constructor do not form a trio"; + mother = mom; + father = dad; + child = spawn; + } + + public Sample getMother() { + return mother; + } + + public String getMaternalID() { + return mother.getID(); + } + + public Sample getFather() { + return father; + } + + public String getPaternalID() { + return father.getID(); + } + + public Sample getChild() { + return child; + } + + public String getChildID() { + return child.getID(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java new file mode 100755 index 000000000..efa2eca02 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java 
@@ -0,0 +1,103 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; + +/** + * A simple solution to iterating over all reference positions over a series of genomic locations. + */ +public abstract class TraverseLociBase extends TraversalEngine,LocusShardDataProvider> { + /** + * our log, which we want to capture anything from this class + */ + protected static final Logger logger = Logger.getLogger(TraversalEngine.class); + + @Override + protected final String getTraversalType() { + return "sites"; + } + + protected static class TraverseResults { + final int numIterations; + final T reduceResult; + + public TraverseResults(int numIterations, T reduceResult) { + this.numIterations = numIterations; + this.reduceResult = reduceResult; + } + } + + protected abstract TraverseResults traverse( final LocusWalker walker, + final LocusView locusView, + final LocusReferenceView referenceView, + final ReferenceOrderedView referenceOrderedDataView, + final T sum); + + @Override + public T traverse( LocusWalker walker, + LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = getLocusView( walker, dataProvider ); + + if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all + //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView 
= null; + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); + else + referenceOrderedDataView = (RodLocusView)locusView; + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + + final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); + sum = result.reduceResult; + dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); + updateCumulativeMetrics(dataProvider.getShard()); + } + + // We have a final map call to execute here to clean up the skipped bases from the + // last position in the ROD to that in the interval + if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { + // only do this if the walker isn't done! + final RodLocusView rodLocusView = (RodLocusView)locusView; + final long nSkipped = rodLocusView.getLastSkippedBases(); + if ( nSkipped > 0 ) { + final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); + final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); + final M x = walker.map(null, null, ac); + sum = walker.reduce(x, sum); + } + } + + return sum; + } + + /** + * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' + * of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype + * that comes along. + * @param walker walker to interrogate. + * @param dataProvider Data with which to drive the locus view. + * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. 
+ */ + private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + if( dataSource == DataSource.READS ) + return new CoveredLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) + return new AllLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) + return new RodLocusView(dataProvider); + else + throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java new file mode 100755 index 000000000..22381092f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java @@ -0,0 +1,47 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.LocusView; +import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.GenomeLoc; + +/** + * A simple solution to iterating over all reference positions over a series of genomic locations. 
+ */ +public class TraverseLociLinear extends TraverseLociBase { + + @Override + protected TraverseResults traverse(LocusWalker walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) { + // We keep processing while the next reference location is within the interval + boolean done = false; + int numIterations = 0; + + while( locusView.hasNext() && ! done ) { + numIterations++; + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + final boolean keepMeP = walker.filter(tracker, refContext, locus); + if (keepMeP) { + final M x = walker.map(tracker, refContext, locus); + sum = walker.reduce(x, sum); + done = walker.isDone(); + } + + printProgress(locus.getLocation()); + } + + return new TraverseResults(numIterations, sum); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java new file mode 100755 index 000000000..e4e2254d0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -0,0 +1,205 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; +import 
org.broadinstitute.sting.gatk.datasources.providers.LocusView; +import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; + +import java.util.Iterator; + +/** + * A simple solution to iterating over all reference positions over a series of genomic locations. + */ +public class TraverseLociNano extends TraverseLociBase { + /** our log, which we want to capture anything from this class */ + private static final boolean DEBUG = false; + private static final int BUFFER_SIZE = 1000; + + final NanoScheduler nanoScheduler; + + public TraverseLociNano(int nThreads) { + nanoScheduler = new NanoScheduler(BUFFER_SIZE, nThreads); + nanoScheduler.setProgressFunction(new TraverseLociProgress()); + } + + @Override + protected TraverseResults traverse(final LocusWalker walker, + final LocusView locusView, + final LocusReferenceView referenceView, + final ReferenceOrderedView referenceOrderedDataView, + final T sum) { + nanoScheduler.setDebug(DEBUG); + final TraverseLociMap myMap = new TraverseLociMap(walker); + final TraverseLociReduce myReduce = new TraverseLociReduce(walker); + + final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); + final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); + + return new TraverseResults(inputIterator.numIterations, result); + } + + /** + * Create iterator that provides inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + */ + private class MapDataIterator 
implements Iterator { + final LocusView locusView; + final LocusReferenceView referenceView; + final ReferenceOrderedView referenceOrderedDataView; + int numIterations = 0; + + private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) { + this.locusView = locusView; + this.referenceView = referenceView; + this.referenceOrderedDataView = referenceOrderedDataView; + } + + @Override + public boolean hasNext() { + return locusView.hasNext(); + } + + @Override + public MapData next() { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + //logger.info("Pulling data from MapDataIterator at " + location); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location, refContext); + + numIterations++; + return new MapData(locus, refContext, tracker); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator"); + } + } + + @Override + public void printOnTraversalDone() { + nanoScheduler.shutdown(); + super.printOnTraversalDone(); + } + + /** + * The input data needed for each map call. 
The read, the reference, and the RODs + */ + private class MapData { + final AlignmentContext alignmentContext; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.alignmentContext = alignmentContext; + this.refContext = refContext; + this.tracker = tracker; + } + + @Override + public String toString() { + return "MapData " + alignmentContext.getLocation(); + } + } + + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseLociMap implements NSMapFunction { + final LocusWalker walker; + + private TraverseLociMap(LocusWalker walker) { + this.walker = walker; + } + + @Override + public MapResult apply(final MapData data) { + if ( ! 
walker.isDone() ) { + final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext); + if (keepMeP) { + final M x = walker.map(data.tracker, data.refContext, data.alignmentContext); + return new MapResult(x); + } + } + return SKIP_REDUCE; + } + } + + /** + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseLociReduce implements NSReduceFunction { + final LocusWalker walker; + + private TraverseLociReduce(LocusWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; + } + } + + private class TraverseLociProgress implements NSProgressFunction { + @Override + public void progress(MapData lastProcessedMap) { + if (lastProcessedMap.alignmentContext != null) + printProgress(lastProcessedMap.alignmentContext.getLocation()); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java new file mode 100755 index 000000000..b3a0a1390 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above 
copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.traversals; + +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ReadView; +import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.LinkedList; +import java.util.List; + +/** + * A nano-scheduling version of TraverseReads. + * + * Implements the traversal of a walker that accepts individual reads, the reference, and + * RODs per map call. 
Directly supports shared memory parallelism via NanoScheduler + * + * @author depristo + * @version 1.0 + * @date 9/2/2012 + */ +public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { + /** our log, which we want to capture anything from this class */ + protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); + private static final boolean DEBUG = false; + final NanoScheduler nanoScheduler; + + public TraverseReadsNano(int nThreads) { + final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max + nanoScheduler = new NanoScheduler(bufferSize, nThreads); + } + + @Override + protected String getTraversalType() { + return "reads"; + } + + /** + * Traverse by reads, given the data and the walker + * + * @param walker the walker to traverse with + * @param dataProvider the provider of the reads data + * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function + * @return the reduce variable of the read walker + */ + public T traverse(ReadWalker walker, + ReadShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseReadsNano.traverse Covered dataset is %s", dataProvider)); + + if( !dataProvider.hasReads() ) + throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); + + nanoScheduler.setDebug(DEBUG); + final TraverseReadsMap myMap = new TraverseReadsMap(walker); + final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); + + final List aggregatedInputs = aggregateMapData(dataProvider); + final T result = nanoScheduler.execute(aggregatedInputs.iterator(), myMap, sum, myReduce); + + final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; + final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); + + updateCumulativeMetrics(dataProvider.getShard()); + printProgress(locus); + + return result; + } + + /** + * Aggregate all of the inputs 
for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + * + * @param dataProvider the source of our data + * @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce + * should execute + */ + private List aggregateMapData(final ReadShardDataProvider dataProvider) { + final ReadView reads = new ReadView(dataProvider); + final ReadReferenceView reference = new ReadReferenceView(dataProvider); + final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); + + final List mapData = new LinkedList(); + for ( final SAMRecord read : reads ) { + final ReferenceContext refContext = ! read.getReadUnmappedFlag() + ? reference.getReferenceContext(read) + : null; + + // if the read is mapped, create a metadata tracker + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 + ? rodView.getReferenceOrderedDataForRead(read) + : null; + + // update the number of reads we've seen + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + mapData.add(new MapData((GATKSAMRecord)read, refContext, tracker)); + } + + return mapData; + } + + @Override + public void printOnTraversalDone() { + nanoScheduler.shutdown(); + super.printOnTraversalDone(); + } + + /** + * The input data needed for each map call. 
The read, the reference, and the RODs + */ + private class MapData { + final GATKSAMRecord read; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.read = read; + this.refContext = refContext; + this.tracker = tracker; + } + } + + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseReadsMap implements NSMapFunction { + final ReadWalker walker; + + private TraverseReadsMap(ReadWalker walker) { + this.walker = walker; + } + + @Override + public MapResult apply(final MapData data) { + if ( ! 
walker.isDone() ) { + final boolean keepMeP = walker.filter(data.refContext, data.read); + if (keepMeP) + return new MapResult(walker.map(data.refContext, data.read, data.tracker)); + } + + return SKIP_REDUCE; + } + } + + /** + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseReadsReduce implements NSReduceFunction { + final ReadWalker walker; + + private TraverseReadsReduce(ReadWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java new file mode 100755 index 000000000..731ce7e4e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010. The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers; + +/** + * Root parallelism interface. Walkers that implement this + * declare that their map function is thread-safe and so multiple + * map calls can be run in parallel in the same JVM instance. + */ +public interface NanoSchedulable { +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java new file mode 100755 index 000000000..2b9744b89 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.fasta; + +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.Collections; +import java.util.List; + + +/** + * Generates an alternative reference sequence over the specified interval. + * + *

+ * Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). + * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + * Several important notes: + * 1) if there are multiple variants that start at a site, it chooses one of them randomly. + * 2) when there are overlapping indels (but with different start positions) only the first will be chosen. + * 3) this tool works only for SNPs and for simple indels (but not for things like complex substitutions). + * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). + * + *

Input

+ *

+ * The reference, requested intervals, and any number of variant rod files. + *

+ * + *

Output

+ *

+ * A fasta file representing the requested intervals. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T FastaAlternateReferenceMaker \
+ *   -o output.fasta \
+ *   -L input.intervals \
+ *   --variant input.vcf \
+ *   [--snpmask mask.vcf]
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = "Companion Utilities", extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=-1,stop=50)) +@Requires(value={DataSource.REFERENCE}) +public class FastaAlternateReferenceMaker extends FastaReferenceMaker { + + /** + * Variants from these input files are used by this tool to construct an alternate reference. + */ + @Input(fullName = "variant", shortName = "V", doc="variants to model", required=false) + public List> variants = Collections.emptyList(); + + /** + * Snps from this file are used as a mask when constructing the alternate reference. + */ + @Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false) + public RodBinding snpmask; + + private int deletionBasesRemaining = 0; + + public Pair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + if (deletionBasesRemaining > 0) { + deletionBasesRemaining--; + return new Pair(context.getLocation(), ""); + } + + String refBase = String.valueOf((char)ref.getBase()); + + // Check to see if we have a called snp + for ( VariantContext vc : tracker.getValues(variants, ref.getLocus()) ) { + if ( vc.isFiltered() ) + continue; + + if ( vc.isSimpleDeletion()) { + deletionBasesRemaining = vc.getReference().length() - 1; + // delete the next n bases, not this one + return new Pair(context.getLocation(), refBase); + } else if ( vc.isSimpleInsertion()) { + return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); + } else if (vc.isSNP()) { + return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); + } + } + + // if we don't have a called site, and we have a mask at this site, mask it + for ( VariantContext vc : tracker.getValues(snpmask) ) { + if ( vc.isSNP()) { + return new Pair(context.getLocation(), "N"); + } + } + + + // if we got here then we're just ref + return new Pair(context.getLocation(), refBase); + } +} \ No newline at end of file diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java new file mode 100755 index 000000000..362867318 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.fasta; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; + +import java.io.PrintStream; + +/** + * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. + * + *

+ * The output format can be partially controlled using the provided command-line arguments. + * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a + * separate fasta sequence (named numerically in order). + * + *

Input

+ *

+ * The reference and requested intervals. + *

+ * + *

Output

+ *

+ * A fasta file representing the requested intervals. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T FastaReferenceMaker \
+ *   -o output.fasta \
+ *   -L input.intervals
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = "Companion Utilities", extraDocs = {CommandLineGATK.class} ) +public class FastaReferenceMaker extends RefWalker, GenomeLoc> { + + @Output PrintStream out; + + @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) + public int fastaLineWidth=60; + + /** + * Please note that when using this argument adjacent intervals will automatically be merged. + */ + @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity)", required=false) + public boolean fastaRawSeqs=false; + + protected FastaSequence fasta; + + public void initialize() { + if (fastaRawSeqs) fastaLineWidth = Integer.MAX_VALUE; + fasta = new FastaSequence(out, fastaLineWidth, fastaRawSeqs); + } + + public Pair map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) { + return new Pair(context.getLocation(), String.valueOf((char)ref.getBase())); + } + + public GenomeLoc reduceInit() { + return null; + } + + public GenomeLoc reduce(Pair value, GenomeLoc sum) { + if ( value == null ) + return sum; + + // if there is no interval to the left, then this is the first one + if ( sum == null ) { + sum = value.first; + fasta.append(value.second); + } + // if the intervals don't overlap, print out the leftmost one and start a new one + // (end of contig or new interval) + else if ( value.first.getStart() != sum.getStop() + 1 ) { + fasta.flush(); + sum = value.first; + fasta.append(value.second); + } + // otherwise, merge them + else { + sum = getToolkit().getGenomeLocParser().setStop(sum, value.first.getStop()); + fasta.append(value.second); + } + return sum; + } + + public void onTraversalDone(GenomeLoc sum) { + fasta.flush(); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java 
b/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java new file mode 100644 index 000000000..4589ffb71 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java @@ -0,0 +1,49 @@ +package org.broadinstitute.sting.utils.baq; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.BAQMode; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * Applies Heng's BAQ calculation to a stream of incoming reads + */ +public class BAQReadTransformer extends ReadTransformer { + private BAQ baqHMM; + private IndexedFastaSequenceFile refReader; + private BAQ.CalculationMode cmode; + private BAQ.QualityMode qmode; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + final BAQMode mode = WalkerManager.getWalkerAnnotation(walker, BAQMode.class); + this.refReader = engine.getReferenceDataSource().getReference(); + this.cmode = engine.getArguments().BAQMode; + this.qmode = mode.QualityMode(); + baqHMM = new BAQ(engine.getArguments().BAQGOP); + + if ( qmode == BAQ.QualityMode.DONT_MODIFY ) + throw new ReviewedStingException("BUG: shouldn't create BAQ transformer with quality mode DONT_MODIFY"); + + if ( mode.ApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN && enabled() ) + throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + cmode + " was requested."); + + return mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return cmode != 
BAQ.CalculationMode.OFF; + } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { + baqHMM.baqRead(read, refReader, cmode, qmode); + return read; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java new file mode 100644 index 000000000..18ab9e01a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java @@ -0,0 +1,44 @@ +package org.broadinstitute.sting.utils.baq; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Iterator; + +/** + * Iterator that applies a ReadTransformer to a stream of reads + */ +public class ReadTransformingIterator implements StingSAMIterator { + private final StingSAMIterator it; + private final ReadTransformer transformer; + + /** + * Creates a new ReadTransforming iterator + */ + @Requires({"it != null", "transformer != null", "transformer.isInitialized()"}) + public ReadTransformingIterator(final StingSAMIterator it, final ReadTransformer transformer) { + if ( ! 
transformer.isInitialized() ) + throw new IllegalStateException("Creating a read transformer stream for an uninitialized read transformer: " + transformer); + if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN ) + throw new IllegalStateException("Creating a read transformer stream for a forbidden transformer " + transformer); + + this.it = it; + this.transformer = transformer; + } + + @Requires("hasNext()") + @Ensures("result != null") + public SAMRecord next() { + final GATKSAMRecord read = (GATKSAMRecord)it.next(); + return transformer.apply(read); + } + + public boolean hasNext() { return this.it.hasNext(); } + public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } + public void close() { it.close(); } + public Iterator iterator() { return this; } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java new file mode 100644 index 000000000..2daa6c9eb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java @@ -0,0 +1,82 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import com.google.java.contract.Invariant; + +/** + * Wrapper to hold data for a blocking queue, distinguishing an EOF marker from a real object + * + * The only way to tell in a consumer thread that a blocking queue has no more data ever + * coming down the pipe is to pass in a "poison" or EOF object. This class provides + * a generic capacity for that... 
+ * + * The use case looks like this: + * + * BlockingQueue q + * producer: + * while ( x has items ) + * q.put(new BlockingQueueValue(x)) + * q.put(new BlockingQueueValue()) + * + * Consumer: + * while ( true ) + * value = q.take() + * if ( value.isLast() ) + * break + * else + * do something useful with value + * + * + * User: depristo + * Date: 9/6/12 + * Time: 3:08 PM + */ +@Invariant("! isLast || value == null") +class BlockingQueueValue { + /** + * True if this is the EOF marker object + */ + final private boolean isLast; + + /** + * Our value, if we aren't the EOF marker + */ + final private T value; + + /** + * Create a new BlockingQueueValue containing a real value, where last is false + * @param value + */ + BlockingQueueValue(final T value) { + isLast = false; + this.value = value; + } + + /** + * Create a new BlockingQueueValue that is the last item + */ + BlockingQueueValue() { + isLast = true; + this.value = null; + } + + /** + * Is this the EOF marker? + * + * @return true if so, else false + */ + public boolean isLast() { + return isLast; + } + + /** + * Get the value held by this BlockingQueueValue + * + * @return the value + * @throws IllegalStateException if this is the last item + */ + public T getValue() { + if ( isLast() ) + throw new IllegalStateException("Cannot get value for last object"); + return value; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java new file mode 100644 index 000000000..9508a15aa --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +/** + * Create a future that simply returns a given value + * + * The only 
standard way to create a future in java is via the ExecutorService interface. + * If you have a data structure holding futures of value T, and you want to add a + * value to it for some reason (to add a EOF marker, for instance) you can use this + * class to create a dummy Future that simply returns a value. + * + * @author depristo + * @since 09/12 + */ +class FutureValue implements Future { + final V value; + + FutureValue(final V value) { + this.value = value; + } + + @Override public boolean cancel(boolean mayInterruptIfRunning) { + return true; + } + + @Override public boolean isCancelled() { + return false; + } + + @Override public boolean isDone() { + return true; + } + + @Override public V get() throws InterruptedException, ExecutionException { + return value; + } + + @Override public V get(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException { + return get(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java new file mode 100644 index 000000000..29dddbc49 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Iterator; +import java.util.concurrent.BlockingQueue; + +/** + * Producer Thread that reads input values from an inputReads and puts them into a BlockingQueue + */ +class InputProducer implements Runnable { + /** + * The iterator we are using to get data from + */ + final Iterator inputReader; + + /** + * Our timer (may be null) that we use to track our input costs + */ + final SimpleTimer inputTimer; + + /** + * Where we put our input values for consumption + */ + final BlockingQueue outputQueue; + + public 
InputProducer(final Iterator inputReader, + final SimpleTimer inputTimer, + final BlockingQueue outputQueue) { + if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); + if ( outputQueue == null ) throw new IllegalArgumentException("OutputQueue cannot be null"); + + this.inputReader = inputReader; + this.inputTimer = inputTimer; + this.outputQueue = outputQueue; + } + + public void run() { + try { + while ( inputReader.hasNext() ) { + if ( inputTimer != null ) inputTimer.restart(); + final InputType input = inputReader.next(); + if ( inputTimer != null ) inputTimer.stop(); + outputQueue.put(new InputValue(input)); + } + + // add the EOF object so our consumer knows we are done in all inputs + outputQueue.put(new InputValue()); + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } + + /** + * Helper class that contains a read value suitable for EOF marking in a BlockingQueue + */ + class InputValue extends BlockingQueueValue { + private InputValue(InputType datum) { super(datum); } + private InputValue() { } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java new file mode 100644 index 000000000..3cc6fa786 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java @@ -0,0 +1,36 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Holds the results of a map job suitable for producer/consumer threading + * via a BlockingQueue + */ +class MapResult extends BlockingQueueValue { + final int jobID; + + /** + * Create a new MapResult with value datum and jod jobID ID + * + * @param datum the value produced by the map job + * @param jobID the id of the map job (for correctness testing) + */ + MapResult(final MapType datum, final int jobID) { + super(datum); + this.jobID = jobID; + if ( jobID < 0 ) throw new 
IllegalArgumentException("JobID must be >= 0"); + } + + /** + * Create the EOF marker version of MapResult + */ + MapResult() { + super(); + this.jobID = Integer.MAX_VALUE; + } + + /** + * @return the job ID of the map job that produced this MapResult + */ + public int getJobID() { + return jobID; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java new file mode 100644 index 000000000..cc5335051 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java @@ -0,0 +1,19 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * A function that maps from InputType -> ResultType + * + * For use with the NanoScheduler + * + * User: depristo + * Date: 8/24/12 + * Time: 9:49 AM + */ +public interface NSMapFunction { + /** + * Return function on input, returning a value of ResultType + * @param input + * @return + */ + public ResultType apply(final InputType input); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java new file mode 100644 index 000000000..8b12c62c4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java @@ -0,0 +1,12 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 9/4/12 + * Time: 2:10 PM + * To change this template use File | Settings | File Templates. 
+ */ +public interface NSProgressFunction { + public void progress(final InputType lastMapInput); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java new file mode 100644 index 000000000..879a33a1d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java @@ -0,0 +1,18 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * A function that combines a value of MapType with an existing ReduceValue into a new ResultType + * + * User: depristo + * Date: 8/24/12 + * Time: 9:49 AM + */ +public interface NSReduceFunction { + /** + * Combine one with sum into a new ReduceType + * @param one the result of a map call on an input element + * @param sum the cumulative reduce result over all previous map calls + * @return + */ + public ReduceType apply(MapType one, ReduceType sum); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java new file mode 100644 index 000000000..664fb7b9b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -0,0 +1,392 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.threading.NamedThreadFactory; + +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.*; + +/** + * Framework for very fine grained MapReduce parallelism + * + * The overall framework works like this + * + * nano <- new Nanoschedule(inputBufferSize, 
numberOfMapElementsToProcessTogether, nThreads) + * for ( List[Input] outerData : outerDataLoop ) + * result = nano.execute(outerData.iterator(), map, reduce) + * + * inputBufferSize determines how many elements from the input stream are read in one go by the + * nanoscheduler. The scheduler may hold up to inputBufferSize in memory at one time, as well + * as up to inputBufferSize map results as well. + * + * numberOfMapElementsToProcessTogether determines how many input elements are processed + * together each thread cycle. For example, if this value is 10, then the input data + * is grouped together in units of 10 elements each, and map is called on each in turn. The more + * heavy-weight the map function is, in terms of CPU costs, the more it makes sense to + * have this number be small. The lighter the CPU cost per element, though, the more this + * parameter introduces overhead due to the need to context switch among threads to process + * each input element. A value of -1 lets the nanoscheduler guess at a reasonable trade-off value. + * + * nThreads is a bit obvious yes? Note though that the nanoscheduler assumes that it gets 1 thread + * from its client during the execute call, as this call blocks until all work is done. The caller + * thread is put to work by execute to help with the processing of the data. So in reality the + * nanoScheduler only spawns nThreads - 1 additional workers (if this is > 1).
+ * + * User: depristo + * Date: 8/24/12 + * Time: 9:47 AM + */ +public class NanoScheduler { + private final static Logger logger = Logger.getLogger(NanoScheduler.class); + private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; + private final static boolean LOG_MAP_TIMES = false; + private final static boolean TIME_CALLS = true; + + private final static int MAP_BUFFER_SIZE_SCALE_FACTOR = 100; + + final int inputBufferSize; + final int mapBufferSize; + final int nThreads; + final ExecutorService inputExecutor; + final ExecutorService reduceExecutor; + final ThreadPoolExecutor mapExecutor; + + boolean shutdown = false; + boolean debug = false; + private NSProgressFunction progressFunction = null; + + final SimpleTimer outsideSchedulerTimer = TIME_CALLS ? new SimpleTimer("outside") : null; + final SimpleTimer inputTimer = TIME_CALLS ? new SimpleTimer("input") : null; + final SimpleTimer mapTimer = TIME_CALLS ? new SimpleTimer("map") : null; + final SimpleTimer reduceTimer = TIME_CALLS ? new SimpleTimer("reduce") : null; + + /** + * Create a new nanoscheduler with the desire characteristics requested by the argument + * + * @param inputBufferSize the number of input elements to read in each scheduling cycle. 
+ * @param nThreads the number of threads to use to get work done, in addition to the + * thread calling execute + */ + public NanoScheduler(final int inputBufferSize, final int nThreads) { + if ( inputBufferSize < 1 ) throw new IllegalArgumentException("inputBufferSize must be >= 1, got " + inputBufferSize); + if ( nThreads < 1 ) throw new IllegalArgumentException("nThreads must be >= 1, got " + nThreads); + + this.inputBufferSize = inputBufferSize; + this.mapBufferSize = inputBufferSize * MAP_BUFFER_SIZE_SCALE_FACTOR; + this.nThreads = nThreads; + + if ( nThreads == 1 ) { + this.mapExecutor = null; + this.inputExecutor = this.reduceExecutor = null; + } else { + this.mapExecutor = (ThreadPoolExecutor)Executors.newFixedThreadPool(nThreads-1, new NamedThreadFactory("NS-map-thread-%d")); + this.mapExecutor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); + this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); + this.reduceExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-reduce-thread-%d")); + } + + // start timing the time spent outside of the nanoScheduler + outsideSchedulerTimer.start(); + } + + /** + * The number of parallel map threads in use with this NanoScheduler + * @return + */ + @Ensures("result > 0") + public int getnThreads() { + return nThreads; + } + + /** + * The input buffer size used by this NanoScheduler + * @return + */ + @Ensures("result > 0") + public int getInputBufferSize() { + return inputBufferSize; + } + + /** + * Tells this nanoScheduler to shutdown immediately, releasing all its resources. 
+ * + * After this call, execute cannot be invoked without throwing an error + */ + public void shutdown() { + outsideSchedulerTimer.stop(); + + if ( nThreads > 1 ) { + shutdownExecutor("inputExecutor", inputExecutor); + shutdownExecutor("mapExecutor", mapExecutor); + shutdownExecutor("reduceExecutor", reduceExecutor); + } + shutdown = true; + + if (TIME_CALLS) { + printTimerInfo("Input time", inputTimer); + printTimerInfo("Map time", mapTimer); + printTimerInfo("Reduce time", reduceTimer); + printTimerInfo("Outside time", outsideSchedulerTimer); + } + } + + /** + * Helper function to cleanly shutdown an execution service, checking that the execution + * state is clean when it's done. + * + * @param name a string name for error messages for the executorService we are shutting down + * @param executorService the executorService to shut down + */ + @Requires({"name != null", "executorService != null"}) + @Ensures("executorService.isShutdown()") + private void shutdownExecutor(final String name, final ExecutorService executorService) { + if ( executorService.isShutdown() || executorService.isTerminated() ) + throw new IllegalStateException("Executor service " + name + " is already shut down!"); + + final List remaining = executorService.shutdownNow(); + if ( ! remaining.isEmpty() ) + throw new IllegalStateException(remaining.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!"); + } + + /** + * Print to logger.info timing information from timer, with name label + * + * @param label the name of the timer to display. 
Should be human readable + * @param timer the timer whose elapsed time we will display + */ + @Requires({"label != null", "timer != null"}) + private void printTimerInfo(final String label, final SimpleTimer timer) { + final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() + + reduceTimer.getElapsedTime() + outsideSchedulerTimer.getElapsedTime(); + final double myTimeInSec = timer.getElapsedTime(); + final double myTimePercent = myTimeInSec / total * 100; + logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); + } + + /** + * @return true if this nanoScheduler is shutdown, or false if its still open for business + */ + public boolean isShutdown() { + return shutdown; + } + + /** + * @return are we displaying verbose debugging information about the scheduling? + */ + public boolean isDebug() { + return debug; + } + + /** + * Helper function to display a String.formatted message if we are doing verbose debugging + * + * @param format the format argument suitable for String.format + * @param args the arguments for String.format + */ + @Requires("format != null") + private void debugPrint(final String format, Object ... 
args) { + if ( isDebug() ) + logger.info("Thread " + Thread.currentThread().getId() + ":" + String.format(format, args)); + } + + /** + * Turn on/off verbose debugging + * + * @param debug true if we want verbose debugging + */ + public void setDebug(boolean debug) { + this.debug = debug; + } + + /** + * Set the progress callback function to progressFunction + * + * The progress callback is invoked after each buffer size elements have been processed by map/reduce + * + * @param progressFunction a progress function to call, or null if you don't want any progress callback + */ + public void setProgressFunction(final NSProgressFunction progressFunction) { + this.progressFunction = progressFunction; + } + + /** + * Execute a map/reduce job with this nanoScheduler + * + * Data comes from inputReader. Will be read until hasNext() == false. + * map is called on each element provided by inputReader. No order of operations is guaranteed + * reduce is called in order of the input data provided by inputReader on the result of map() applied + * to each element. + * + * Note that the caller thread is put to work with this function call. The call doesn't return + * until all elements have been processed. + * + * It is safe to call this function repeatedly on a single nanoScheduler, at least until the + * shutdown method is called. + * + * Note that this function goes through a single threaded fast path if the number of threads + * is 1.
+ * + * @param inputReader an iterator providing us with the input data to nanoSchedule map/reduce over + * @param map the map function from input type -> map type, will be applied in parallel to each input + * @param reduce the reduce function from map type + reduce type -> reduce type to be applied in order to map results + * @return the last reduce value + */ + public ReduceType execute(final Iterator inputReader, + final NSMapFunction map, + final ReduceType initialValue, + final NSReduceFunction reduce) { + if ( isShutdown() ) throw new IllegalStateException("execute called on already shutdown NanoScheduler"); + if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); + if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); + if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); + + outsideSchedulerTimer.stop(); + + ReduceType result; + if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { + result = executeSingleThreaded(inputReader, map, initialValue, reduce); + } else { + result = executeMultiThreaded(inputReader, map, initialValue, reduce); + } + + outsideSchedulerTimer.restart(); + return result; + } + + /** + * Simple efficient reference implementation for single threaded execution. 
+ * + * @return the reduce result of this map/reduce job + */ + @Requires({"inputReader != null", "map != null", "reduce != null"}) + private ReduceType executeSingleThreaded(final Iterator inputReader, + final NSMapFunction map, + final ReduceType initialValue, + final NSReduceFunction reduce) { + ReduceType sum = initialValue; + int i = 0; + + // start timer to ensure that both hasNext and next are caught by the timer + if ( TIME_CALLS ) inputTimer.restart(); + while ( inputReader.hasNext() ) { + final InputType input = inputReader.next(); + if ( TIME_CALLS ) inputTimer.stop(); + + // map + if ( TIME_CALLS ) mapTimer.restart(); + final long preMapTime = LOG_MAP_TIMES ? 0 : mapTimer.currentTimeNano(); + final MapType mapValue = map.apply(input); + if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (mapTimer.currentTimeNano() - preMapTime)); + if ( TIME_CALLS ) mapTimer.stop(); + + if ( i++ % inputBufferSize == 0 && progressFunction != null ) + progressFunction.progress(input); + + // reduce + if ( TIME_CALLS ) reduceTimer.restart(); + sum = reduce.apply(mapValue, sum); + if ( TIME_CALLS ) reduceTimer.stop(); + + if ( TIME_CALLS ) inputTimer.restart(); + } + + return sum; + } + + /** + * Efficient parallel version of Map/Reduce + * + * @return the reduce result of this map/reduce job + */ + @Requires({"inputReader != null", "map != null", "reduce != null"}) + private ReduceType executeMultiThreaded(final Iterator inputReader, + final NSMapFunction map, + final ReduceType initialValue, + final NSReduceFunction reduce) { + debugPrint("Executing nanoScheduler"); + + // a blocking queue that limits the number of input datum to the requested buffer size + final BlockingQueue.InputValue> inputQueue + = new LinkedBlockingDeque.InputValue>(inputBufferSize); + + // a priority queue that stores up to mapBufferSize elements + // produced by completed map jobs. 
+ final BlockingQueue>> mapResultQueue = + new LinkedBlockingDeque>>(mapBufferSize); + + // Start running the input reader thread + inputExecutor.submit(new InputProducer(inputReader, inputTimer, inputQueue)); + + // Start running the reducer thread + final ReducerThread reducer + = new ReducerThread(reduce, reduceTimer, initialValue, mapResultQueue); + final Future reduceResult = reduceExecutor.submit(reducer); + + try { + int numJobs = 0; + + while ( true ) { + // block on input + final InputProducer.InputValue inputEnqueueWrapped = inputQueue.take(); + + if ( ! inputEnqueueWrapped.isLast() ) { + // get the object itself + final InputType input = inputEnqueueWrapped.getValue(); + + // the next map call has jobID + 1 + numJobs++; + + // send job for map via the completion service + final CallableMap doMap = new CallableMap(map, numJobs, input); + final Future> mapJob = mapExecutor.submit(doMap); + mapResultQueue.put(mapJob); + + debugPrint(" Done with cycle of map/reduce"); + + if ( numJobs % inputBufferSize == 0 && progressFunction != null ) + progressFunction.progress(input); + } else { + mapResultQueue.put(new FutureValue>(new MapResult())); + return reduceResult.get(); // wait for our result of reduce + } + } + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } catch (ExecutionException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } + + /** + * A simple callable version of the map function for use with the executor pool + */ + private class CallableMap implements Callable> { + final int id; + final InputType input; + final NSMapFunction map; + + @Requires({"map != null"}) + private CallableMap(final NSMapFunction map, + final int id, + final InputType input) { + this.id = id; + this.input = input; + this.map = map; + } + + @Override + public MapResult call() { + if ( TIME_CALLS ) mapTimer.restart(); + if ( debug ) debugPrint("\t\tmap " + input); + final MapType result = 
map.apply(input); + if ( TIME_CALLS ) mapTimer.stop(); + return new MapResult(result, id); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java new file mode 100644 index 000000000..506e45453 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; + +/** + * Thread that runs the reduce of the map/reduce. + * + * This thread reads from mapResultsQueue until the poison EOF object arrives. At each + * stage is calls reduce(value, sum). The blocking mapResultQueue ensures that the + * queue waits until the mapResultQueue has a value to take. Then, it gets and waits + * until the map result Future has a value. 
+ */ +class ReducerThread implements Callable { + final NSReduceFunction reduce; + final SimpleTimer reduceTimer; + final BlockingQueue>> mapResultQueue; + + ReduceType sum; + int lastJobID = -1; + + public ReducerThread(final NSReduceFunction reduce, + final SimpleTimer reduceTimer, + final ReduceType sum, + final BlockingQueue>> mapResultQueue) { + if ( reduce == null ) throw new IllegalArgumentException("Reduce function cannot be null"); + if ( mapResultQueue == null ) throw new IllegalArgumentException("mapResultQueue cannot be null"); + + this.reduce = reduce; + this.reduceTimer = reduceTimer; + this.sum = sum; + this.mapResultQueue = mapResultQueue; + } + + public ReduceType call() { + try { + while ( true ) { + final MapResult result = mapResultQueue.take().get(); + if ( result.isLast() ) { + // we are done, just return sum + return sum; + } + else if ( result.getJobID() < lastJobID ) { + // make sure the map results are coming in order + throw new IllegalStateException("BUG: last jobID " + lastJobID + " > current jobID " + result.getJobID()); + } else { + lastJobID = result.getJobID(); + // apply reduce, keeping track of sum + if ( reduceTimer != null ) reduceTimer.restart(); + sum = reduce.apply(result.getValue(), sum); + if ( reduceTimer != null ) reduceTimer.stop(); + } + } + } catch (ExecutionException ex) { + throw new ReviewedStingException("got execution exception", ex); + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java new file mode 100644 index 000000000..431014032 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; + +import 
java.lang.annotation.*; + +/** + * User: hanna + * Date: May 14, 2009 + * Time: 1:51:22 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Allows the walker to indicate what type of data it wants to consume. + */ + +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface BQSRMode { + public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java new file mode 100644 index 000000000..fae0e8c09 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java @@ -0,0 +1,40 @@ +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * A ReadTransformer that applies BQSR on the fly to reads + * + * User: rpoplin + * Date: 2/13/12 + */ +public class BQSRReadTransformer extends ReadTransformer { + private boolean enabled; + private BaseRecalibration bqsr; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + this.enabled = engine.hasBaseRecalibration(); + this.bqsr = engine.getBaseRecalibration(); + final BQSRMode mode = WalkerManager.getWalkerAnnotation(walker, 
BQSRMode.class); + return mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return enabled; + } + + /** + * initialize a new BQSRReadTransformer that applies BQSR on the fly to incoming reads. + */ + @Override + public GATKSAMRecord apply(GATKSAMRecord read) { + bqsr.recalibrateRead(read); + return read; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java new file mode 100644 index 000000000..7c2d9bfdc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.picard.sam.MergingSamRecordIterator; +import net.sf.picard.sam.SamFileHeaderMerger; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * Simple wrapper class that multiplexes multiple ArtificialSingleSampleReadStreams into a single stream of reads + * + * @author David Roazen + */ +public class ArtificialMultiSampleReadStream implements Iterable { + + private Collection perSampleArtificialReadStreams; + private MergingSamRecordIterator mergingIterator; + + public ArtificialMultiSampleReadStream( Collection perSampleArtificialReadStreams ) { + if ( perSampleArtificialReadStreams == null || perSampleArtificialReadStreams.isEmpty() ) { + throw new ReviewedStingException("Can't create an ArtificialMultiSampleReadStream out of 0 ArtificialSingleSampleReadStreams"); + } + + this.perSampleArtificialReadStreams = perSampleArtificialReadStreams; + } + + public Iterator iterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return mergingIterator; + } + + public StingSAMIterator getStingSAMIterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return StingSAMIteratorAdapter.adapt(mergingIterator); + } + + private void initialize() { + Collection perSampleSAMReaders = new ArrayList(perSampleArtificialReadStreams.size()); + Collection headers = new ArrayList(perSampleArtificialReadStreams.size()); + + for ( ArtificialSingleSampleReadStream readStream : perSampleArtificialReadStreams ) { + Collection thisStreamReads = readStream.makeReads(); + + SAMFileReader reader = new 
ArtificialSAMFileReader(readStream.getHeader(), + thisStreamReads.toArray(new SAMRecord[thisStreamReads.size()])); + perSampleSAMReaders.add(reader); + headers.add(reader.getFileHeader()); + } + + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); + mergingIterator = new MergingSamRecordIterator(headerMerger, perSampleSAMReaders, true); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java new file mode 100644 index 000000000..a9480692b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; + +/** + * An artificial stream of reads from a single read group/sample with configurable characteristics + * such as: + * + * -the number of contigs that the reads should be distributed across + * -number of "stacks" of reads sharing the same alignment start position per contig + * -the min/max number of reads in each stack (exact values chosen randomly from this range) + * -the min/max distance between stack start positions (exact values chosen randomly from this range) + * -the min/max length of each read (exact values chosen randomly from this range) + * -the number of unmapped reads + * + * The cigar string for all reads will be *M, where * is the length of the read. 
+ * + * @author David Roazen + */ +public class ArtificialSingleSampleReadStream implements Iterable { + private SAMFileHeader header; + private String readGroupID; + private int numContigs; + private int numStacksPerContig; + private int minReadsPerStack; + private int maxReadsPerStack; + private int minDistanceBetweenStacks; + private int maxDistanceBetweenStacks; + private int minReadLength; + private int maxReadLength; + private int numUnmappedReads; + + private static final String READ_GROUP_TAG = "RG"; + + public ArtificialSingleSampleReadStream( SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + this.header = header; + this.readGroupID = readGroupID; + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + this.minReadLength = minReadLength; + this.maxReadLength = maxReadLength; + this.numUnmappedReads = numUnmappedReads; + + validateStreamParameters(); + } + + private void validateStreamParameters() { + if ( header == null || readGroupID == null ) { + throw new ReviewedStingException("null SAMFileHeader or read group ID") ; + } + + if ( header.getReadGroup(readGroupID) == null ) { + throw new ReviewedStingException(String.format("Read group %s not found in SAMFileHeader", readGroupID)); + } + + if ( numContigs < 0 || numStacksPerContig < 0 || minReadsPerStack < 0 || maxReadsPerStack < 0 || + minDistanceBetweenStacks < 0 || maxDistanceBetweenStacks < 0 || minReadLength < 0 || maxReadLength < 0 || + numUnmappedReads < 0 ) { + throw new ReviewedStingException("Read stream parameters must be >= 0"); + } + + if ( 
(numContigs == 0 && numStacksPerContig != 0) || (numContigs != 0 && numStacksPerContig == 0) ) { + throw new ReviewedStingException("numContigs and numStacksPerContig must either both be > 0, or both be 0"); + } + + if ( minReadsPerStack > maxReadsPerStack ) { + throw new ReviewedStingException("minReadsPerStack > maxReadsPerStack"); + } + + if ( minDistanceBetweenStacks > maxDistanceBetweenStacks ) { + throw new ReviewedStingException("minDistanceBetweenStacks > maxDistanceBetweenStacks"); + } + + if ( minReadLength > maxReadLength ) { + throw new ReviewedStingException("minReadLength > maxReadLength"); + } + } + + public Iterator iterator() { + return makeReads().iterator(); + } + + public StingSAMIterator getStingSAMIterator() { + return StingSAMIteratorAdapter.adapt(iterator()); + } + + public Collection makeReads() { + Collection reads = new ArrayList(numContigs * numStacksPerContig * maxReadsPerStack); + + for ( int contig = 0; contig < numContigs; contig++ ) { + int alignmentStart = 1; + + for ( int stack = 0; stack < numStacksPerContig; stack++ ) { + reads.addAll(makeReadStack(contig, alignmentStart, MathUtils.randomIntegerInRange(minReadsPerStack, maxReadsPerStack))); + alignmentStart += MathUtils.randomIntegerInRange(minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + if ( numUnmappedReads > 0 ) { + reads.addAll(makeReadStack(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START, numUnmappedReads)); + } + + return reads; + } + + private Collection makeReadStack( int contig, int alignmentStart, int stackSize ) { + Collection readStack = new ArrayList(stackSize); + + for ( int i = 0; i < stackSize; i++ ) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, + "foo", + contig, + alignmentStart, + MathUtils.randomIntegerInRange(minReadLength, maxReadLength)); + read.setAttribute(READ_GROUP_TAG, readGroupID); + readStack.add(read); + } + + return readStack; + } + + public SAMFileHeader getHeader() { + return header; + 
} + + public String getReadGroupID() { + return readGroupID; + } + + public int getNumContigs() { + return numContigs; + } + + public int getNumStacksPerContig() { + return numStacksPerContig; + } + + public int getMinReadsPerStack() { + return minReadsPerStack; + } + + public int getMaxReadsPerStack() { + return maxReadsPerStack; + } + + public int getMinDistanceBetweenStacks() { + return minDistanceBetweenStacks; + } + + public int getMaxDistanceBetweenStacks() { + return maxDistanceBetweenStacks; + } + + public int getMinReadLength() { + return minReadLength; + } + + public int getMaxReadLength() { + return maxReadLength; + } + + public int getNumUnmappedReads() { + return numUnmappedReads; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java new file mode 100644 index 000000000..a4d7c5146 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.List; + +/** + * A class for analyzing and validating the read stream produced by an ArtificialSingleSampleReadStream. + * + * Collects various statistics about the stream of reads it's fed, and validates the stream + * by checking whether the collected statistics match the nominal properties of the stream. + * + * Subclasses are expected to override the validate() method in order to check whether an artificial + * read stream has been *transformed* in some way (eg., by downsampling or some other process), rather + * than merely checking whether the stream matches its original properties. 
+ * + * Usage is simple: + * + * ArtificialSingleSampleReadStreamAnalyzer analyzer = new ArtificialSingleSampleReadStreamAnalyzer(originalStream); + * analyzer.analyze(originalOrTransformedStream); + * analyzer.validate(); // override this method if you want to check whether the stream has been transformed + * // in a certain way relative to the original stream + * + * @author David Roazen + */ +public class ArtificialSingleSampleReadStreamAnalyzer { + protected ArtificialSingleSampleReadStream originalStream; + protected SAMRecord lastRead; + protected int totalReads; + protected boolean allSamplesMatch; + protected int numContigs; + protected List stacksPerContig; + protected Integer minReadsPerStack; + protected Integer maxReadsPerStack; + protected Integer minDistanceBetweenStacks; + protected Integer maxDistanceBetweenStacks; + protected Integer minReadLength; + protected Integer maxReadLength; + protected int numUnmappedReads; + + protected int currentContigNumStacks; + protected int currentStackNumReads; + + /** + * Construct a new read stream analyzer, providing an ArtificialSingleSampleReadStream that will + * serve as the basis for comparison after the analysis is complete. 
+ * + * @param originalStream the original ArtificialSingleSampleReadStream upon which the stream + * that will be fed to the analyzer is based + */ + public ArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream ) { + this.originalStream = originalStream; + reset(); + } + + /** + * Reset all read stream statistics collected by this analyzer to prepare for a fresh run + */ + public void reset() { + lastRead = null; + totalReads = 0; + allSamplesMatch = true; + numContigs = 0; + stacksPerContig = new ArrayList(); + minReadsPerStack = null; + maxReadsPerStack = null; + minDistanceBetweenStacks = null; + maxDistanceBetweenStacks = null; + minReadLength = null; + maxReadLength = null; + numUnmappedReads = 0; + currentContigNumStacks = 0; + currentStackNumReads = 0; + } + + /** + * Collect statistics on the stream of reads passed in + * + * @param stream the stream of reads to analyze + */ + public void analyze( Iterable stream ) { + for ( SAMRecord read : stream ) { + update(read); + } + finalizeStats(); + } + + /** + * Validate the stream by checking whether our collected statistics match the properties of the + * original stream. Throws a ReviewedStingException if the stream is invalid. + * + * Override this method if you want to check whether the stream has been transformed in some + * way relative to the original stream. + */ + public void validate() { + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { + if ( totalReads != 0 ) { + throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); + } + return; // no further validation needed for the 0-reads case + } + else if ( totalReads == 0 ) { + throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); + } + + if ( ! 
allSamplesMatch ) { + throw new ReviewedStingException("some reads had the wrong sample"); + } + + if ( numContigs != originalStream.getNumContigs() ) { + throw new ReviewedStingException("number of contigs not correct"); + } + + if ( stacksPerContig.size() != originalStream.getNumContigs() ) { + throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", + stacksPerContig.size(), originalStream.getNumContigs())); + } + + for ( int contigStackCount : stacksPerContig ) { + if ( contigStackCount != originalStream.getNumStacksPerContig() ) { + throw new ReviewedStingException("contig had incorrect number of stacks"); + } + } + + if ( originalStream.getNumStacksPerContig() > 0 ) { + if ( minReadsPerStack < originalStream.getMinReadsPerStack() ) { + throw new ReviewedStingException("stack had fewer than the minimum number of reads"); + } + if ( maxReadsPerStack > originalStream.getMaxReadsPerStack() ) { + throw new ReviewedStingException("stack had more than the maximum number of reads"); + } + } + else if ( minReadsPerStack != null || maxReadsPerStack != null ) { + throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); + } + + if ( originalStream.getNumStacksPerContig() > 1 ) { + if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by less than the minimum distance"); + } + if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by more than the maximum distance"); + } + } + else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { + throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); + } + + if ( minReadLength < 
originalStream.getMinReadLength() ) { + throw new ReviewedStingException("read was shorter than the minimum allowed length"); + } + if ( maxReadLength > originalStream.getMaxReadLength() ) { + throw new ReviewedStingException("read was longer than the maximum allowed length"); + } + + if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { + throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", + originalStream.getNumUnmappedReads(), numUnmappedReads)); + } + + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && + numUnmappedReads != totalReads ) { + throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); + } + } + + public void update( SAMRecord read ) { + if ( read.getReadUnmappedFlag() ) { + numUnmappedReads++; + + if ( numUnmappedReads == 1 && lastRead != null ) { + processContigChange(); + numContigs--; + } + } + else if ( lastRead == null ) { + numContigs = 1; + currentContigNumStacks = 1; + currentStackNumReads = 1; + } + else if ( ! 
read.getReferenceIndex().equals(lastRead.getReferenceIndex()) ) { + processContigChange(); + } + else if ( read.getAlignmentStart() != lastRead.getAlignmentStart() ) { + processStackChangeWithinContig(read); + } + else { + currentStackNumReads++; + } + + updateReadLength(read.getReadLength()); + allSamplesMatch = allSamplesMatch && readHasCorrectSample(read); + totalReads++; + + lastRead = read; + } + + + private void processContigChange() { + numContigs++; + + stacksPerContig.add(currentContigNumStacks); + currentContigNumStacks = 1; + + updateReadsPerStack(currentStackNumReads); + currentStackNumReads = 1; + } + + private void processStackChangeWithinContig( SAMRecord read ) { + currentContigNumStacks++; + + updateReadsPerStack(currentStackNumReads); + currentStackNumReads = 1; + + updateDistanceBetweenStacks(read.getAlignmentStart() - lastRead.getAlignmentStart()); + } + + private void updateReadsPerStack( int stackReadCount ) { + if ( minReadsPerStack == null || stackReadCount < minReadsPerStack ) { + minReadsPerStack = stackReadCount; + } + if ( maxReadsPerStack == null || stackReadCount > maxReadsPerStack ) { + maxReadsPerStack = stackReadCount; + } + } + + private void updateDistanceBetweenStacks( int stackDistance ) { + if ( minDistanceBetweenStacks == null || stackDistance < minDistanceBetweenStacks ) { + minDistanceBetweenStacks = stackDistance; + } + if ( maxDistanceBetweenStacks == null || stackDistance > maxDistanceBetweenStacks ) { + maxDistanceBetweenStacks = stackDistance; + } + } + + private void updateReadLength( int readLength ) { + if ( minReadLength == null || readLength < minReadLength ) { + minReadLength = readLength; + } + if ( maxReadLength == null || readLength > maxReadLength ) { + maxReadLength = readLength; + } + } + + private boolean readHasCorrectSample( SAMRecord read ) { + return originalStream.getReadGroupID().equals(read.getAttribute("RG")); + } + + public void finalizeStats() { + if ( lastRead != null && ! 
lastRead.getReadUnmappedFlag() ) { + stacksPerContig.add(currentContigNumStacks); + updateReadsPerStack(currentStackNumReads); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java new file mode 100644 index 000000000..b30198608 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java @@ -0,0 +1,158 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +package org.broadinstitute.sting.utils.threading; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.apache.log4j.Priority; +import org.broadinstitute.sting.utils.AutoFormattingTime; + +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadInfo; +import java.lang.management.ThreadMXBean; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; + +/** + * Creates threads that automatically monitor their efficiency via the parent ThreadEfficiencyMonitor + * + * User: depristo + * Date: 8/14/12 + * Time: 8:47 AM + */ +@Invariant({ + "activeThreads.size() <= nThreadsToCreate", + "countDownLatch.getCount() <= nThreadsToCreate", + "nThreadsCreated <= nThreadsToCreate" +}) +public class EfficiencyMonitoringThreadFactory extends ThreadEfficiencyMonitor implements ThreadFactory { + final int nThreadsToCreate; + final List activeThreads; + + int nThreadsCreated = 0; + + /** + * Counts down the number of active activeThreads whose runtime info hasn't been incorporated into + * times. Counts down from nThreadsToCreate to 0, at which point any code waiting + * on the final times is freed to run. + */ + final CountDownLatch countDownLatch; + + /** + * Create a new factory generating threads whose runtime and contention + * behavior is tracked in this factory. 
+ * + * @param nThreadsToCreate the number of threads we will create in the factory before it's considered complete + */ + public EfficiencyMonitoringThreadFactory(final int nThreadsToCreate) { + super(); + if ( nThreadsToCreate <= 0 ) throw new IllegalArgumentException("nThreadsToCreate <= 0: " + nThreadsToCreate); + + this.nThreadsToCreate = nThreadsToCreate; + activeThreads = new ArrayList(nThreadsToCreate); + countDownLatch = new CountDownLatch(nThreadsToCreate); + } + + /** + * How many threads have been created by this factory so far? + * @return + */ + @Ensures("result >= 0") + public int getNThreadsCreated() { + return nThreadsCreated; + } + + /** + * Only useful for testing, so that we can wait for all of the threads in the factory to complete running + * + * @throws InterruptedException + */ + protected void waitForAllThreadsToComplete() throws InterruptedException { + countDownLatch.await(); + } + + @Ensures({ + "activeThreads.size() <= old(activeThreads.size())", + "! activeThreads.contains(thread)", + "countDownLatch.getCount() <= old(countDownLatch.getCount())" + }) + @Override + public synchronized void threadIsDone(final Thread thread) { + nThreadsAnalyzed++; + + if ( DEBUG ) logger.warn(" Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); + + super.threadIsDone(thread); + + // remove the thread from the list of active activeThreads, if it's in there, and decrement the countdown latch + if ( activeThreads.remove(thread) ) { + // one less thread is live for those blocking on all activeThreads to be complete + countDownLatch.countDown(); + if ( DEBUG ) logger.warn(" -> Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); + } + } + + /** + * Create a new thread from this factory + * + * @param runnable + * @return + */ + @Override + @Ensures({ + "activeThreads.size() > old(activeThreads.size())", + "activeThreads.contains(result)", + "nThreadsCreated == 
old(nThreadsCreated) + 1" }) + public synchronized Thread newThread(final Runnable runnable) { + if ( activeThreads.size() >= nThreadsToCreate) + throw new IllegalStateException("Attempting to create more activeThreads than allowed by constructor argument nThreadsToCreate " + nThreadsToCreate); + + nThreadsCreated++; + final Thread myThread = new TrackingThread(runnable); + activeThreads.add(myThread); + return myThread; + } + + /** + * A wrapper around Thread that tracks the runtime of the thread and calls threadIsDone() when complete + */ + private class TrackingThread extends Thread { + private TrackingThread(Runnable runnable) { + super(runnable); + } + + @Override + public void run() { + super.run(); + threadIsDone(this); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java new file mode 100644 index 000000000..b25375b87 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java @@ -0,0 +1,26 @@ +package org.broadinstitute.sting.utils.threading; + +import java.util.concurrent.ThreadFactory; + +/** + * Thread factory that produces threads with a given name pattern + * + * User: depristo + * Date: 9/5/12 + * Time: 9:22 PM + * + */ +public class NamedThreadFactory implements ThreadFactory { + static int id = 0; + final String format; + + public NamedThreadFactory(String format) { + this.format = format; + String.format(format, id); // test the name + } + + @Override + public Thread newThread(Runnable r) { + return new Thread(r, String.format(format, id++)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java new file mode 100644 index 000000000..9159f5657 --- /dev/null +++ 
b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java @@ -0,0 +1,207 @@ +package org.broadinstitute.sting.utils.threading; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.apache.log4j.Priority; +import org.broadinstitute.sting.utils.AutoFormattingTime; + +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadInfo; +import java.lang.management.ThreadMXBean; +import java.util.EnumMap; +import java.util.concurrent.TimeUnit; + +/** + * Uses an MXBean to monitor thread efficiency + * + * Once the monitor is created, calls to threadIsDone() can be used to add information + * about the efficiency of the provided thread to this monitor. + * + * Provides simple print() for displaying efficiency information to a logger + * + * User: depristo + * Date: 8/22/12 + * Time: 10:48 AM + */ +@Invariant({"nThreadsAnalyzed >= 0"}) +public class ThreadEfficiencyMonitor { + protected static final boolean DEBUG = false; + protected static Logger logger = Logger.getLogger(EfficiencyMonitoringThreadFactory.class); + final EnumMap times = new EnumMap(State.class); + + /** + * The number of threads we've included in our efficiency monitoring + */ + int nThreadsAnalyzed = 0; + + /** + * The bean used to get the thread info about blocked and waiting times + */ + final ThreadMXBean bean; + + public ThreadEfficiencyMonitor() { + bean = ManagementFactory.getThreadMXBean(); + + // get the bean, and start tracking + if ( bean.isThreadContentionMonitoringSupported() ) + bean.setThreadContentionMonitoringEnabled(true); + else + logger.warn("Thread contention monitoring not supported, we cannot track GATK multi-threaded efficiency"); + //bean.setThreadCpuTimeEnabled(true); + + if ( bean.isThreadCpuTimeSupported() ) + bean.setThreadCpuTimeEnabled(true); + else + logger.warn("Thread CPU monitoring not supported, we 
cannot track GATK multi-threaded efficiency"); + + // initialize times to 0 + for ( final State state : State.values() ) + times.put(state, 0l); + } + + private static long nanoToMilli(final long timeInNano) { + return TimeUnit.NANOSECONDS.toMillis(timeInNano); + } + + /** + * Get the time spent in state across all threads created by this factory + * + * @param state to get information about + * @return the time in milliseconds + */ + @Ensures({"result >= 0"}) + public synchronized long getStateTime(final State state) { + return times.get(state); + } + + /** + * Get the total time spent in all states across all threads created by this factory + * + * @return the time in milliseconds + */ + @Ensures({"result >= 0"}) + public synchronized long getTotalTime() { + long total = 0; + for ( final long time : times.values() ) + total += time; + return total; + } + + /** + * Get the fraction of time spent in state across all threads created by this factory + * + * @return the percentage (0.0-100.0) of time spent in state over all state times of all threads + */ + @Ensures({"result >= 0.0", "result <= 100.0"}) + public synchronized double getStatePercent(final State state) { + return (100.0 * getStateTime(state)) / Math.max(getTotalTime(), 1); + } + + public int getnThreadsAnalyzed() { + return nThreadsAnalyzed; + } + + @Override + public synchronized String toString() { + final StringBuilder b = new StringBuilder(); + + b.append("total ").append(getTotalTime()).append(" "); + for ( final State state : State.values() ) { + b.append(state).append(" ").append(getStateTime(state)).append(" "); + } + + return b.toString(); + } + + /** + * Print usage information about threads from this factory to logger + * with the INFO priority + * + * @param logger + */ + public synchronized void printUsageInformation(final Logger logger) { + printUsageInformation(logger, Priority.INFO); + } + + /** + * Print usage information about threads from this factory to logger + * with the provided 
priority + * + * @param logger + */ + public synchronized void printUsageInformation(final Logger logger, final Priority priority) { + logger.debug("Number of threads monitored: " + getnThreadsAnalyzed()); + logger.debug("Total runtime " + new AutoFormattingTime(TimeUnit.MILLISECONDS.toSeconds(getTotalTime()))); + for ( final State state : State.values() ) { + logger.debug(String.format("\tPercent of time spent %s is %.2f", state.getUserFriendlyName(), getStatePercent(state))); + } + logger.log(priority, String.format("CPU efficiency : %6.2f%% of time spent %s", getStatePercent(State.USER_CPU), State.USER_CPU.getUserFriendlyName())); + logger.log(priority, String.format("Walker inefficiency : %6.2f%% of time spent %s", getStatePercent(State.BLOCKING), State.BLOCKING.getUserFriendlyName())); + logger.log(priority, String.format("I/O inefficiency : %6.2f%% of time spent %s", getStatePercent(State.WAITING_FOR_IO), State.WAITING_FOR_IO.getUserFriendlyName())); + logger.log(priority, String.format("Thread inefficiency : %6.2f%% of time spent %s", getStatePercent(State.WAITING), State.WAITING.getUserFriendlyName())); + } + + /** + * Update the information about completed thread that ran for runtime in milliseconds + * + * This method updates all of the key timing and tracking information in the factory so that + * thread can be retired. 
After this call the factory shouldn't have a pointer to the thread any longer + * + * @param thread the thread whose information we are updating + */ + @Ensures({ + "getTotalTime() >= old(getTotalTime())" + }) + public synchronized void threadIsDone(final Thread thread) { + nThreadsAnalyzed++; + + if ( DEBUG ) logger.warn("UpdateThreadInfo called"); + + final long threadID = thread.getId(); + final ThreadInfo info = bean.getThreadInfo(thread.getId()); + final long totalTimeNano = bean.getThreadCpuTime(threadID); + final long userTimeNano = bean.getThreadUserTime(threadID); + final long systemTimeNano = totalTimeNano - userTimeNano; + final long userTimeInMilliseconds = nanoToMilli(userTimeNano); + final long systemTimeInMilliseconds = nanoToMilli(systemTimeNano); + + if ( info != null ) { + if ( DEBUG ) logger.warn("Updating thread with user runtime " + userTimeInMilliseconds + " and system runtime " + systemTimeInMilliseconds + " of which blocked " + info.getBlockedTime() + " and waiting " + info.getWaitedTime()); + incTimes(State.BLOCKING, info.getBlockedTime()); + incTimes(State.WAITING, info.getWaitedTime()); + incTimes(State.USER_CPU, userTimeInMilliseconds); + incTimes(State.WAITING_FOR_IO, systemTimeInMilliseconds); + } + } + + /** + * Helper function that increments the times counter by by for state + * + * @param state + * @param by + */ + @Requires({"state != null", "by >= 0"}) + @Ensures("getTotalTime() == old(getTotalTime()) + by") + private synchronized void incTimes(final State state, final long by) { + times.put(state, times.get(state) + by); + } + + public enum State { + BLOCKING("blocking on synchronized data structures"), + WAITING("waiting on some other thread"), + USER_CPU("doing productive CPU work"), + WAITING_FOR_IO("waiting for I/O"); + + private final String userFriendlyName; + + private State(String userFriendlyName) { + this.userFriendlyName = userFriendlyName; + } + + public String getUserFriendlyName() { + return userFriendlyName; + } + 
} +} diff --git a/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java new file mode 100644 index 000000000..924c6ec5a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java @@ -0,0 +1,41 @@ +package org.broadinstitute.sting.commandline; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/31/12 + * Time: 11:03 AM + * To change this template use File | Settings | File Templates. + */ +public class InvalidArgumentIntegrationTest extends WalkerTest { + private static final String callsB36 = BaseTest.validationDataLocation + "lowpass.N3.chr1.raw.vcf"; + + private WalkerTest.WalkerTestSpec baseTest(String flag, String arg, Class exeption) { + return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s " + flag + " " + arg, + 1, exeption); + + } + + @Test + public void testUnknownReadFilter() { + executeTest("UnknownReadFilter",baseTest("-rf","TestUnknownReadFilter", UserException.MalformedReadFilterException.class)); + } + + @Test + public void testMalformedWalkerArgs() { + executeTest("MalformedWalkerArgs", + new WalkerTest.WalkerTestSpec("-T UnknownWalkerName -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s ", + 1, UserException.MalformedWalkerArgumentsException.class)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java new file mode 100644 index 000000000..2717d014c 
--- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; + +public class LevelingDownsamplerUnitTest extends BaseTest { + + private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider { + public enum DataStructure { LINKED_LIST, ARRAY_LIST } + + int targetSize; + int numStacks; + int stackSize; + DataStructure dataStructure; + int expectedSize; + + public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) { + super(LevelingDownsamplerUniformStacksTest.class); + + this.targetSize = targetSize; + this.numStacks = numStacks; + this.stackSize = stackSize; + this.dataStructure = dataStructure; + expectedSize = calculateExpectedDownsampledStackSize(); + + setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d", + getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize)); + } + + public Collection> createStacks() { + Collection> stacks = new ArrayList>(); + + for ( int i = 1; i <= numStacks; i++ ) { + List stack = dataStructure == DataStructure.LINKED_LIST ? 
new LinkedList() : new ArrayList(); + + for ( int j = 1; j <= stackSize; j++ ) { + stack.add(new Object()); + } + + stacks.add(stack); + } + + return stacks; + } + + private int calculateExpectedDownsampledStackSize() { + int numItemsToRemove = numStacks * stackSize - targetSize; + + if ( numStacks == 0 ) { + return 0; + } + else if ( numItemsToRemove <= 0 ) { + return stackSize; + } + + return Math.max(1, stackSize - (numItemsToRemove / numStacks)); + } + } + + @DataProvider(name = "UniformStacksDataProvider") + public Object[][] createUniformStacksTestData() { + for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) { + for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) { + new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure); + } + } + } + } + + return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class); + } + + @Test( dataProvider = "UniformStacksDataProvider" ) + public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + Downsampler> downsampler = new LevelingDownsampler, Object>(test.targetSize); + + downsampler.submit(test.createStacks()); + + if ( test.numStacks > 0 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( 
test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List> downsampledStacks = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledStacks.size(), test.numStacks); + + int totalRemainingItems = 0; + for ( List stack : downsampledStacks ) { + Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1); + totalRemainingItems += stack.size(); + } + + int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); + int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; + + Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + + Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java new file mode 100644 index 000000000..b9022900b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * 
files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.VerifyingSamIterator; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialMultiSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class PerSampleDownsamplingReadsIteratorUnitTest extends BaseTest { + + 
private static class PerSampleDownsamplingReadsIteratorTest extends TestDataProvider { + + // TODO: tests should distinguish between variance across samples and variance within a sample + + private enum StreamDensity { + SPARSE (MAX_READ_LENGTH, MAX_READ_LENGTH * 2), + DENSE (1, MIN_READ_LENGTH), + MIXED (1, MAX_READ_LENGTH * 2), + UNIFORM_DENSE (1, 1), + UNIFORM_SPARSE (MAX_READ_LENGTH * 2, MAX_READ_LENGTH * 2); + + int minDistanceBetweenStacks; + int maxDistanceBetweenStacks; + + StreamDensity( int minDistanceBetweenStacks, int maxDistanceBetweenStacks ) { + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + } + + public String toString() { + return String.format("StreamDensity:%d-%d", minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + private enum StreamStackDepth { + NON_UNIFORM_LOW (1, 5), + NON_UNIFORM_HIGH (15, 20), + NON_UNIFORM_MIXED (1, 20), + UNIFORM_SINGLE (1, 1), + UNIFORM_LOW (2, 2), + UNIFORM_HIGH (20, 20), + UNIFORM_MEDIUM (10, 10); // should set target coverage to this value for testing + + int minReadsPerStack; + int maxReadsPerStack; + + StreamStackDepth( int minReadsPerStack, int maxReadsPerStack ) { + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + } + + public boolean isUniform() { + return minReadsPerStack == maxReadsPerStack; + } + + public String toString() { + return String.format("StreamStackDepth:%d-%d", minReadsPerStack, maxReadsPerStack); + } + } + + private enum StreamStacksPerContig { + UNIFORM(20, 20), + NON_UNIFORM(1, 30); + + int minStacksPerContig; + int maxStacksPerContig; + + StreamStacksPerContig( int minStacksPerContig, int maxStacksPerContig ) { + this.minStacksPerContig = minStacksPerContig; + this.maxStacksPerContig = maxStacksPerContig; + } + + public boolean isUniform() { + return minStacksPerContig == maxStacksPerContig; + } + + public String toString() { + return 
String.format("StreamStacksPerContig:%d-%d", minStacksPerContig, maxStacksPerContig); + } + } + + // Not interested in testing multiple ranges for the read lengths, as none of our current + // downsamplers are affected by read length + private static final int MIN_READ_LENGTH = 50; + private static final int MAX_READ_LENGTH = 150; + + private ReadsDownsamplerFactory downsamplerFactory; + private int targetCoverage; + private int numSamples; + private int minContigs; + private int maxContigs; + private StreamDensity streamDensity; + private StreamStackDepth streamStackDepth; + private StreamStacksPerContig streamStacksPerContig; + private double unmappedReadsFraction; + private int unmappedReadsCount; + private boolean verifySortedness; + + private ArtificialMultiSampleReadStream mergedReadStream; + private Map perSampleArtificialReadStreams; + private Map perSampleStreamAnalyzers; + private SAMFileHeader header; + + public PerSampleDownsamplingReadsIteratorTest( ReadsDownsamplerFactory downsamplerFactory, + int targetCoverage, + int numSamples, + int minContigs, + int maxContigs, + StreamDensity streamDensity, + StreamStackDepth streamStackDepth, + StreamStacksPerContig streamStacksPerContig, + double unmappedReadsFraction, + int unmappedReadsCount, + boolean verifySortedness ) { + super(PerSampleDownsamplingReadsIteratorTest.class); + + this.downsamplerFactory = downsamplerFactory; + this.targetCoverage = targetCoverage; + this.numSamples = numSamples; + this.minContigs = minContigs; + this.maxContigs = maxContigs; + this.streamDensity = streamDensity; + this.streamStackDepth = streamStackDepth; + this.streamStacksPerContig = streamStacksPerContig; + this.unmappedReadsFraction = unmappedReadsFraction; + this.unmappedReadsCount = unmappedReadsCount; + this.verifySortedness = verifySortedness; + + header = createHeader(); + createReadStreams(); + + setName(String.format("%s: targetCoverage=%d numSamples=%d minContigs=%d maxContigs=%d %s %s %s 
unmappedReadsFraction=%.2f unmappedReadsCount=%d verifySortedness=%b", + getClass().getSimpleName(), targetCoverage, numSamples, minContigs, maxContigs, streamDensity, streamStackDepth, streamStacksPerContig, unmappedReadsFraction, unmappedReadsCount, verifySortedness)); + } + + private SAMFileHeader createHeader() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(maxContigs, 1, (streamDensity.maxDistanceBetweenStacks + MAX_READ_LENGTH) * streamStacksPerContig.maxStacksPerContig + 100000); + List readGroups = new ArrayList(numSamples); + List sampleNames = new ArrayList(numSamples); + + for ( int i = 0; i < numSamples; i++ ) { + readGroups.add("ReadGroup" + i); + sampleNames.add("Sample" + i); + } + + return ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroups, sampleNames); + } + + private void createReadStreams() { + perSampleArtificialReadStreams = new HashMap(numSamples); + perSampleStreamAnalyzers = new HashMap(numSamples); + + for (SAMReadGroupRecord readGroup : header.getReadGroups() ) { + String readGroupID = readGroup.getReadGroupId(); + String sampleName = readGroup.getSample(); + + int thisSampleNumContigs = MathUtils.randomIntegerInRange(minContigs, maxContigs); + int thisSampleStacksPerContig = MathUtils.randomIntegerInRange(streamStacksPerContig.minStacksPerContig, streamStacksPerContig.maxStacksPerContig); + + int thisSampleNumUnmappedReads = GenomeAnalysisEngine.getRandomGenerator().nextDouble() < unmappedReadsFraction ? 
unmappedReadsCount : 0; + + ArtificialSingleSampleReadStream thisSampleStream = new ArtificialSingleSampleReadStream(header, + readGroupID, + thisSampleNumContigs, + thisSampleStacksPerContig, + streamStackDepth.minReadsPerStack, + streamStackDepth.maxReadsPerStack, + streamDensity.minDistanceBetweenStacks, + streamDensity.maxDistanceBetweenStacks, + MIN_READ_LENGTH, + MAX_READ_LENGTH, + thisSampleNumUnmappedReads); + perSampleArtificialReadStreams.put(sampleName, thisSampleStream); + perSampleStreamAnalyzers.put(sampleName, new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(thisSampleStream, targetCoverage)); + } + + mergedReadStream = new ArtificialMultiSampleReadStream(perSampleArtificialReadStreams.values()); + } + + public void run() { + StingSAMIterator downsamplingIter = new PerSampleDownsamplingReadsIterator(mergedReadStream.getStingSAMIterator(), downsamplerFactory); + + if ( verifySortedness ) { + downsamplingIter = new VerifyingSamIterator(downsamplingIter); + } + + while ( downsamplingIter.hasNext() ) { + SAMRecord read = downsamplingIter.next(); + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + + ArtificialSingleSampleReadStreamAnalyzer analyzer = perSampleStreamAnalyzers.get(sampleName); + if ( analyzer != null ) { + analyzer.update(read); + } + else { + throw new ReviewedStingException("bug: stream analyzer for sample " + sampleName + " not found"); + } + } + + for ( Map.Entry analyzerEntry : perSampleStreamAnalyzers.entrySet() ) { + ArtificialSingleSampleReadStreamAnalyzer analyzer = analyzerEntry.getValue(); + analyzer.finalizeStats(); + + // Validate the downsampled read stream for each sample individually + analyzer.validate(); + } + + // Allow memory used by this test to be reclaimed: + mergedReadStream = null; + perSampleArtificialReadStreams = null; + perSampleStreamAnalyzers = null; + } + } + + @DataProvider(name = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public Object[][] createPerSampleDownsamplingReadsIteratorTests() { + + GenomeAnalysisEngine.resetRandomGenerator(); + + // Some values don't vary across tests + int targetCoverage = PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.UNIFORM_MEDIUM.minReadsPerStack; + ReadsDownsamplerFactory downsamplerFactory = new SimplePositionalDownsamplerFactory(targetCoverage); + int maxContigs = 3; + boolean verifySortedness = true; + + for ( int numSamples : Arrays.asList(1, 2, 10) ) { + for ( int minContigs = 1; minContigs <= maxContigs; minContigs++ ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamDensity streamDensity : PerSampleDownsamplingReadsIteratorTest.StreamDensity.values() ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamStackDepth streamStackDepth : PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.values() ) { + for (PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig streamStacksPerContig : PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig.values() ) { + for ( double unmappedReadsFraction : Arrays.asList(0.0, 1.0, 0.5) ) { + for ( int unmappedReadsCount : Arrays.asList(1, 50) ) { + new 
PerSampleDownsamplingReadsIteratorTest(downsamplerFactory, + targetCoverage, + numSamples, + minContigs, + maxContigs, + streamDensity, + streamStackDepth, + streamStacksPerContig, + unmappedReadsFraction, + unmappedReadsCount, + verifySortedness); + } + } + } + } + } + } + } + + return PerSampleDownsamplingReadsIteratorTest.getTests(PerSampleDownsamplingReadsIteratorTest.class); + } + + @Test(dataProvider = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public void runPerSampleDownsamplingReadsIteratorTest( PerSampleDownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java new file mode 100644 index 000000000..9cbd0db8a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; + +/** + * Class for analyzing an artificial read stream that has been positionally downsampled, and verifying + * that the downsampling was done correctly without changing the stream in unexpected ways. + * + * @author David Roazen + */ +public class PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer extends ArtificialSingleSampleReadStreamAnalyzer { + private int targetCoverage; + + public PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream, int targetCoverage ) { + super(originalStream); + this.targetCoverage = targetCoverage; + } + + /** + * Overridden validate() method that checks for the effects of positional downsampling in addition to checking + * for whether the original properties of the stream not affected by downsampling have been preserved + */ + @Override + public void validate() { + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { + if ( totalReads != 0 ) { + throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); + } + return; // no further validation needed 
for the 0-reads case + } + else if ( totalReads == 0 ) { + throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); + } + + if ( ! allSamplesMatch ) { + throw new ReviewedStingException("some reads had the wrong sample"); + } + + if ( numContigs != originalStream.getNumContigs() ) { + throw new ReviewedStingException("number of contigs not correct"); + } + + if ( stacksPerContig.size() != originalStream.getNumContigs() ) { + throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", + stacksPerContig.size(), originalStream.getNumContigs())); + } + + for ( int contigStackCount : stacksPerContig ) { + if ( contigStackCount != originalStream.getNumStacksPerContig() ) { + throw new ReviewedStingException("contig had incorrect number of stacks"); + } + } + + if ( originalStream.getNumStacksPerContig() > 0 ) { + + // Check for the effects of positional downsampling: + int stackMinimumAfterDownsampling = Math.min(targetCoverage, originalStream.getMinReadsPerStack()); + int stackMaximumAfterDownsampling = targetCoverage; + + if ( minReadsPerStack < stackMinimumAfterDownsampling ) { + throw new ReviewedStingException("stack had fewer than the minimum number of reads after downsampling"); + } + if ( maxReadsPerStack > stackMaximumAfterDownsampling ) { + throw new ReviewedStingException("stack had more than the maximum number of reads after downsampling"); + } + } + else if ( minReadsPerStack != null || maxReadsPerStack != null ) { + throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); + } + + if ( originalStream.getNumStacksPerContig() > 1 ) { + if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by less than the minimum distance"); + } + if ( maxDistanceBetweenStacks 
> originalStream.getMaxDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by more than the maximum distance"); + } + } + else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { + throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); + } + + if ( minReadLength < originalStream.getMinReadLength() ) { + throw new ReviewedStingException("read was shorter than the minimum allowed length"); + } + if ( maxReadLength > originalStream.getMaxReadLength() ) { + throw new ReviewedStingException("read was longer than the maximum allowed length"); + } + + if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { + throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", + originalStream.getNumUnmappedReads(), numUnmappedReads)); + } + + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && + numUnmappedReads != totalReads ) { + throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java new file mode 100644 index 000000000..75d0448c4 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit 
persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class ReservoirDownsamplerUnitTest extends BaseTest { + + private static class ReservoirDownsamplerTest extends TestDataProvider { + int reservoirSize; + int totalReads; + int expectedNumReadsAfterDownsampling; + int expectedNumDiscardedItems; + + public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) { + super(ReservoirDownsamplerTest.class); + + this.reservoirSize = reservoirSize; + this.totalReads = totalReads; + + expectedNumReadsAfterDownsampling = Math.min(reservoirSize, totalReads); + expectedNumDiscardedItems = totalReads <= reservoirSize ? 
0 : totalReads - reservoirSize; + + setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d", + getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems)); + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "ReservoirDownsamplerTestDataProvider") + public Object[][] createReservoirDownsamplerTestData() { + for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, 0); + for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, totalReads); + } + } + + return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class); + } + + @Test(dataProvider = "ReservoirDownsamplerTestDataProvider") + public void testReservoirDownsampler( ReservoirDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new ReservoirDownsampler(test.reservoirSize); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + 
Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); + Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java new file mode 100644 index 000000000..5dc41b4a0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above 
copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.*; + +public class SimplePositionalDownsamplerUnitTest extends BaseTest { + + private static class SimplePositionalDownsamplerTest extends TestDataProvider { + int targetCoverage; + int numStacks; + List stackSizes; + List expectedStackSizes; + boolean multipleContigs; + int totalInitialReads; + + public SimplePositionalDownsamplerTest( int targetCoverage, List stackSizes, boolean multipleContigs ) { + super(SimplePositionalDownsamplerTest.class); + + this.targetCoverage = targetCoverage; + this.numStacks = stackSizes.size(); + this.stackSizes = stackSizes; + this.multipleContigs = multipleContigs; + + calculateExpectedDownsampledStackSizes(); + + totalInitialReads = 0; + for ( Integer stackSize : stackSizes ) { + totalInitialReads += stackSize; + } + + setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b", + 
getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs)); + } + + public Collection createReads() { + Collection reads = new ArrayList(); + SAMFileHeader header = multipleContigs ? + ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) : + ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + int refIndex = 0; + int alignmentStart = 1; + int readLength = 100; + + for ( int i = 0; i < numStacks; i++ ) { + if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) { + refIndex++; + } + + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo", + refIndex, alignmentStart, readLength)); + + alignmentStart += 10; + } + + return reads; + } + + private void calculateExpectedDownsampledStackSizes() { + expectedStackSizes = new ArrayList(numStacks); + + for ( Integer stackSize : stackSizes ) { + int expectedSize = targetCoverage >= stackSize ? stackSize : targetCoverage; + expectedStackSizes.add(expectedSize); + } + } + } + + @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider") + public Object[][] createSimplePositionalDownsamplerTestData() { + GenomeAnalysisEngine.resetRandomGenerator(); + + for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) { + for ( int contigs = 1; contigs <= 2; contigs++ ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + List stackSizes = new ArrayList(numStacks); + for ( int stack = 1; stack <= numStacks; stack++ ) { + stackSizes.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(targetCoverage * 2) + 1); + } + new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1); + } + } + } + + return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class); + } + + @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" ) + public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) { + logger.warn("Running 
test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new SimplePositionalDownsampler(test.targetCoverage); + + downsampler.submit(test.createReads()); + + if ( test.numStacks > 1 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else if ( test.numStacks == 1 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + if ( test.numStacks == 0 ) { + Assert.assertTrue(downsampledReads.isEmpty()); + } + else { + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads); + + Assert.assertEquals(downsampledStackSizes.size(), test.numStacks); + Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes); + + int 
numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size(); + int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems(); + Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); + } + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } + + private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { + List stackSizes = new ArrayList(); + + if ( downsampledReads.isEmpty() ) { + return stackSizes; + } + + Iterator iter = downsampledReads.iterator(); + Assert.assertTrue(iter.hasNext()); + + SAMRecord previousRead = iter.next(); + int currentStackSize = 1; + + while ( iter.hasNext() ) { + SAMRecord currentRead = iter.next(); + + if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { + stackSizes.add(currentStackSize); + currentStackSize = 1; + } + else if ( currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { + Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); + } + else { + currentStackSize++; + } + + previousRead = currentRead; + } + + stackSizes.add(currentStackSize); + return stackSizes; + } + + @Test + public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100)); + downsampler.submit(readStack); + + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + + 
SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100); + downsampler.signalNoMoreReadsBefore(laterRead); + + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), readStack.size()); + } + + @Test + public void testBasicUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : readStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(readStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), readStack.size()); + + for ( SAMRecord read: downsampledReads ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + } + + @Test + public void testMixedMappedAndUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection mappedReadStack = new ArrayList(); + mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100)); + for ( SAMRecord read : mappedReadStack ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + + Collection unmappedReadStack = new ArrayList(); + 
unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : unmappedReadStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(mappedReadStack); + downsampler.submit(unmappedReadStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), 300); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100); + + int count = 1; + for ( SAMRecord read: downsampledReads ) { + if ( count <= 100 ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + else { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + count++; + } + } + + @Test + public void testGATKSAMRecordSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + List reads = new ArrayList(); + for ( int i = 0; i < 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), 10); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java new file mode 100644 index 000000000..c148bcf84 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java @@ -0,0 +1,546 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMFileHeader; +import 
net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * testing of the experimental version of LocusIteratorByState + */ +public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { + private static SAMFileHeader header; + private LocusIteratorByStateExperimental li; + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + private final LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { + return new LocusIteratorByStateExperimental(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups()); + } + + private static ReadProperties createTestReadProperties() { + return createTestReadProperties(null); + 
} + + private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { + return new ReadProperties( + Collections.emptyList(), + new SAMFileHeader(), + false, + SAMFileReader.ValidationStringency.STRICT, + downsamplingMethod, + new ValidationExclusion(), + Collections.emptyList(), + Collections.emptyList(), + false, + (byte) -1 + ); + } + + private static class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; + + public FakeCloseableIterator(Iterator it) { + iterator = it; + } + + @Override + public void close() { + return; + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } + } + + @Test + public void testXandEQOperators() { + final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] bases2 = new byte[] {'A','A','A','C','A','A','A','A','A','C'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10); + r1.setReadBases(bases1); + r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r1.setCigarString("10M"); + + SAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10); + r2.setReadBases(bases2); + r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r2.setCigarString("3=1X5=1X"); + + SAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10); + r3.setReadBases(bases2); + r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r3.setCigarString("3=1X5M1X"); + + SAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10); + r4.setReadBases(bases2); + r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r4.setCigarString("10M"); + + List reads 
= Arrays.asList(r1, r2, r3, r4); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + Assert.assertEquals(pileup.depthOfCoverage(), 4); + } + } + + @Test + public void testIndelsInRegularPileup() { + final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] indelBases = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); + before.setReadBases(bases); + before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + before.setCigarString("10M"); + + SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10); + during.setReadBases(indelBases); + during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + during.setCigarString("4M2I6M"); + + SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10); + after.setReadBases(bases); + after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + after.setCigarString("10M"); + + List reads = Arrays.asList(before, during, after); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + boolean foundIndel = false; + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10); + for (PileupElement p : pileup) { + if (p.isBeforeInsertion()) { + foundIndel = true; + Assert.assertEquals(p.getEventLength(), 2, "Wrong event length"); + Assert.assertEquals(p.getEventBases(), "CT", "Inserted bases are incorrect"); + break; + } + } + + } + + Assert.assertTrue(foundIndel,"Indel in pileup not found"); + } + 
+ @Test + public void testWholeIndelReadInIsolation() { + final int firstLocus = 44367789; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76)); + indelOnlyRead.setCigarString("76I"); + + List reads = Arrays.asList(indelOnlyRead); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, readAttributes); + + // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read + // and considers it to be an indel-containing read. + Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled"); + AlignmentContext alignmentContext = li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location."); + ReadBackedPileup basePileup = alignmentContext.getBasePileup(); + Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); + Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) do + * not negatively influence the ordering of the pileup. 
+ */ + @Test + public void testWholeIndelRead() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76); + leadingRead.setReadBases(Utils.dupBytes((byte)'A',76)); + leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + leadingRead.setCigarString("1M75I"); + + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + indelOnlyRead.setCigarString("76I"); + + SAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76); + fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76)); + fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76)); + fullMatchAfterIndel.setCigarString("75I1M"); + + List reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + int currentLocus = firstLocus; + int numAlignmentContextsFound = 0; + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect"); + + if(currentLocus == firstLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus); + } + else if(currentLocus == secondLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),2,"Wrong number of reads at locus " + currentLocus); + 
Assert.assertSame(readsAtLocus.get(0),indelOnlyRead,"indelOnlyRead absent from pileup at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(1),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus); + } + + currentLocus++; + numAlignmentContextsFound++; + } + + Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly + */ + @Test + public void testWholeIndelReadRepresentedTest() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1); + read1.setReadBases(Utils.dupBytes((byte) 'A', 1)); + read1.setBaseQualities(Utils.dupBytes((byte) '@', 1)); + read1.setCigarString("1I"); + + List reads = Arrays.asList(read1); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "A"); + } + + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10); + read2.setReadBases(Utils.dupBytes((byte) 'A', 10)); + read2.setBaseQualities(Utils.dupBytes((byte) '@', 10)); + read2.setCigarString("10I"); + + reads = Arrays.asList(read2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 
1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA"); + } + } + + //////////////////////////////////////////// + // comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////// + + private static final int IS_BEFORE_DELETED_BASE_FLAG = 1; + private static final int IS_BEFORE_DELETION_START_FLAG = 2; + private static final int IS_AFTER_DELETED_BASE_FLAG = 4; + private static final int IS_AFTER_DELETION_END_FLAG = 8; + private static final int IS_BEFORE_INSERTION_FLAG = 16; + private static final int IS_AFTER_INSERTION_FLAG = 32; + private static final int IS_NEXT_TO_SOFTCLIP_FLAG = 64; + + private static class LIBSTest { + + + final String cigar; + final int readLength; + final List offsets; + final List flags; + + private LIBSTest(final String cigar, final int readLength, final List offsets, final List flags) { + this.cigar = cigar; + this.readLength = readLength; + this.offsets = offsets; + this.flags = flags; + } + } + + @DataProvider(name = "LIBSTest") + public Object[][] createLIBSTestData() { + return new Object[][]{ + {new LIBSTest("1I", 1, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))}, + {new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))}, + //TODO -- uncomment these when LIBS is fixed + //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, + //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, + //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), 
Arrays.asList(IS_AFTER_INSERTION_FLAG))}, + {new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))}, + {new LIBSTest("1S1M", 2, Arrays.asList(1), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))} + }; + } + + @Test(dataProvider = "LIBSTest") + public void testLIBS(LIBSTest params) { + final int locus = 44367788; + + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength)); + read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength)); + read.setCigarString(params.cigar); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(Arrays.asList(read), createTestReadProperties()); + + int offset = 0; + while ( li.hasNext() ) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + + final int flag = params.flags.get(offset); + Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0); + Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0); + Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0); + Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0); + Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0); + Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0); + Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0); + + 
Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue()); + + offset++; + } + } + + //////////////////////////////////////////////// + // End comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////////// + + + /////////////////////////////////////// + // Read State Manager Tests // + /////////////////////////////////////// + + private class PerSampleReadStateManagerTest extends TestDataProvider { + private List readCountsPerAlignmentStart; + private List reads; + private List> recordStatesByAlignmentStart; + private int removalInterval; + + public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { + super(PerSampleReadStateManagerTest.class); + + this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; + this.removalInterval = removalInterval; + + reads = new ArrayList(); + recordStatesByAlignmentStart = new ArrayList>(); + + setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", + getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); + } + + public void run() { + LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList(), createTestReadProperties()); + LocusIteratorByStateExperimental.ReadStateManager readStateManager = + libs.new ReadStateManager(new ArrayList().iterator()); + LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = + readStateManager.new PerSampleReadStateManager(); + + makeReads(); + + for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { + perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + } + + // read state manager should have the right number of reads + Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); + + Iterator originalReadsIterator = reads.iterator(); + Iterator recordStateIterator = perSampleReadStateManager.iterator(); + int recordStateCount = 0; + int numReadStatesRemoved = 0; + + 
// Do a first-pass validation of the record state iteration by making sure we get back everything we + // put in, in the same order, doing any requested removals of read states along the way + while ( recordStateIterator.hasNext() ) { + LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); + recordStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + SAMRecord originalRead = originalReadsIterator.next(); + + // The read we get back should be literally the same read in memory as we put in + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + + // If requested, remove a read state every removalInterval states + if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { + recordStateIterator.remove(); + numReadStatesRemoved++; + } + } + + Assert.assertFalse(originalReadsIterator.hasNext()); + + // If we removed any read states, do a second pass through the read states to make sure the right + // states were removed + if ( numReadStatesRemoved > 0 ) { + Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); + + originalReadsIterator = reads.iterator(); + recordStateIterator = perSampleReadStateManager.iterator(); + int readCount = 0; + int readStateCount = 0; + + // Match record states with the reads that should remain after removal + while ( recordStateIterator.hasNext() ) { + LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); + readStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + + SAMRecord originalRead = originalReadsIterator.next(); + readCount++; + + if ( readCount % removalInterval == 0 ) { + originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded + readCount++; + } + + // The read we 
get back should be literally the same read in memory as we put in (after accounting for removals) + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + } + + Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); + } + + // Allow memory used by this test to be reclaimed + readCountsPerAlignmentStart = null; + reads = null; + recordStatesByAlignmentStart = null; + } + + private void makeReads() { + int alignmentStart = 1; + + for ( int readsThisStack : readCountsPerAlignmentStart ) { + ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); + ArrayList stackRecordStates = new ArrayList(); + + for ( SAMRecord read : stackReads ) { + stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read)); + } + + reads.addAll(stackReads); + recordStatesByAlignmentStart.add(stackRecordStates); + } + } + } + + @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") + public Object[][] createPerSampleReadStateManagerTests() { + for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), + Arrays.asList(2), + Arrays.asList(10), + Arrays.asList(1, 1), + Arrays.asList(2, 2), + Arrays.asList(10, 10), + Arrays.asList(1, 10), + Arrays.asList(10, 1), + Arrays.asList(1, 1, 1), + Arrays.asList(2, 2, 2), + Arrays.asList(10, 10, 10), + Arrays.asList(1, 1, 1, 1, 1, 1), + Arrays.asList(10, 10, 10, 10, 10, 10), + Arrays.asList(1, 2, 10, 1, 2, 10) + ) ) { + + for ( int removalInterval : Arrays.asList(0, 2, 3) ) { + new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); + } + } + + return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); + } + + @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") + public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { + logger.warn("Running test: " + test); + + 
test.run(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java new file mode 100644 index 000000000..5b052454a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java @@ -0,0 +1,166 @@ +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.Test; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileHeader; + +import java.util.*; + +/** + * Basic tests to prove the integrity of the reservoir downsampler. + * At the moment, always run tests on SAM records as that's the task + * for which the downsampler was conceived. + * + * @author mhanna + * @version 0.1 + */ +public class LegacyReservoirDownsamplerUnitTest { + private static final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,200); + + + @Test + public void testEmptyIterator() { + ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + Assert.assertTrue(downsampler.isEmpty(),"Downsampler is not empty but should be."); + } + + @Test + public void testOneElementWithPoolSizeOne() { + List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + downsampler.addAll(reads); + + Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); + Collection batchedReads = downsampler.getDownsampledContents(); + Assert.assertEquals(batchedReads.size(), 1, "Downsampler is returning the wrong number of reads"); + Assert.assertSame(batchedReads.iterator().next(), reads.get(0), "Downsampler is returning an incorrect read"); + } + + @Test + public void testOneElementWithPoolSizeGreaterThanOne() { 
+ List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + downsampler.addAll(reads); + + Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); + Collection batchedReads = downsampler.getDownsampledContents(); + Assert.assertEquals(batchedReads.size(), 1, "Downsampler is returning the wrong number of reads"); + Assert.assertSame(batchedReads.iterator().next(), reads.get(0), "Downsampler is returning an incorrect read"); + + } + + @Test + public void testPoolFilledPartially() { + List reads = new ArrayList(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + downsampler.addAll(reads); + + Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); + List batchedReads = new ArrayList(downsampler.getDownsampledContents()); + Assert.assertEquals(batchedReads.size(), 3, "Downsampler is returning the wrong number of reads"); + + Assert.assertSame(batchedReads.get(0), reads.get(0), "Downsampler read 1 is incorrect"); + Assert.assertSame(batchedReads.get(1), reads.get(1), "Downsampler read 2 is incorrect"); + Assert.assertSame(batchedReads.get(2), reads.get(2), "Downsampler read 3 is incorrect"); + } + + @Test + public void testPoolFilledExactly() { + List reads = new ArrayList(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); + 
ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + downsampler.addAll(reads); + + Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); + List batchedReads = new ArrayList(downsampler.getDownsampledContents()); + Assert.assertEquals(batchedReads.size(), 5, "Downsampler is returning the wrong number of reads"); + Assert.assertSame(batchedReads.iterator().next(), reads.get(0), "Downsampler is returning an incorrect read"); + + Assert.assertSame(batchedReads.get(0), reads.get(0), "Downsampler read 1 is incorrect"); + Assert.assertSame(batchedReads.get(1), reads.get(1), "Downsampler read 2 is incorrect"); + Assert.assertSame(batchedReads.get(2), reads.get(2), "Downsampler read 3 is incorrect"); + Assert.assertSame(batchedReads.get(3), reads.get(3), "Downsampler read 4 is incorrect"); + Assert.assertSame(batchedReads.get(4), reads.get(4), "Downsampler read 5 is incorrect"); + } + + @Test + public void testLargerPileWithZeroElementPool() { + List reads = new ArrayList(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(0); + downsampler.addAll(reads); + + Assert.assertTrue(downsampler.isEmpty(),"Downsampler isn't empty but should be"); + List batchedReads = new ArrayList(downsampler.getDownsampledContents()); + Assert.assertEquals(batchedReads.size(), 0, "Downsampler is returning the wrong number of reads"); + } + + @Test + public void testLargerPileWithSingleElementPool() { + List reads = new ArrayList(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); + 
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + downsampler.addAll(reads); + + Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); + List batchedReads = new ArrayList(downsampler.getDownsampledContents()); + Assert.assertEquals(batchedReads.size(), 1, "Downsampler is returning the wrong number of reads"); + Assert.assertTrue(reads.contains(batchedReads.get(0)),"Downsampler is returning a bad read."); + } + + @Test + public void testFillingAcrossLoci() { + List reads = new ArrayList(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + downsampler.addAll(reads); + + Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); + List batchedReads = new ArrayList(downsampler.getDownsampledContents()); + Assert.assertEquals(batchedReads.size(), 1, "Downsampler is returning the wrong number of reads"); + Assert.assertEquals(batchedReads.get(0), reads.get(0), "Downsampler is returning an incorrect read."); + + reads.clear(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,2,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,2,76)); + + downsampler.clear(); + downsampler.addAll(reads); + + Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); + batchedReads = new ArrayList(downsampler.getDownsampledContents()); + Assert.assertEquals(batchedReads.size(), 2, "Downsampler is returning the wrong number of reads"); + Assert.assertEquals(batchedReads.get(0), reads.get(0), "Downsampler is returning an incorrect read."); + Assert.assertEquals(batchedReads.get(1), reads.get(1), "Downsampler is returning an incorrect read."); + + reads.clear(); + 
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,3,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,3,76)); + + downsampler.clear(); + downsampler.addAll(reads); + + Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); + batchedReads = new ArrayList(downsampler.getDownsampledContents()); + Assert.assertEquals(batchedReads.size(), 2, "Downsampler is returning the wrong number of reads"); + Assert.assertEquals(batchedReads.get(0), reads.get(0), "Downsampler is returning an incorrect read."); + Assert.assertEquals(batchedReads.get(1), reads.get(1), "Downsampler is returning an incorrect read."); + } + +} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java new file mode 100644 index 000000000..b3365c13c --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java @@ -0,0 +1,71 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingDeque; + +/** + * UnitTests for the InputProducer + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class InputProducerUnitTest extends BaseTest { + @DataProvider(name = "InputProducerTest") + public Object[][] createInputProducerTest() { + List tests = new ArrayList(); + + for ( final int nElements : Arrays.asList(0, 1, 10, 100, 1000, 10000, 100000) ) { + for ( final int queueSize : Arrays.asList(1, 10, 100) ) { + tests.add(new Object[]{ nElements, queueSize }); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "InputProducerTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) + public void testInputProducer(final int nElements, final int queueSize) throws InterruptedException { + final List elements = new ArrayList(nElements); + for ( int i = 0; i < nElements; i++ ) elements.add(i); + + final LinkedBlockingDeque.InputValue> readQueue = + new LinkedBlockingDeque.InputValue>(queueSize); + + final InputProducer ip = new InputProducer(elements.iterator(), null, readQueue); + + final ExecutorService es = Executors.newSingleThreadExecutor(); + es.submit(ip); + + int lastValue = -1; + int nRead = 0; + while ( true ) { + final int observedQueueSize = readQueue.size(); + Assert.assertTrue(observedQueueSize <= queueSize, + "Reader is enqueuing more elements " + observedQueueSize + " than allowed " + queueSize); + + final InputProducer.InputValue value = readQueue.take(); + if ( value.isLast() ) { + Assert.assertEquals(nRead, nElements, "Number of input values " + nRead + " not all that are expected " + nElements); + Assert.assertEquals(readQueue.size(), 0, "Last queue element found but queue contains more values!"); + break; + } else { + Assert.assertTrue(lastValue < value.getValue(), "Read values coming out of order!"); + final int expected = lastValue + 1; + Assert.assertEquals((int)value.getValue(), expected, "Value observed " + value.getValue() + " not equal to the expected value " + expected); + nRead++; + lastValue = value.getValue(); + } + } + } +} diff --git 
a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java new file mode 100644 index 000000000..47dcc1d5e --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -0,0 +1,182 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.apache.log4j.BasicConfigurator; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +/** + * UnitTests for the NanoScheduler + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. + */ +public class NanoSchedulerUnitTest extends BaseTest { + public static final int NANO_SCHEDULE_MAX_RUNTIME = 60000; + + private static class Map2x implements NSMapFunction { + @Override public Integer apply(Integer input) { return input * 2; } + } + + private static class ReduceSum implements NSReduceFunction { + int prevOne = Integer.MIN_VALUE; + + @Override public Integer apply(Integer one, Integer sum) { + Assert.assertTrue(prevOne < one, "Reduce came in out of order. 
Prev " + prevOne + " cur " + one); + return one + sum; + } + } + + private static class ProgressCallback implements NSProgressFunction { + int callBacks = 0; + + @Override + public void progress(Integer lastMapInput) { + callBacks++; + } + } + + + private static int sum2x(final int start, final int end) { + int sum = 0; + for ( int i = start; i < end; i++ ) + sum += 2 * i; + return sum; + } + + private static class NanoSchedulerBasicTest extends TestDataProvider { + final int bufferSize, nThreads, start, end, expectedResult; + + public NanoSchedulerBasicTest(final int bufferSize, final int nThreads, final int start, final int end) { + super(NanoSchedulerBasicTest.class); + this.bufferSize = bufferSize; + this.nThreads = nThreads; + this.start = start; + this.end = end; + this.expectedResult = sum2x(start, end); + setName(String.format("%s nt=%d buf=%d start=%d end=%d sum=%d", + getClass().getSimpleName(), nThreads, bufferSize, start, end, expectedResult)); + } + + public Iterator makeReader() { + final List ints = new ArrayList(); + for ( int i = start; i < end; i++ ) + ints.add(i); + return ints.iterator(); + } + + public int nExpectedCallbacks() { + int nElements = Math.max(end - start, 0); + return nElements / bufferSize; + } + + public Map2x makeMap() { return new Map2x(); } + public Integer initReduce() { return 0; } + public ReduceSum makeReduce() { return new ReduceSum(); } + } + + static NanoSchedulerBasicTest exampleTest = null; + @DataProvider(name = "NanoSchedulerBasicTest") + public Object[][] createNanoSchedulerBasicTest() { + for ( final int bufferSize : Arrays.asList(1, 10, 1000, 1000000) ) { + for ( final int nt : Arrays.asList(1, 2, 4) ) { + for ( final int start : Arrays.asList(0) ) { + for ( final int end : Arrays.asList(0, 1, 2, 11, 10000, 100000) ) { + exampleTest = new NanoSchedulerBasicTest(bufferSize, nt, start, end); + } + } + } + } + + return NanoSchedulerBasicTest.getTests(NanoSchedulerBasicTest.class); + } + + @Test(enabled = true, 
dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME) + public void testSingleThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { + logger.warn("Running " + test); + if ( test.nThreads == 1 ) + testNanoScheduler(test); + } + + @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME, dependsOnMethods = "testSingleThreadedNanoScheduler") + public void testMultiThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { + logger.warn("Running " + test); + if ( test.nThreads >= 1 ) + testNanoScheduler(test); + } + + private void testNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { + final NanoScheduler nanoScheduler = + new NanoScheduler(test.bufferSize, test.nThreads); + + final ProgressCallback callback = new ProgressCallback(); + nanoScheduler.setProgressFunction(callback); + + Assert.assertEquals(nanoScheduler.getInputBufferSize(), test.bufferSize, "inputBufferSize argument"); + Assert.assertEquals(nanoScheduler.getnThreads(), test.nThreads, "nThreads argument"); + + final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); + Assert.assertNotNull(sum); + Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); + + Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected. 
Expected at least " + test.nExpectedCallbacks() + " but saw only " + callback.callBacks); + nanoScheduler.shutdown(); + } + + @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) + public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { + if ( test.bufferSize > 1) { + logger.warn("Running " + test); + + final NanoScheduler nanoScheduler = + new NanoScheduler(test.bufferSize, test.nThreads); + + // test reusing the scheduler + for ( int i = 0; i < 10; i++ ) { + final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); + Assert.assertNotNull(sum); + Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); + } + + nanoScheduler.shutdown(); + } + } + + @Test(timeOut = NANO_SCHEDULE_MAX_RUNTIME) + public void testShutdown() throws InterruptedException { + final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); + Assert.assertFalse(nanoScheduler.isShutdown(), "scheduler should be alive"); + nanoScheduler.shutdown(); + Assert.assertTrue(nanoScheduler.isShutdown(), "scheduler should be dead"); + } + + @Test(expectedExceptions = IllegalStateException.class, timeOut = NANO_SCHEDULE_MAX_RUNTIME) + public void testShutdownExecuteFailure() throws InterruptedException { + final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); + nanoScheduler.shutdown(); + nanoScheduler.execute(exampleTest.makeReader(), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); + } + + public static void main(String [ ] args) { + org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); + BasicConfigurator.configure(); + logger.setLevel(org.apache.log4j.Level.DEBUG); + + final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); + final 
NanoScheduler nanoScheduler = + new NanoScheduler(test.bufferSize, test.nThreads); + nanoScheduler.setDebug(true); + + final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); + System.out.printf("Sum = %d, expected =%d%n", sum, test.expectedResult); + nanoScheduler.shutdown(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java new file mode 100644 index 000000000..61d1330bc --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java @@ -0,0 +1,94 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.*; + +/** + * UnitTests for the InputProducer + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class ReducerThreadUnitTest extends BaseTest { + @DataProvider(name = "ReducerThreadTest") + public Object[][] createReducerThreadTest() { + List tests = new ArrayList(); + + for ( final int nElements : Arrays.asList(0, 1, 10, 100, 1000, 10000, 100000) ) { + tests.add(new Object[]{ nElements }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "ReducerThreadTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) + public void testReducerThreadTest(final int nElements) throws Exception { + List values = new ArrayList(nElements); + List jobIDs = new ArrayList(nElements); + for ( int i = 0; i < nElements; i++ ) { + values.add(i); + jobIDs.add(i); + } + + runTests(values, jobIDs); + } + + @Test(enabled = true, timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME, expectedExceptions = ExecutionException.class) + public void testReducerThreadTestByJobOrder() throws Exception { + runTests(Arrays.asList(0, 1, 2), Arrays.asList(1, 3, 2)); + } + + private void runTests( final List mapValues, final List jobIDs) throws Exception { + final LinkedBlockingDeque>> mapResultsQueue = + new LinkedBlockingDeque>>(mapValues.size()+1); + + for ( int i = 0; i < mapValues.size(); i++ ) { + final int value = mapValues.get(i); + final int jobID = jobIDs.get(i); + final MapResult mapResult = new MapResult(value, jobID); + mapResultsQueue.add(new FutureValue>(mapResult)); + } + mapResultsQueue.add(new FutureValue>(new MapResult())); + + final ReduceSumTest reduce = new ReduceSumTest(mapResultsQueue); + final ReducerThread thread + = new ReducerThread(reduce, null, 0, mapResultsQueue); + + final ExecutorService es = Executors.newSingleThreadExecutor(); + final Future value = es.submit(thread); + value.get(); + + Assert.assertEquals(reduce.nRead, mapValues.size()); + } + + public class ReduceSumTest implements NSReduceFunction { + final LinkedBlockingDeque>> mapResultsQueue; + int nRead = 0; + int lastValue = -1; + + public 
ReduceSumTest(LinkedBlockingDeque>> mapResultsQueue) { + this.mapResultsQueue = mapResultsQueue; + } + + @Override public Integer apply(Integer one, Integer sum) { + Assert.assertTrue(lastValue < one, "Reduce came in out of order. Prev " + lastValue + " cur " + one); + + Assert.assertTrue(lastValue < one, "Read values coming out of order!"); + final int expected = lastValue + 1; + Assert.assertEquals((int)one, expected, "Value observed " + one + " not equal to the expected value " + expected); + nRead++; + lastValue = expected; + + return one + sum; + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java new file mode 100644 index 000000000..74626d031 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java @@ -0,0 +1,161 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +import org.broadinstitute.sting.BaseTest; + +public class ArtificialSingleSampleReadStreamUnitTest extends BaseTest { + + private static class ArtificialSingleSampleReadStreamTest extends TestDataProvider { + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; + + public ArtificialSingleSampleReadStreamTest( ArtificialSingleSampleReadStream stream ) { + super(ArtificialSingleSampleReadStreamTest.class); + + this.stream = stream; + + setName(String.format("%s: numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + getClass().getSimpleName(), + 
stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); + } + + public void run() { + streamAnalyzer= new ArtificialSingleSampleReadStreamAnalyzer(stream); + + streamAnalyzer.analyze(stream); + + // Check whether the observed properties of the stream match its nominal properties + streamAnalyzer.validate(); + } + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamTestDataProvider") + public Object[][] createArtificialSingleSampleReadStreamTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + + GenomeAnalysisEngine.resetRandomGenerator(); + + // brute force testing! 
+ for ( int numContigs = 0; numContigs <= 2; numContigs++ ) { + for ( int stacksPerContig = 0; stacksPerContig <= 2; stacksPerContig++ ) { + for ( int minReadsPerStack = 1; minReadsPerStack <= 2; minReadsPerStack++ ) { + for ( int maxReadsPerStack = 1; maxReadsPerStack <= 3; maxReadsPerStack++ ) { + for ( int minDistanceBetweenStacks = 1; minDistanceBetweenStacks <= 2; minDistanceBetweenStacks++ ) { + for ( int maxDistanceBetweenStacks = 1; maxDistanceBetweenStacks <= 3; maxDistanceBetweenStacks++ ) { + for ( int minReadLength = 1; minReadLength <= 2; minReadLength++ ) { + for ( int maxReadLength = 1; maxReadLength <= 3; maxReadLength++ ) { + for ( int numUnmappedReads = 0; numUnmappedReads <= 2; numUnmappedReads++ ) { + // Only test sane combinations here + if ( minReadsPerStack <= maxReadsPerStack && + minDistanceBetweenStacks <= maxDistanceBetweenStacks && + minReadLength <= maxReadLength && + ((numContigs > 0 && stacksPerContig > 0) || (numContigs == 0 && stacksPerContig == 0)) ) { + + new ArtificialSingleSampleReadStreamTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads)); + } + } + } + } + } + } + } + } + } + } + + return ArtificialSingleSampleReadStreamTest.getTests(ArtificialSingleSampleReadStreamTest.class); + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamTestDataProvider") + public void testArtificialSingleSampleReadStream( ArtificialSingleSampleReadStreamTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider") + public Object[][] createInvalidArgumentsTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String readGroupID = "testReadGroup"; + 
header.addReadGroup(new SAMReadGroupRecord(readGroupID)); + + return new Object[][] { + {"testNullHeader", null, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNullReadGroup", header, null, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidReadGroup", header, "foo", 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumContigs", header, readGroupID, -1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumStacksPerContig", header, readGroupID, 1, -1, 1, 2, 1, 2, 1, 2, 0}, + {"test0ContigsNon0StacksPerContig", header, readGroupID, 0, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNon0Contigs0StacksPerContig", header, readGroupID, 1, 0, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMinReadsPerStack", header, readGroupID, 1, 1, -1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMaxReadsPerStack", header, readGroupID, 1, 1, 1, -2, 1, 2, 1, 2, 0}, + {"testInvalidMinDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, -1, 2, 1, 2, 0}, + {"testInvalidMaxDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, 1, -2, 1, 2, 0}, + {"testInvalidMinReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, -1, 2, 0}, + {"testInvalidMaxReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, -2, 0}, + {"testInvalidReadsPerStackRange", header, readGroupID, 1, 1, 2, 1, 1, 2, 1, 2, 0}, + {"testInvalidDistanceBetweenStacksRange", header, readGroupID, 1, 1, 1, 2, 2, 1, 1, 2, 0}, + {"testInvalidReadLengthRange", header, readGroupID, 1, 1, 1, 2, 1, 2, 2, 1, 0}, + {"testInvalidNumUnmappedReads", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, -1}, + }; + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider", + expectedExceptions = ReviewedStingException.class) + public void testInvalidArguments( String testName, + SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + + logger.warn("Running test: " + 
testName); + + ArtificialSingleSampleReadStream stream = new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + numStacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java new file mode 100755 index 000000000..7381bebc4 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -0,0 +1,184 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +package org.broadinstitute.sting.utils.threading; + +import org.apache.log4j.Priority; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +/** + * Tests for the state monitoring thread factory. + */ +public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { + // the duration of the tests -- 100 ms is tolerable given the number of tests we are doing + private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 100000; + private final static int MAX_THREADS = 4; + final static Object GLOBAL_LOCK = new Object(); + + private class StateTest extends TestDataProvider { + private final double TOLERANCE = 0.1; // willing to tolerate a 10% error + + final List statesForThreads; + + public StateTest(final List statesForThreads) { + super(StateTest.class); + this.statesForThreads = statesForThreads; + setName("StateTest " + Utils.join(",", statesForThreads)); + } + + public List getStatesForThreads() { + return statesForThreads; + } + + public int getNStates() { return statesForThreads.size(); } + + public double maxStatePercent(final EfficiencyMonitoringThreadFactory.State state) { return 100*(fraction(state) + TOLERANCE); } + public double minStatePercent(final EfficiencyMonitoringThreadFactory.State state) { return 100*(fraction(state) - TOLERANCE); } + + private double fraction(final EfficiencyMonitoringThreadFactory.State state) { + return Collections.frequency(statesForThreads, state) / (1.0 * statesForThreads.size()); + } + } + + /** + * Test helper threading class that puts the 
thread into RUNNING, BLOCKED, or WAITING state as + * requested for input argument + */ + private static class StateTestThread implements Callable { + private final EfficiencyMonitoringThreadFactory.State stateToImplement; + + private StateTestThread(final EfficiencyMonitoringThreadFactory.State stateToImplement) { + this.stateToImplement = stateToImplement; + } + + @Override + public Double call() throws Exception { + switch ( stateToImplement ) { + case USER_CPU: + // do some work until we get to THREAD_TARGET_DURATION_IN_MILLISECOND + double sum = 0.0; + final long startTime = System.currentTimeMillis(); + for ( int i = 1; System.currentTimeMillis() - startTime < (THREAD_TARGET_DURATION_IN_MILLISECOND - 1); i++ ) { + sum += Math.log10(i); + } + return sum; + case WAITING: + Thread.currentThread().sleep(THREAD_TARGET_DURATION_IN_MILLISECOND); + return 0.0; + case BLOCKING: + if ( EfficiencyMonitoringThreadFactory.DEBUG ) logger.warn("Blocking..."); + synchronized (GLOBAL_LOCK) { + // the GLOBAL_LOCK must be held by the unit test itself for this to properly block + if ( EfficiencyMonitoringThreadFactory.DEBUG ) logger.warn(" ... 
done blocking"); + } + return 0.0; + case WAITING_FOR_IO: + // TODO -- implement me + // shouldn't ever get here, throw an exception + throw new ReviewedStingException("WAITING_FOR_IO testing currently not implemented, until we figure out how to force a system call block"); + default: + throw new ReviewedStingException("Unexpected thread test state " + stateToImplement); + } + } + } + + @DataProvider(name = "StateTest") + public Object[][] createStateTest() { + for ( final int nThreads : Arrays.asList(3) ) { + //final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.WAITING_FOR_IO); + final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.USER_CPU, EfficiencyMonitoringThreadFactory.State.WAITING, EfficiencyMonitoringThreadFactory.State.BLOCKING); + //final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.values()); + for (final List states : Utils.makePermutations(allStates, nThreads, true) ) { + //if ( Collections.frequency(states, Thread.State.BLOCKED) > 0) + new StateTest(states); + } + } + + return StateTest.getTests(StateTest.class); + } + + @Test(enabled = true, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) + public void testStateTest(final StateTest test) throws InterruptedException { + // allows us to test blocking + final EfficiencyMonitoringThreadFactory factory = new EfficiencyMonitoringThreadFactory(test.getNStates()); + final ExecutorService threadPool = Executors.newFixedThreadPool(test.getNStates(), factory); + + logger.warn("Running " + test); + synchronized (GLOBAL_LOCK) { + //logger.warn(" Have lock"); + for ( final EfficiencyMonitoringThreadFactory.State threadToRunState : test.getStatesForThreads() ) + threadPool.submit(new StateTestThread(threadToRunState)); + + // lock has to be here for the whole running of the activeThreads but end before the sleep so the blocked activeThreads + // can block for their allotted time + 
threadPool.shutdown(); + Thread.sleep(THREAD_TARGET_DURATION_IN_MILLISECOND); + } + //logger.warn(" Releasing lock"); + threadPool.awaitTermination(10, TimeUnit.SECONDS); + //logger.warn(" done awaiting termination"); + //logger.warn(" waiting for all activeThreads to complete"); + factory.waitForAllThreadsToComplete(); + //logger.warn(" done waiting for activeThreads"); + + // make sure we counted everything properly + final long totalTime = factory.getTotalTime(); + final long minTime = (long)(THREAD_TARGET_DURATION_IN_MILLISECOND * 0.5) * test.getNStates(); + final long maxTime = (long)(THREAD_TARGET_DURATION_IN_MILLISECOND * 1.5) * test.getNStates(); + //logger.warn("Testing total time"); + Assert.assertTrue(totalTime >= minTime, "Factory results not properly accumulated: totalTime = " + totalTime + " < minTime = " + minTime); + Assert.assertTrue(totalTime <= maxTime, "Factory results not properly accumulated: totalTime = " + totalTime + " > maxTime = " + maxTime); + + for (final EfficiencyMonitoringThreadFactory.State state : EfficiencyMonitoringThreadFactory.State.values() ) { + final double min = test.minStatePercent(state); + final double max = test.maxStatePercent(state); + final double obs = factory.getStatePercent(state); +// logger.warn(" Checking " + state +// + " min " + String.format("%.2f", min) +// + " max " + String.format("%.2f", max) +// + " obs " + String.format("%.2f", obs) +// + " factor = " + factory); + Assert.assertTrue(obs >= min, "Too little time spent in state " + state + " obs " + obs + " min " + min); + Assert.assertTrue(obs <= max, "Too much time spent in state " + state + " obs " + obs + " max " + min); + } + + // we actually ran the expected number of activeThreads + Assert.assertEquals(factory.getNThreadsCreated(), test.getNStates()); + + // should be called to ensure we don't format / NPE on output + factory.printUsageInformation(logger, Priority.WARN); + } +} \ No newline at end of file From 
4a84ff4fcecadc2db57d265876bee32d034e6fc4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 8 Sep 2012 19:45:56 -0400 Subject: [PATCH 07/19] Fix a nasty bug in reading GATK reports with a single line -- Old version would break during reading with (as usual) a cryptic error message -- Fixed by avoiding collapsing into a single vector type from a matrix when you subset to a single row. I believe this code confirms thats R is truly the worst programming language ever --- .../sting/utils/R/gsalib/R/gsa.read.gatkreport.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R index 4c228ccb4..eba94c0cb 100644 --- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R @@ -111,7 +111,13 @@ gsa.read.gatkreportv1 <- function(lines) { headerRowCount = -1; finishTable <- function() { - .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows[1:rowCount,], tableEnv); + if ( rowCount == 1 ) + # good I hate R. Work around to avoid collapsing into an unstructured vector when + # there's only 1 row + sub <- t(as.matrix(tableRows[1:rowCount,])) + else + sub <- tableRows[1:rowCount,] + .gsa.assignGATKTableToEnvironment(tableName, tableHeader, sub, tableEnv); } for (line in lines) { From 2e94a0a201f1d024207134b2eed36068300aab0c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 8 Sep 2012 20:17:15 -0400 Subject: [PATCH 08/19] Refactor TraversalEngine to extract the progress meter functions -- Previously these core progress metering functions were all in TraversalEngine, and available to subclasses like TraverseLoci via inheritance. 
The problem here is that the upcoming data threads x cpu threads parallelism requires one master copy of the progress metering shared among all traversals, but multiple instantiations of traverse engines themselves. -- Because the progress metering code was horrible anyway, I've refactored and vastly cleaned up and simplified all of these capabilities into TraversalProgressMeter class. I've simplified down the classes it uses to work (STILL SOME TODOs in there) so that it doesn't reach into the core GATK engine all the time. It should be possible to write some nice tests for it now. By making it its own class, it can protect itself from multi-threaded access with a single synchronized printProgress function instead of carrying around multiple lock objects as before -- Cleaned up the start up of the progress meter. It's now handled when the meter is created, so each micro scheduler doesn't have to deal with proper initialization timing any longer -- Simplified and made clear the interface for shutting down the traversal engines. There's now a shutdown method in TraversalEngine that's called once by the MicroScheduler when the entire traversal is over. Nano traversals now properly shut down (was a subtle bug I uncovered here). The printing of on traversal done metering is now handled by MicroScheduler -- The MicroScheduler holds the single master copy of the progress meter, and doles it out to the TraversalEngines (currently 1 but in future commit there will be N). -- Added a nice function to GenomeAnalysisEngine that returns the regions we will be processing, either the intervals requested or the whole genome. Useful for progress meter but also probably for other infrastructure as well -- Remove a lot of the sh*ting Bean interface getting and setting in MicroScheduler that's no longer useful. The generic bean is just a shell interface with nothing in it. -- By removing a lot of these bean accessors and setters many things are now final that used to be dynamic. 
--- .../sting/gatk/GenomeAnalysisEngine.java | 31 +- .../executive/HierarchicalMicroScheduler.java | 1 - .../HierarchicalMicroSchedulerMBean.java | 2 +- .../gatk/executive/LinearMicroScheduler.java | 3 - .../sting/gatk/executive/MicroScheduler.java | 55 +-- .../gatk/executive/MicroSchedulerMBean.java | 24 +- .../sting/gatk/executive/ShardTraverser.java | 2 - .../gatk/traversals/TraversalEngine.java | 309 ++------------- .../traversals/TraversalProgressMeter.java | 367 ++++++++++++++++++ .../traversals/TraverseActiveRegions.java | 2 +- .../gatk/traversals/TraverseDuplicates.java | 2 +- .../gatk/traversals/TraverseLociBase.java | 2 +- .../gatk/traversals/TraverseLociNano.java | 3 +- .../gatk/traversals/TraverseReadPairs.java | 2 +- .../sting/gatk/traversals/TraverseReads.java | 2 +- .../gatk/traversals/TraverseReadsNano.java | 5 +- .../utils/sam/ArtificialReadsTraversal.java | 2 +- .../traversals/TraverseReadsUnitTest.java | 2 - 18 files changed, 441 insertions(+), 375 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalProgressMeter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 3ce8a92b7..516ea8451 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk; +import com.google.java.contract.Ensures; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.SAMFileHeader; @@ -682,14 +683,14 @@ public class GenomeAnalysisEngine { // if include argument isn't given, create new set of all possible intervals - Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( + final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( 
this.referenceDataSource, argCollection.intervals, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, argCollection.excludeIntervals); - GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); - GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); + final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); + final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); // if no exclude arguments, can return parseIntervalArguments directly if ( excludeSortedSet == null ) @@ -700,13 +701,15 @@ public class GenomeAnalysisEngine { intervals = includeSortedSet.subtractRegions(excludeSortedSet); // logging messages only printed when exclude (-XL) arguments are given - long toPruneSize = includeSortedSet.coveredSize(); - long toExcludeSize = excludeSortedSet.coveredSize(); - long intervalSize = intervals.coveredSize(); + final long toPruneSize = includeSortedSet.coveredSize(); + final long toExcludeSize = excludeSortedSet.coveredSize(); + final long intervalSize = intervals.coveredSize(); logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); } + + logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize())); } /** @@ -981,6 +984,22 @@ public class GenomeAnalysisEngine { return this.intervals; } + /** + * Get the list of regions of the genome being processed. If the user + * requested specific intervals, return those, otherwise return regions + * corresponding to the entire genome. Never returns null. 
+ * + * @return a non-null set of intervals being processed + */ + @Ensures("result != null") + public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() { + if ( getIntervals() == null ) + // if we don't have any intervals defined, create intervals from the reference itself + return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary()); + else + return getIntervals(); + } + /** * Gets the list of filters employed by this engine. * @return Collection of filters (actual instances) used by this engine. diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index f1d2f7b5b..486e83e60 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -186,7 +186,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar outputTracker.bypassThreadLocalStorage(true); try { walker.onTraversalDone(result); - printOnTraversalDone(result); } finally { outputTracker.bypassThreadLocalStorage(false); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java index 530285db0..87d0ad721 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java @@ -16,7 +16,7 @@ package org.broadinstitute.sting.gatk.executive; * An interface for retrieving runtime statistics about how the hierarchical * microscheduler is behaving. 
*/ -public interface HierarchicalMicroSchedulerMBean extends MicroSchedulerMBean { +public interface HierarchicalMicroSchedulerMBean { /** * How many tree reduces are waiting in the tree reduce queue? * @return Total number of reduces waiting in the tree reduce queue? diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index ceb4a6f9b..697e908fd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -60,7 +60,6 @@ public class LinearMicroScheduler extends MicroScheduler { boolean done = walker.isDone(); int counter = 0; - traversalEngine.startTimersIfNecessary(); for (Shard shard : shardStrategy ) { if ( done || shard == null ) // we ran out of shards that aren't owned break; @@ -95,8 +94,6 @@ public class LinearMicroScheduler extends MicroScheduler { Object result = accumulator.finishTraversal(); - printOnTraversalDone(result); - outputTracker.close(); cleanup(); executionIsDone(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index c6ef9acf1..0e8208594 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -44,6 +44,7 @@ import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import javax.management.JMException; import javax.management.MBeanServer; import javax.management.ObjectName; +import java.io.File; import java.lang.management.ManagementFactory; import java.util.Collection; @@ -89,6 +90,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ ThreadEfficiencyMonitor threadEfficiencyMonitor = null; + final TraversalProgressMeter 
progressMeter; + /** * MicroScheduler factory function. Create a microscheduler appropriate for reducing the * selected walker. @@ -170,9 +173,12 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { traversalEngine = new TraverseActiveRegions(); } else { throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); - } + } - traversalEngine.initialize(engine); + final File progressLogFile = engine.getArguments() == null ? null : engine.getArguments().performanceLog; + this.progressMeter = new TraversalProgressMeter(engine.getCumulativeMetrics(), progressLogFile, + traversalEngine.getTraversalUnits(), engine.getRegionsOfGenomeBeingProcessed()); + traversalEngine.initialize(engine, progressMeter); // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. // To get around this limitation and since we have no job identifier at this point, register a simple counter that @@ -231,18 +237,15 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { return (!reads.isEmpty()) ? reads.seek(shard) : new NullSAMIterator(); } - /** - * Print summary information for the analysis. - * @param sum The final reduce output. - */ - protected void printOnTraversalDone(Object sum) { - traversalEngine.printOnTraversalDone(); - } - /** * Must be called by subclasses when execute is done */ protected void executionIsDone() { + progressMeter.printOnDone(); + + // TODO -- generalize to all local thread copies + traversalEngine.shutdown(); + // Print out the threading efficiency of this HMS, if state monitoring is enabled if ( threadEfficiencyMonitor != null ) { // include the master thread information @@ -269,38 +272,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ public IndexedFastaSequenceFile getReference() { return reference; } - /** - * Gets the filename to which performance data is currently being written. 
- * @return Filename to which performance data is currently being written. - */ - public String getPerformanceLogFileName() { - return traversalEngine.getPerformanceLogFileName(); - } - - /** - * Set the filename of the log for performance. If set, - * @param fileName filename to use when writing performance data. - */ - public void setPerformanceLogFileName(String fileName) { - traversalEngine.setPerformanceLogFileName(fileName); - } - - /** - * Gets the frequency with which performance data is written. - * @return Frequency, in seconds, of performance log writes. - */ - public long getPerformanceProgressPrintFrequencySeconds() { - return traversalEngine.getPerformanceProgressPrintFrequencySeconds(); - } - - /** - * How often should the performance log message be written? - * @param seconds number of seconds between messages indicating performance frequency. - */ - public void setPerformanceProgressPrintFrequencySeconds(long seconds) { - traversalEngine.setPerformanceProgressPrintFrequencySeconds(seconds); - } - protected void cleanup() { try { mBeanServer.unregisterMBean(mBeanName); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java index e510822b8..8be6b0b62 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java @@ -31,27 +31,5 @@ package org.broadinstitute.sting.gatk.executive; * To change this template use File | Settings | File Templates. */ public interface MicroSchedulerMBean { - /** - * Gets the filename to which performance data is currently being written. - * @return Filename to which performance data is currently being written. - */ - public String getPerformanceLogFileName(); - - /** - * Set the filename of the log for performance. If set, - * @param fileName filename to use when writing performance data. 
- */ - public void setPerformanceLogFileName(String fileName); - - /** - * Gets the frequency with which performance data is written. - * @return Frequency, in seconds, of performance log writes. - */ - public long getPerformanceProgressPrintFrequencySeconds(); - - /** - * How often should the performance log message be written? - * @param seconds number of seconds between messages indicating performance frequency. - */ - public void setPerformanceProgressPrintFrequencySeconds(long seconds); + // has nothing because we don't have anything we currently track } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index aefa9c12d..790c6b3ed 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -10,7 +10,6 @@ import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; /** * User: hanna * Date: Apr 29, 2009 @@ -56,7 +55,6 @@ public class ShardTraverser implements Callable { public Object call() { try { - traversalEngine.startTimersIfNecessary(); final long startTime = System.currentTimeMillis(); Object accumulator = walker.reduceInit(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 8c617e4dc..159343bf8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -30,66 +30,28 @@ import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import 
org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; public abstract class TraversalEngine,ProviderType extends ShardDataProvider> { /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraversalEngine.class); - // Time in milliseconds since we initialized this engine - private static final int HISTORY_WINDOW_SIZE = 50; - - /** lock object to sure updates to history are consistent across threads */ - private static final Object lock = new Object(); - LinkedList history = new LinkedList(); - - /** We use the SimpleTimer to time our run */ - private SimpleTimer timer = null; - - // How long can we go without printing some progress info? - private long lastProgressPrintTime = -1; // When was the last time we printed progress log? 
- - private final static long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds - private final static double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; - private final static double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; - private long progressPrintFrequency = 10 * 1000; // in milliseconds - private boolean progressMeterInitialized = false; - - // for performance log - private static final boolean PERFORMANCE_LOG_ENABLED = true; - private final Object performanceLogLock = new Object(); - private File performanceLogFile; - private PrintStream performanceLog = null; - private long lastPerformanceLogPrintTime = -1; // When was the last time we printed to the performance log? - private final long PERFORMANCE_LOG_PRINT_FREQUENCY = progressPrintFrequency; // in milliseconds - - /** Size, in bp, of the area we are processing. Updated once in the system in initial for performance reasons */ - long targetSize = -1; - GenomeLocSortedSet targetIntervals = null; - protected GenomeAnalysisEngine engine; + private TraversalProgressMeter progressMeter; // ---------------------------------------------------------------------------------------------------- // // ABSTRACT METHODS // // ---------------------------------------------------------------------------------------------------- + /** - * Gets the named traversal type associated with the given traversal. + * Gets the named traversal type associated with the given traversal, such as loci, reads, etc. + * * @return A user-friendly name for the given traversal type. 
*/ - protected abstract String getTraversalType(); + public abstract String getTraversalUnits(); /** * this method must be implemented by all traversal engines @@ -104,70 +66,36 @@ public abstract class TraversalEngine,Provide ProviderType dataProvider, T sum); - // ---------------------------------------------------------------------------------------------------- - // - // Common timing routines - // - // ---------------------------------------------------------------------------------------------------- /** * Initialize the traversal engine. After this point traversals can be run over the data + * * @param engine GenomeAnalysisEngine for this traversal + * @param progressMeter An optional (null == optional) meter to track our progress */ - public void initialize(GenomeAnalysisEngine engine) { + public void initialize(final GenomeAnalysisEngine engine, final TraversalProgressMeter progressMeter) { if ( engine == null ) throw new ReviewedStingException("BUG: GenomeAnalysisEngine cannot be null!"); this.engine = engine; - - if ( PERFORMANCE_LOG_ENABLED && engine.getArguments() != null && engine.getArguments().performanceLog != null ) { - synchronized(this.performanceLogLock) { - performanceLogFile = engine.getArguments().performanceLog; - createNewPerformanceLog(); - } - } - - // if we don't have any intervals defined, create intervals from the reference itself - if ( this.engine.getIntervals() == null ) - targetIntervals = GenomeLocSortedSet.createSetFromSequenceDictionary(engine.getReferenceDataSource().getReference().getSequenceDictionary()); - else - targetIntervals = this.engine.getIntervals(); - targetSize = targetIntervals.coveredSize(); - } - - private void createNewPerformanceLog() { - synchronized(performanceLogLock) { - try { - performanceLog = new PrintStream(new FileOutputStream(engine.getArguments().performanceLog)); - List pLogHeader = Arrays.asList("elapsed.time", "units.processed", "processing.speed", "bp.processed", "bp.speed", 
"genome.fraction.complete", "est.total.runtime", "est.time.remaining"); - performanceLog.println(Utils.join("\t", pLogHeader)); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(engine.getArguments().performanceLog, e); - } - } - } - /** - * Should be called to indicate that we're going to process records and the timer should start ticking. This - * function should be called right before any traversal work is done, to avoid counting setup costs in the - * processing costs and inflating the estimated runtime. - */ - public void startTimersIfNecessary() { - if ( timer == null ) { - timer = new SimpleTimer("Traversal"); - timer.start(); - lastProgressPrintTime = timer.currentTime(); - } + this.progressMeter = progressMeter; } /** - * @param curTime (current runtime, in millisecs) - * @param lastPrintTime the last time we printed, in machine milliseconds - * @param printFreq maximum permitted difference between last print and current times + * For testing only. Does not initialize the progress meter * - * @return true if the maximum interval (in millisecs) has passed since the last printing + * @param engine */ - private boolean maxElapsedIntervalForPrinting(final long curTime, long lastPrintTime, long printFreq) { - long elapsed = curTime - lastPrintTime; - return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS; + protected void initialize(final GenomeAnalysisEngine engine) { + initialize(engine, null); + } + + /** + * Called by the MicroScheduler when all work is done and the GATK is shutting down. + * + * To be used by subclasses that need to free up resources (such as threads) + */ + public void shutdown() { + // by default there's nothing to do } /** @@ -197,194 +125,7 @@ public abstract class TraversalEngine,Provide * @param loc the location */ public void printProgress(final GenomeLoc loc) { - // A bypass is inserted here for unit testing. 
- printProgress(loc, false); - } - - /** - * Utility routine that prints out process information (including timing) every N records or - * every M seconds, for N and M set in global variables. - * - * @param loc Current location, can be null if you are at the end of the traversal - * @param mustPrint If true, will print out info, regardless of nRecords or time interval - */ - private synchronized void printProgress(final GenomeLoc loc, boolean mustPrint) { - if( ! progressMeterInitialized ) { - logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]"); - logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", - "Location", getTraversalType(), getTraversalType())); - progressMeterInitialized = true; - } - - final long curTime = timer.currentTime(); - boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency); - boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); - - if ( printProgress || printLog ) { - final ProcessingHistory last = updateHistory(loc, engine.getCumulativeMetrics()); - - final AutoFormattingTime elapsed = new AutoFormattingTime(last.elapsedSeconds); - final AutoFormattingTime bpRate = new AutoFormattingTime(last.secondsPerMillionBP()); - final AutoFormattingTime unitRate = new AutoFormattingTime(last.secondsPerMillionElements()); - final double fractionGenomeTargetCompleted = last.calculateFractionGenomeTargetCompleted(targetSize); - final AutoFormattingTime estTotalRuntime = new AutoFormattingTime(elapsed.getTimeInSeconds() / fractionGenomeTargetCompleted); - final AutoFormattingTime timeToCompletion = new AutoFormattingTime(estTotalRuntime.getTimeInSeconds() - elapsed.getTimeInSeconds()); - final long nRecords = engine.getCumulativeMetrics().getNumIterations(); - - if ( printProgress ) { - lastProgressPrintTime = curTime; - - // dynamically change 
the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates - if ( estTotalRuntime.getTimeInSeconds() > TWELVE_HOURS_IN_SECONDS ) - progressPrintFrequency = 60 * 1000; // in milliseconds - else if ( estTotalRuntime.getTimeInSeconds() > TWO_HOURS_IN_SECONDS ) - progressPrintFrequency = 30 * 1000; // in milliseconds - else - progressPrintFrequency = 10 * 1000; // in milliseconds - - final String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : String.format("%s:%d", loc.getContig(), loc.getStart()); - logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", - posName, nRecords*1.0, elapsed, unitRate, - 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); - - } - - if ( printLog ) { - lastPerformanceLogPrintTime = curTime; - synchronized(performanceLogLock) { - performanceLog.printf("%.2f\t%d\t%.2e\t%d\t%.2e\t%.2e\t%.2f\t%.2f%n", - elapsed.getTimeInSeconds(), nRecords, unitRate.getTimeInSeconds(), last.bpProcessed, - bpRate.getTimeInSeconds(), fractionGenomeTargetCompleted, estTotalRuntime.getTimeInSeconds(), - timeToCompletion.getTimeInSeconds()); - } - } - } - } - - /** - * Keeps track of the last HISTORY_WINDOW_SIZE data points for the progress meter. Currently the - * history isn't used in any way, but in the future it'll become valuable for more accurate estimates - * for when a process will complete. - * - * @param loc our current position. If null, assumes we are done traversing - * @param metrics information about what's been processed already - * @return - */ - private ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) { - synchronized (lock) { - if ( history.size() > HISTORY_WINDOW_SIZE ) - history.pop(); - - long nRecords = metrics.getNumIterations(); - long bpProcessed = loc == null ? 
targetSize : targetIntervals.sizeBeforeLoc(loc); // null -> end of processing - history.add(new ProcessingHistory(timer.getElapsedTime(), loc, nRecords, bpProcessed)); - - return history.getLast(); - } - } - - /** - * Called after a traversal to print out information about the traversal process - */ - public void printOnTraversalDone() { - printProgress(null, true); - - final double elapsed = timer == null ? 0 : timer.getElapsedTime(); - - ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); - - // count up the number of skipped reads by summing over all filters - long nSkippedReads = 0L; - for ( final long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) - nSkippedReads += countsByFilter; - - logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours", elapsed, elapsed / 60, elapsed / 3600)); - if ( cumulativeMetrics.getNumReadsSeen() > 0 ) - logger.info(String.format("%d reads were filtered out during traversal out of %d total (%.2f%%)", - nSkippedReads, - cumulativeMetrics.getNumReadsSeen(), - 100.0 * MathUtils.ratio(nSkippedReads,cumulativeMetrics.getNumReadsSeen()))); - for ( Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { - long count = filterCounts.getValue(); - logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", - count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); - } - - if ( performanceLog != null ) performanceLog.close(); - } - - /** - * Gets the filename to which performance data is currently being written. - * @return Filename to which performance data is currently being written. - */ - public String getPerformanceLogFileName() { - synchronized(performanceLogLock) { - return performanceLogFile.getAbsolutePath(); - } - } - - /** - * Sets the filename of the log for performance. If set, will write performance data. - * @param fileName filename to use when writing performance data. 
- */ - public void setPerformanceLogFileName(String fileName) { - File file = new File(fileName); - - synchronized(performanceLogLock) { - // Ignore multiple calls to reset the same lock. - if(performanceLogFile != null && performanceLogFile.equals(file)) - return; - - // Close an existing log - if(performanceLog != null) performanceLog.close(); - - performanceLogFile = file; - createNewPerformanceLog(); - } - } - - /** - * Gets the frequency with which performance data is written. - * @return Frequency, in seconds, of performance log writes. - */ - public long getPerformanceProgressPrintFrequencySeconds() { - return progressPrintFrequency; - } - - /** - * How often should the performance log message be written? - * @param seconds number of seconds between messages indicating performance frequency. - */ - public void setPerformanceProgressPrintFrequencySeconds(long seconds) { - progressPrintFrequency = seconds; - } - - private static class ProcessingHistory { - double elapsedSeconds; - long unitsProcessed; - long bpProcessed; - GenomeLoc loc; - - public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) { - this.elapsedSeconds = elapsedSeconds; - this.loc = loc; - this.unitsProcessed = unitsProcessed; - this.bpProcessed = bpProcessed; - } - - /** How long in seconds to process 1M traversal units? */ - private double secondsPerMillionElements() { - return (elapsedSeconds * 1000000.0) / Math.max(unitsProcessed, 1); - } - - /** How long in seconds to process 1M bp on the genome? */ - private double secondsPerMillionBP() { - return (elapsedSeconds * 1000000.0) / Math.max(bpProcessed, 1); - } - - /** What fractoin of the target intervals have we covered? 
*/ - private double calculateFractionGenomeTargetCompleted(final long targetSize) { - return (1.0*bpProcessed) / targetSize; - } + if ( progressMeter != null ) progressMeter.printProgress(loc); } } + diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalProgressMeter.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalProgressMeter.java new file mode 100755 index 000000000..72f20fb0c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalProgressMeter.java @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.traversals; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.ReadMetrics; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +/** + * A progress meter that prints a few key metrics to a logger and optionally to a file + * + * Metrics include: + * -- Number of processed X (X = traversal units) + * -- Runtime per.1M X + * -- Percent of regions to be processed completed + * -- The estimated total runtime based on previous performance + * -- The estimated time remaining for the entire process + * + * The optional file log an expanded set of metrics in tabular format + * suitable for subsequent analysis in R. + * + * This class is -- and MUST BE -- thread-safe for use in the GATK. Multiple independent + * threads executing traversals will be calling printProgress() simultaneously and this + * class does (and MUST) properly sort out the timings of logs without interlacing outputs + * because of these threads. + * + * Consequently, the fundamental model for when to print the logs is time based. We basically + * print a meter message every X seconds, minutes, hours, whatever is appropriate based on the + * estimated remaining runtime. 
+ * + * @author depristo + * @since 2010 maybe, but written in 09/12 for clarity + */ +@Invariant({ + "targetSizeInBP >= 0", + "progressPrintFrequency > 0" +}) +public class TraversalProgressMeter { + protected static final Logger logger = Logger.getLogger(TraversalProgressMeter.class); + + // -------------------------------------------------------------------------------- + // static constants controlling overall system behavior + // -------------------------------------------------------------------------------- + + /** + * Min. milliseconds after we start up the meter before we will print our first meter message + */ + private final static long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; + + /** + * How often should we print performance logging information, when we are sending this + * information to a file? Not dynamically updated as the logger meter is. + */ + private final static long PERFORMANCE_LOG_PRINT_FREQUENCY = 10 * 1000; + + private final static double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; + private final static double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; + + // -------------------------------------------------------------------------------- + // Variables we update during running + // -------------------------------------------------------------------------------- + + /** + * When was the last time we printed progress log? In milliseconds + */ + private long lastProgressPrintTime = -1; + + /** + * How frequently should we be printing our meter messages? Dynamically updated + * depending on how long we think the run has left. + */ + private long progressPrintFrequency = 10 * 1000; // default value + + /** + * When was the last time we printed to the performance log? 
In milliseconds + */ + private long lastPerformanceLogPrintTime = -1; + + // -------------------------------------------------------------------------------- + // final variables fixed at object creation time + // -------------------------------------------------------------------------------- + + /** + * The set of genome locs describing the total region we are processing with + * this GATK run. Used to determine how close we are to completing the run + */ + private final GenomeLocSortedSet regionsBeingProcessed; + + /** + * Size, in bp, of the area we are processing, derived from regionsBeingProcessed. + * Updated once at initialization for performance reasons + */ + private final long targetSizeInBP; + + /** + * Used to get the total number of records we've processed so far. + */ + final ReadMetrics cumulativeMetrics; + + /** + * A string describing the type of this traversal, so we can say things like + * "we are running at X traversalType per second" + */ + private final String traversalType; + + /** + * A potentially null file where we print a supplementary, R readable performance log + * file. 
+ */ + private final PrintStream performanceLog; + + /** We use the SimpleTimer to time our run */ + private final SimpleTimer timer = new SimpleTimer("Traversal"); + + /** + * Create a new TraversalProgressMeter + * + * @param cumulativeMetrics the object where the shared traversal counts are being updated + * @param performanceLogFile an optional performance log file where a table of performance logs will be written + * @param traversalUnits the name of this traversal type, suitable for saying X seconds per traversalUnits + * @param processingIntervals the intervals being processed + */ + public TraversalProgressMeter(final ReadMetrics cumulativeMetrics, + final File performanceLogFile, + final String traversalUnits, + final GenomeLocSortedSet processingIntervals) { + if ( cumulativeMetrics == null ) throw new IllegalArgumentException("cumulativeMetrics cannot be null!"); + if ( traversalUnits == null ) throw new IllegalArgumentException("traversalUnits cannot be null"); + if ( processingIntervals == null ) throw new IllegalArgumentException("Target intervals cannot be null"); + + this.cumulativeMetrics = cumulativeMetrics; + this.traversalType = traversalUnits; + this.regionsBeingProcessed = processingIntervals; + + // setup the performance logger output, if requested by the GATK engine + if ( performanceLogFile != null ) { + try { + this.performanceLog = new PrintStream(new FileOutputStream(performanceLogFile)); + final List pLogHeader = Arrays.asList("elapsed.time", "units.processed", "processing.speed", + "bp.processed", "bp.speed", "genome.fraction.complete", "est.total.runtime", "est.time.remaining"); + performanceLog.println(Utils.join("\t", pLogHeader)); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(performanceLogFile, e); + } + } else { + performanceLog = null; + } + + // cached for performance reasons + targetSizeInBP = processingIntervals.coveredSize(); + + // start up the timer + start(); + } + + /** + * 
Forward request to printProgress + * + * Assumes that one cycle has been completed + * + * @param loc the location + */ + public void printProgress(final GenomeLoc loc) { + // A bypass is inserted here for unit testing. + printProgress(loc, false); + } + + private synchronized void start() { + timer.start(); + lastProgressPrintTime = timer.currentTime(); + + logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]"); + logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", + "Location", traversalType, traversalType)); + } + + /** + * Utility routine that prints out process information (including timing) every N records or + * every M seconds, for N and M set in global variables. + * + * Synchronized to ensure that even with multiple threads calling printProgress we still + * get one clean stream of meter logs. + * + * @param loc Current location, can be null if you are at the end of the traversal + * @param mustPrint If true, will print out info, regardless of time interval + */ + private synchronized void printProgress(final GenomeLoc loc, boolean mustPrint) { + final long curTime = timer.currentTime(); + final boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency); + final boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); + + if ( printProgress || printLog ) { + final ProgressData progressData = takeProgressSnapshot(loc, cumulativeMetrics); + + final AutoFormattingTime elapsed = new AutoFormattingTime(progressData.elapsedSeconds); + final AutoFormattingTime bpRate = new AutoFormattingTime(progressData.secondsPerMillionBP()); + final AutoFormattingTime unitRate = new AutoFormattingTime(progressData.secondsPerMillionElements()); + final double fractionGenomeTargetCompleted = progressData.calculateFractionGenomeTargetCompleted(targetSizeInBP); + final 
AutoFormattingTime estTotalRuntime = new AutoFormattingTime(elapsed.getTimeInSeconds() / fractionGenomeTargetCompleted); + final AutoFormattingTime timeToCompletion = new AutoFormattingTime(estTotalRuntime.getTimeInSeconds() - elapsed.getTimeInSeconds()); + + if ( printProgress ) { + lastProgressPrintTime = curTime; + updateLoggerPrintFrequency(estTotalRuntime.getTimeInSeconds()); + + // a pretty name for our position + final String posName = loc == null + ? (mustPrint ? "done" : "unmapped reads") + : String.format("%s:%d", loc.getContig(), loc.getStart()); + + logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", + posName, progressData.unitsProcessed*1.0, elapsed, unitRate, + 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); + + } + + if ( printLog ) { + lastPerformanceLogPrintTime = curTime; + performanceLog.printf("%.2f\t%d\t%.2e\t%d\t%.2e\t%.2e\t%.2f\t%.2f%n", + elapsed.getTimeInSeconds(), progressData.unitsProcessed, unitRate.getTimeInSeconds(), + progressData.bpProcessed, bpRate.getTimeInSeconds(), + fractionGenomeTargetCompleted, estTotalRuntime.getTimeInSeconds(), + timeToCompletion.getTimeInSeconds()); + } + } + } + + /** + * Determine, based on remaining runtime, how often to print the meter + * + * @param totalRuntimeSeconds kinda obvious, no? + */ + private void updateLoggerPrintFrequency(final double totalRuntimeSeconds) { + // dynamically change the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates + if ( totalRuntimeSeconds > TWELVE_HOURS_IN_SECONDS ) + progressPrintFrequency = 60 * 1000; // in milliseconds + else if ( totalRuntimeSeconds > TWO_HOURS_IN_SECONDS ) + progressPrintFrequency = 30 * 1000; // in milliseconds + else + progressPrintFrequency = 10 * 1000; // in milliseconds + } + + /** + * Creates a new ProgressData object recording a snapshot of our progress at this instant + * + * @param loc our current position. 
If null, assumes we are done traversing + * @param metrics information about what's been processed already + * @return + */ + private ProgressData takeProgressSnapshot(final GenomeLoc loc, final ReadMetrics metrics) { + final long nRecords = metrics.getNumIterations(); + // null -> end of processing + final long bpProcessed = loc == null ? targetSizeInBP : regionsBeingProcessed.sizeBeforeLoc(loc); + return new ProgressData(timer.getElapsedTime(), nRecords, bpProcessed); + } + + /** + * Called after a traversal to print out information about the traversal process + */ + public void printOnDone() { + printProgress(null, true); + + final double elapsed = timer == null ? 0 : timer.getElapsedTime(); + + // count up the number of skipped reads by summing over all filters + long nSkippedReads = 0L; + for ( final long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) + nSkippedReads += countsByFilter; + + logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours", elapsed, elapsed / 60, elapsed / 3600)); + + // TODO -- move into MicroScheduler + if ( cumulativeMetrics.getNumReadsSeen() > 0 ) + logger.info(String.format("%d reads were filtered out during traversal out of %d total (%.2f%%)", + nSkippedReads, + cumulativeMetrics.getNumReadsSeen(), + 100.0 * MathUtils.ratio(nSkippedReads,cumulativeMetrics.getNumReadsSeen()))); + for ( Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { + long count = filterCounts.getValue(); + logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", + count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); + } + + if ( performanceLog != null ) performanceLog.close(); + } + + /** + * @param curTime (current runtime, in millisecs) + * @param lastPrintTime the last time we printed, in machine milliseconds + * @param printFreq maximum permitted difference between last print and current times + * + * @return true if the maximum interval 
(in millisecs) has passed since the last printing + */ + private boolean maxElapsedIntervalForPrinting(final long curTime, long lastPrintTime, long printFreq) { + final long elapsed = curTime - lastPrintTime; + return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS; + } + + /** + * a snapshot of our performance, suitable for storage and later analysis + */ + private static class ProgressData { + final double elapsedSeconds; + final long unitsProcessed; + final long bpProcessed; + + @Requires({"unitsProcessed >= 0", "bpProcessed >= 0", "elapsedSeconds >= 0"}) + public ProgressData(double elapsedSeconds, long unitsProcessed, long bpProcessed) { + this.elapsedSeconds = elapsedSeconds; + this.unitsProcessed = unitsProcessed; + this.bpProcessed = bpProcessed; + } + + /** How long in seconds to process 1M traversal units? */ + @Ensures("result >= 0.0") + private double secondsPerMillionElements() { + return (elapsedSeconds * 1000000.0) / Math.max(unitsProcessed, 1); + } + + /** How long in seconds to process 1M bp on the genome? */ + @Ensures("result >= 0.0") + private double secondsPerMillionBP() { + return (elapsedSeconds * 1000000.0) / Math.max(bpProcessed, 1); + } + + /** What fraction of the target intervals have we covered? 
*/ + @Requires("targetSize >= 0") + @Ensures({"result >= 0.0", "result <= 1.0"}) + private double calculateFractionGenomeTargetCompleted(final long targetSize) { + return (1.0*bpProcessed) / Math.max(targetSize, 1); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index bbd9346b3..2b7b2f9f5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -36,7 +36,7 @@ public class TraverseActiveRegions extends TraversalEngine myReads = new LinkedHashSet(); @Override - protected String getTraversalType() { + public String getTraversalUnits() { return "active regions"; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 2b45d894c..2e43ef8f8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -54,7 +54,7 @@ public class TraverseDuplicates extends TraversalEngine extends TraversalEngine extends TraverseLociBase { } @Override - public void printOnTraversalDone() { + public void shutdown() { nanoScheduler.shutdown(); - super.printOnTraversalDone(); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java index 9b076fce4..aef3cf7d0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java @@ -27,7 +27,7 @@ public class TraverseReadPairs extends TraversalEngine extends TraversalEngine,Read protected static 
final Logger logger = Logger.getLogger(TraverseReads.class); @Override - protected String getTraversalType() { + public String getTraversalUnits() { return "reads"; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index b3a0a1390..77ab0c891 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -65,7 +65,7 @@ public class TraverseReadsNano extends TraversalEngine, } @Override - protected String getTraversalType() { + public String getTraversalUnits() { return "reads"; } @@ -135,9 +135,8 @@ public class TraverseReadsNano extends TraversalEngine, } @Override - public void printOnTraversalDone() { + public void shutdown() { nanoScheduler.shutdown(); - super.printOnTraversalDone(); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java index 475f7de21..9632a687b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java @@ -69,7 +69,7 @@ public class ArtificialReadsTraversal extends TraversalEngine Date: Sun, 9 Sep 2012 11:00:36 -0400 Subject: [PATCH 10/19] Final cleanup of TraversalProgressMeters, moved to utils.progressmeter -- TraversalProgressMeter now completely generalized, named ProgressMeter in utils.progressmeter. Now just takes "nRecordsProcessed" as an argument to print reads. Completely removes dependence on complex data structures from TraversalProgressMeter. Can be used to measure progress on any task with processing units in genomic locations. -- a fairly simple, class with no dependency on GATK engine or other features. 
-- Currently only used by the TraversalEngine / MicroScheduler but could be used for any purpose now, really. --- .../sting/gatk/executive/MicroScheduler.java | 45 ++++- .../gatk/traversals/TraversalEngine.java | 10 +- .../progressmeter/ProgressMeter.java} | 163 ++++++------------ .../progressmeter/ProgressMeterData.java | 54 ++++++ 4 files changed, 154 insertions(+), 118 deletions(-) rename public/java/src/org/broadinstitute/sting/{gatk/traversals/TraversalProgressMeter.java => utils/progressmeter/ProgressMeter.java} (65%) create mode 100644 public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 0e8208594..3e843de3e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.executive; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; @@ -37,8 +38,10 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import 
org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import javax.management.JMException; @@ -47,6 +50,7 @@ import javax.management.ObjectName; import java.io.File; import java.lang.management.ManagementFactory; import java.util.Collection; +import java.util.Map; /** @@ -90,7 +94,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ ThreadEfficiencyMonitor threadEfficiencyMonitor = null; - final TraversalProgressMeter progressMeter; + final ProgressMeter progressMeter; /** * MicroScheduler factory function. Create a microscheduler appropriate for reducing the @@ -176,8 +180,9 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } final File progressLogFile = engine.getArguments() == null ? null : engine.getArguments().performanceLog; - this.progressMeter = new TraversalProgressMeter(engine.getCumulativeMetrics(), progressLogFile, - traversalEngine.getTraversalUnits(), engine.getRegionsOfGenomeBeingProcessed()); + this.progressMeter = new ProgressMeter(progressLogFile, + traversalEngine.getTraversalUnits(), + engine.getRegionsOfGenomeBeingProcessed()); traversalEngine.initialize(engine, progressMeter); // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. 
@@ -241,7 +246,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * Must be called by subclasses when execute is done */ protected void executionIsDone() { - progressMeter.printOnDone(); + progressMeter.notifyDone(engine.getCumulativeMetrics().getNumIterations()); + printReadFilteringStats(); // TODO -- generalize to all local thread copies traversalEngine.shutdown(); @@ -254,6 +260,37 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } } + /** + * Prints out information about number of reads observed and filtering, if any reads were used in the traversal + * + * Looks like: + * + * INFO 10:40:47,370 MicroScheduler - 22 reads were filtered out during traversal out of 101 total (21.78%) + * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing BadMateFilter + * INFO 10:40:47,370 MicroScheduler - -> 20 reads (19.80% of total) failing DuplicateReadFilter + * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing FailsVendorQualityCheckFilter + */ + private void printReadFilteringStats() { + final ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); + if ( cumulativeMetrics.getNumReadsSeen() > 0 ) { + // count up the number of skipped reads by summing over all filters + long nSkippedReads = 0L; + for ( final long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) + nSkippedReads += countsByFilter; + + logger.info(String.format("%d reads were filtered out during traversal out of %d total (%.2f%%)", + nSkippedReads, + cumulativeMetrics.getNumReadsSeen(), + 100.0 * MathUtils.ratio(nSkippedReads, cumulativeMetrics.getNumReadsSeen()))); + + for ( final Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { + long count = filterCounts.getValue(); + logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", + count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); + } + } + } + /** * Gets the 
engine that created this microscheduler. * @return The engine owning this microscheduler. diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 159343bf8..668bddcca 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -32,13 +32,14 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; public abstract class TraversalEngine,ProviderType extends ShardDataProvider> { /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraversalEngine.class); protected GenomeAnalysisEngine engine; - private TraversalProgressMeter progressMeter; + private ProgressMeter progressMeter; // ---------------------------------------------------------------------------------------------------- // @@ -72,7 +73,7 @@ public abstract class TraversalEngine,Provide * @param engine GenomeAnalysisEngine for this traversal * @param progressMeter An optional (null == optional) meter to track our progress */ - public void initialize(final GenomeAnalysisEngine engine, final TraversalProgressMeter progressMeter) { + public void initialize(final GenomeAnalysisEngine engine, final ProgressMeter progressMeter) { if ( engine == null ) throw new ReviewedStingException("BUG: GenomeAnalysisEngine cannot be null!"); @@ -118,14 +119,15 @@ public abstract class TraversalEngine,Provide } /** - * Forward request to printProgress + * Forward request to notifyOfProgress * * Assumes that one cycle has been completed * * @param loc the location */ public void 
printProgress(final GenomeLoc loc) { - if ( progressMeter != null ) progressMeter.printProgress(loc); + if ( progressMeter != null ) + progressMeter.notifyOfProgress(loc, engine.getCumulativeMetrics().getNumIterations()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalProgressMeter.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java similarity index 65% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalProgressMeter.java rename to public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java index 72f20fb0c..69cf52fc2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalProgressMeter.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java @@ -22,13 +22,10 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.traversals; +package org.broadinstitute.sting.utils.progressmeter; -import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -38,13 +35,17 @@ import java.io.FileOutputStream; import java.io.PrintStream; import java.util.Arrays; import java.util.List; -import java.util.Map; /** - * A progress meter that prints a few key metrics to a logger and optionally to a file + * A meter measuring progress on a calculation through a set of genomic regions that can + * print a few key metrics to a logger and optionally to a file * - * Metrics include: - * -- Number of processed X (X = traversal units) + * The key information for assessing progress is a set of genome locs describing the total + * set of regions we will process. 
Whenever (at reasonable intervals) the processing unit + * can call notifyOfProgress and this logger may, depending on the metering delay, print + * a log message with the following metrics: + * + * -- Number of processed X (X = processing units) * -- Runtime per.1M X * -- Percent of regions to be processed completed * -- The estimated total runtime based on previous performance @@ -54,7 +55,7 @@ import java.util.Map; * suitable for subsequent analysis in R. * * This class is -- and MUST BE -- thread-safe for use in the GATK. Multiple independent - * threads executing traversals will be calling printProgress() simultaneously and this + * threads executing processors will be calling notifyOfProgress() simultaneously and this * class does (and MUST) properly sort out the timings of logs without interlacing outputs * because of these threads. * @@ -69,8 +70,8 @@ import java.util.Map; "targetSizeInBP >= 0", "progressPrintFrequency > 0" }) -public class TraversalProgressMeter { - protected static final Logger logger = Logger.getLogger(TraversalProgressMeter.class); +public class ProgressMeter { + protected static final Logger logger = Logger.getLogger(ProgressMeter.class); // -------------------------------------------------------------------------------- // static constants controlling overall system behavior @@ -127,15 +128,10 @@ public class TraversalProgressMeter { private final long targetSizeInBP; /** - * Used to get the total number of records we've processed so far. 
+ * A string describing the type of units being processed, so we can say things like + * "we are running at X processingUnitName per second" */ - final ReadMetrics cumulativeMetrics; - - /** - * A string describing the type of this traversal, so we can say things like - * "we are running at X traversalType per second" - */ - private final String traversalType; + private final String processingUnitName; /** * A potentially null file where we print a supplementary, R readable performance log @@ -144,29 +140,25 @@ public class TraversalProgressMeter { private final PrintStream performanceLog; /** We use the SimpleTimer to time our run */ - private final SimpleTimer timer = new SimpleTimer("Traversal"); + private final SimpleTimer timer = new SimpleTimer(); /** - * Create a new TraversalProgressMeter + * Create a new ProgressMeter * - * @param cumulativeMetrics the object where the shared traversal counts are being updated * @param performanceLogFile an optional performance log file where a table of performance logs will be written - * @param traversalUnits the name of this traversal type, suitable for saying X seconds per traversalUnits + * @param processingUnitName the name of the unit type being processed, suitable for saying X seconds per processingUnitName * @param processingIntervals the intervals being processed */ - public TraversalProgressMeter(final ReadMetrics cumulativeMetrics, - final File performanceLogFile, - final String traversalUnits, - final GenomeLocSortedSet processingIntervals) { - if ( cumulativeMetrics == null ) throw new IllegalArgumentException("cumulativeMetrics cannot be null!"); - if ( traversalUnits == null ) throw new IllegalArgumentException("traversalUnits cannot be null"); + public ProgressMeter(final File performanceLogFile, + final String processingUnitName, + final GenomeLocSortedSet processingIntervals) { + if ( processingUnitName == null ) throw new IllegalArgumentException("processingUnitName cannot be null"); if ( 
processingIntervals == null ) throw new IllegalArgumentException("Target intervals cannot be null"); - this.cumulativeMetrics = cumulativeMetrics; - this.traversalType = traversalUnits; + this.processingUnitName = processingUnitName; this.regionsBeingProcessed = processingIntervals; - // setup the performance logger output, if requested by the GATK engine + // setup the performance logger output, if requested if ( performanceLogFile != null ) { try { this.performanceLog = new PrintStream(new FileOutputStream(performanceLogFile)); @@ -188,45 +180,48 @@ public class TraversalProgressMeter { } /** - * Forward request to printProgress + * Forward request to notifyOfProgress * * Assumes that one cycle has been completed * - * @param loc the location + * @param loc our current location. Null means "in unmapped reads" + * @param nTotalRecordsProcessed the total number of records we've processed */ - public void printProgress(final GenomeLoc loc) { - // A bypass is inserted here for unit testing. - printProgress(loc, false); + public void notifyOfProgress(final GenomeLoc loc, final long nTotalRecordsProcessed) { + notifyOfProgress(loc, false, nTotalRecordsProcessed); } private synchronized void start() { timer.start(); lastProgressPrintTime = timer.currentTime(); - logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]"); + logger.info("[INITIALIZATION COMPLETE; STARTING PROCESSING]"); logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", - "Location", traversalType, traversalType)); + "Location", processingUnitName, processingUnitName)); } /** * Utility routine that prints out process information (including timing) every N records or * every M seconds, for N and M set in global variables. * - * Synchronized to ensure that even with multiple threads calling printProgress we still + * Synchronized to ensure that even with multiple threads calling notifyOfProgress we still * get one clean stream of meter logs. 
* - * @param loc Current location, can be null if you are at the end of the traversal + * @param loc Current location, can be null if you are at the end of the processing unit * @param mustPrint If true, will print out info, regardless of time interval + * @param nTotalRecordsProcessed the total number of records we've processed */ - private synchronized void printProgress(final GenomeLoc loc, boolean mustPrint) { + private synchronized void notifyOfProgress(final GenomeLoc loc, boolean mustPrint, final long nTotalRecordsProcessed) { + if ( nTotalRecordsProcessed < 0 ) throw new IllegalArgumentException("nTotalRecordsProcessed must be >= 0"); + final long curTime = timer.currentTime(); final boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency); final boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); if ( printProgress || printLog ) { - final ProgressData progressData = takeProgressSnapshot(loc, cumulativeMetrics); + final ProgressMeterData progressData = takeProgressSnapshot(loc, nTotalRecordsProcessed); - final AutoFormattingTime elapsed = new AutoFormattingTime(progressData.elapsedSeconds); + final AutoFormattingTime elapsed = new AutoFormattingTime(progressData.getElapsedSeconds()); final AutoFormattingTime bpRate = new AutoFormattingTime(progressData.secondsPerMillionBP()); final AutoFormattingTime unitRate = new AutoFormattingTime(progressData.secondsPerMillionElements()); final double fractionGenomeTargetCompleted = progressData.calculateFractionGenomeTargetCompleted(targetSizeInBP); @@ -243,7 +238,7 @@ public class TraversalProgressMeter { : String.format("%s:%d", loc.getContig(), loc.getStart()); logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", - posName, progressData.unitsProcessed*1.0, elapsed, unitRate, + posName, progressData.getUnitsProcessed()*1.0, elapsed, unitRate, 
100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); } @@ -251,8 +246,8 @@ public class TraversalProgressMeter { if ( printLog ) { lastPerformanceLogPrintTime = curTime; performanceLog.printf("%.2f\t%d\t%.2e\t%d\t%.2e\t%.2e\t%.2f\t%.2f%n", - elapsed.getTimeInSeconds(), progressData.unitsProcessed, unitRate.getTimeInSeconds(), - progressData.bpProcessed, bpRate.getTimeInSeconds(), + elapsed.getTimeInSeconds(), progressData.getUnitsProcessed(), unitRate.getTimeInSeconds(), + progressData.getBpProcessed(), bpRate.getTimeInSeconds(), fractionGenomeTargetCompleted, estTotalRuntime.getTimeInSeconds(), timeToCompletion.getTimeInSeconds()); } @@ -278,44 +273,27 @@ public class TraversalProgressMeter { * Creates a new ProgressData object recording a snapshot of our progress at this instant * * @param loc our current position. If null, assumes we are done traversing - * @param metrics information about what's been processed already + * @param nTotalRecordsProcessed the total number of records we've processed * @return */ - private ProgressData takeProgressSnapshot(final GenomeLoc loc, final ReadMetrics metrics) { - final long nRecords = metrics.getNumIterations(); + private ProgressMeterData takeProgressSnapshot(final GenomeLoc loc, final long nTotalRecordsProcessed) { // null -> end of processing final long bpProcessed = loc == null ? targetSizeInBP : regionsBeingProcessed.sizeBeforeLoc(loc); - return new ProgressData(timer.getElapsedTime(), nRecords, bpProcessed); + return new ProgressMeterData(timer.getElapsedTime(), nTotalRecordsProcessed, bpProcessed); } /** - * Called after a traversal to print out information about the traversal process + * Should be called when processing is done */ - public void printOnDone() { - printProgress(null, true); + public void notifyDone(final long nTotalRecordsProcessed) { + // print out the progress meter + notifyOfProgress(null, true, nTotalRecordsProcessed); - final double elapsed = timer == null ? 
0 : timer.getElapsedTime(); + logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours", + timer.getElapsedTime(), timer.getElapsedTime() / 60, timer.getElapsedTime() / 3600)); - // count up the number of skipped reads by summing over all filters - long nSkippedReads = 0L; - for ( final long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) - nSkippedReads += countsByFilter; - - logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours", elapsed, elapsed / 60, elapsed / 3600)); - - // TODO -- move into MicroScheduler - if ( cumulativeMetrics.getNumReadsSeen() > 0 ) - logger.info(String.format("%d reads were filtered out during traversal out of %d total (%.2f%%)", - nSkippedReads, - cumulativeMetrics.getNumReadsSeen(), - 100.0 * MathUtils.ratio(nSkippedReads,cumulativeMetrics.getNumReadsSeen()))); - for ( Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { - long count = filterCounts.getValue(); - logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", - count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); - } - - if ( performanceLog != null ) performanceLog.close(); + if ( performanceLog != null ) + performanceLog.close(); } /** @@ -329,39 +307,4 @@ public class TraversalProgressMeter { final long elapsed = curTime - lastPrintTime; return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS; } - - /** - * a snapshot of our performance, suitable for storage and later analysis - */ - private static class ProgressData { - final double elapsedSeconds; - final long unitsProcessed; - final long bpProcessed; - - @Requires({"unitsProcessed >= 0", "bpProcessed >= 0", "elapsedSeconds >= 0"}) - public ProgressData(double elapsedSeconds, long unitsProcessed, long bpProcessed) { - this.elapsedSeconds = elapsedSeconds; - this.unitsProcessed = unitsProcessed; - this.bpProcessed = bpProcessed; - } - - /** How long in seconds to 
process 1M traversal units? */ - @Ensures("result >= 0.0") - private double secondsPerMillionElements() { - return (elapsedSeconds * 1000000.0) / Math.max(unitsProcessed, 1); - } - - /** How long in seconds to process 1M bp on the genome? */ - @Ensures("result >= 0.0") - private double secondsPerMillionBP() { - return (elapsedSeconds * 1000000.0) / Math.max(bpProcessed, 1); - } - - /** What fraction of the target intervals have we covered? */ - @Requires("targetSize >= 0") - @Ensures({"result >= 0.0", "result <= 1.0"}) - private double calculateFractionGenomeTargetCompleted(final long targetSize) { - return (1.0*bpProcessed) / Math.max(targetSize, 1); - } - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java new file mode 100644 index 000000000..096b55be2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java @@ -0,0 +1,54 @@ +package org.broadinstitute.sting.utils.progressmeter; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +/** + * a snapshot of our performance, suitable for storage and later analysis + */ +class ProgressMeterData { + private final double elapsedSeconds; + private final long unitsProcessed; + private final long bpProcessed; + + @Requires({"unitsProcessed >= 0", "bpProcessed >= 0", "elapsedSeconds >= 0"}) + public ProgressMeterData(double elapsedSeconds, long unitsProcessed, long bpProcessed) { + this.elapsedSeconds = elapsedSeconds; + this.unitsProcessed = unitsProcessed; + this.bpProcessed = bpProcessed; + } + + @Ensures("result >= 0.0") + public double getElapsedSeconds() { + return elapsedSeconds; + } + + @Ensures("result >= 0") + public long getUnitsProcessed() { + return unitsProcessed; + } + + @Ensures("result >= 0") + public long getBpProcessed() { + return bpProcessed; + } + + /** How long in seconds to process 1M 
traversal units? */ + @Ensures("result >= 0.0") + public double secondsPerMillionElements() { + return (elapsedSeconds * 1000000.0) / Math.max(unitsProcessed, 1); + } + + /** How long in seconds to process 1M bp on the genome? */ + @Ensures("result >= 0.0") + public double secondsPerMillionBP() { + return (elapsedSeconds * 1000000.0) / Math.max(bpProcessed, 1); + } + + /** What fraction of the target intervals have we covered? */ + @Requires("targetSize >= 0") + @Ensures({"result >= 0.0", "result <= 1.0"}) + public double calculateFractionGenomeTargetCompleted(final long targetSize) { + return (1.0*bpProcessed) / Math.max(targetSize, 1); + } +} From f713d400e2865834980aeb36553f5c89d31aaab2 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 9 Sep 2012 16:52:52 -0400 Subject: [PATCH 11/19] Fixed GSA-515 Nanoscheduler GSA-555 / Make NT and NCT work together -- Can now say -nt 4 and -nct 4 to get 16 threads running for you! -- TraversalEngines are now ThreadLocal variables in the MicroScheduler. -- Misc. code cleanup, final variables, some contracts. 
--- .../executive/HierarchicalMicroScheduler.java | 24 ++- .../gatk/executive/LinearMicroScheduler.java | 8 +- .../sting/gatk/executive/MicroScheduler.java | 147 ++++++++++++++---- .../sting/gatk/executive/ShardTraverser.java | 6 +- 4 files changed, 134 insertions(+), 51 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 486e83e60..1bac72f3e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -107,7 +107,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar this.traversalTasks = shardStrategy.iterator(); - ReduceTree reduceTree = new ReduceTree(this); + final ReduceTree reduceTree = new ReduceTree(this); initializeWalker(walker); while (isShardTraversePending() || isTreeReducePending()) { @@ -301,17 +301,13 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar if (!traversalTasks.hasNext()) throw new IllegalStateException("Cannot traverse; no pending traversals exist."); - Shard shard = traversalTasks.next(); + final Shard shard = traversalTasks.next(); // todo -- add ownership claim here - ShardTraverser traverser = new ShardTraverser(this, - traversalEngine, - walker, - shard, - outputTracker); + final ShardTraverser traverser = new ShardTraverser(this, walker, shard, outputTracker); - Future traverseResult = threadPool.submit(traverser); + final Future traverseResult = threadPool.submit(traverser); // Add this traverse result to the reduce tree. The reduce tree will call a callback to throw its entries on the queue. 
reduceTree.addEntry(traverseResult); @@ -326,7 +322,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar protected void queueNextTreeReduce( Walker walker ) { if (reduceTasks.size() == 0) throw new IllegalStateException("Cannot reduce; no pending reduces exist."); - TreeReduceTask reducer = reduceTasks.remove(); + final TreeReduceTask reducer = reduceTasks.remove(); reducer.setWalker((TreeReducible) walker); threadPool.submit(reducer); @@ -334,7 +330,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar /** Blocks until a free slot appears in the thread queue. */ protected void waitForFreeQueueSlot() { - ThreadPoolMonitor monitor = new ThreadPoolMonitor(); + final ThreadPoolMonitor monitor = new ThreadPoolMonitor(); synchronized (monitor) { threadPool.submit(monitor); monitor.watch(); @@ -346,8 +342,8 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar * * @return A new, composite future of the result of this reduce. */ - public Future notifyReduce( Future lhs, Future rhs ) { - TreeReduceTask reducer = new TreeReduceTask(new TreeReducer(this, lhs, rhs)); + public Future notifyReduce( final Future lhs, final Future rhs ) { + final TreeReduceTask reducer = new TreeReduceTask(new TreeReducer(this, lhs, rhs)); reduceTasks.add(reducer); return reducer; } @@ -375,7 +371,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar return this.error; } - private final RuntimeException toRuntimeException(final Throwable error) { + private RuntimeException toRuntimeException(final Throwable error) { // If the error is already a Runtime, pass it along as is. Otherwise, wrap it. if (error instanceof RuntimeException) return (RuntimeException)error; @@ -386,7 +382,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar /** A small wrapper class that provides the TreeReducer interface along with the FutureTask semantics. 
*/ private class TreeReduceTask extends FutureTask { - private TreeReducer treeReducer = null; + final private TreeReducer treeReducer; public TreeReduceTask( TreeReducer treeReducer ) { super(treeReducer); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 697e908fd..60f7317ba 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -69,7 +69,7 @@ public class LinearMicroScheduler extends MicroScheduler { getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); for(WindowMaker.WindowMakerIterator iterator: windowMaker) { ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods); - Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); + Object result = getTraversalEngine().traverse(walker, dataProvider, accumulator.getReduceInit()); accumulator.accumulate(dataProvider,result); dataProvider.close(); if ( walker.isDone() ) break; @@ -78,7 +78,7 @@ public class LinearMicroScheduler extends MicroScheduler { } else { ShardDataProvider dataProvider = new ReadShardDataProvider(shard,engine.getGenomeLocParser(),getReadIterator(shard),reference,rods); - Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); + Object result = getTraversalEngine().traverse(walker, dataProvider, accumulator.getReduceInit()); accumulator.accumulate(dataProvider,result); dataProvider.close(); } @@ -87,8 +87,8 @@ public class LinearMicroScheduler extends MicroScheduler { } // Special function call to empty out the work queue. 
Ugly for now but will be cleaned up when we eventually push this functionality more into the engine - if( traversalEngine instanceof TraverseActiveRegions ) { - final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); + if( getTraversalEngine() instanceof TraverseActiveRegions ) { + final Object result = ((TraverseActiveRegions) getTraversalEngine()).endTraversal(walker, accumulator.getReduceInit()); accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 3e843de3e..4024b247d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.executive; +import com.google.java.contract.Ensures; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -50,6 +51,8 @@ import javax.management.ObjectName; import java.io.File; import java.lang.management.ManagementFactory; import java.util.Collection; +import java.util.LinkedList; +import java.util.List; import java.util.Map; @@ -78,7 +81,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ protected final GenomeAnalysisEngine engine; - protected final TraversalEngine traversalEngine; + private final TraversalEngineCreator traversalEngineCreator; protected final IndexedFastaSequenceFile reference; private final SAMDataSource reads; @@ -110,11 +113,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation 
threadAllocation) { if ( threadAllocation.isRunningInParallelMode() ) { - // TODO -- remove me when we fix running NCT within HMS - if ( threadAllocation.getNumDataThreads() > 1 && threadAllocation.getNumCPUThreadsPerDataThread() > 1) - throw new UserException("Currently the GATK does not support running CPU threads within data threads, " + - "please specify only one of NT and NCT"); - logger.info(String.format("Running the GATK in parallel mode with %d CPU thread(s) for each of %d data thread(s)", threadAllocation.getNumCPUThreadsPerDataThread(), threadAllocation.getNumDataThreads())); } @@ -160,30 +158,12 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { this.reads = reads; this.reference = reference; this.rods = rods; - - if (walker instanceof ReadWalker) { - traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 - ? new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()) - : new TraverseReads(); - } else if (walker instanceof LocusWalker) { - traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 - ? new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()) - : new TraverseLociLinear(); - } else if (walker instanceof DuplicateWalker) { - traversalEngine = new TraverseDuplicates(); - } else if (walker instanceof ReadPairWalker) { - traversalEngine = new TraverseReadPairs(); - } else if (walker instanceof ActiveRegionWalker) { - traversalEngine = new TraverseActiveRegions(); - } else { - throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); - } + this.traversalEngineCreator = new TraversalEngineCreator(walker, threadAllocation); final File progressLogFile = engine.getArguments() == null ? 
null : engine.getArguments().performanceLog; this.progressMeter = new ProgressMeter(progressLogFile, - traversalEngine.getTraversalUnits(), + traversalEngineCreator.getTraversalUnits(), engine.getRegionsOfGenomeBeingProcessed()); - traversalEngine.initialize(engine, progressMeter); // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. // To get around this limitation and since we have no job identifier at this point, register a simple counter that @@ -249,8 +229,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { progressMeter.notifyDone(engine.getCumulativeMetrics().getNumIterations()); printReadFilteringStats(); - // TODO -- generalize to all local thread copies - traversalEngine.shutdown(); + for ( final TraversalEngine te : traversalEngineCreator.getCreatedEngines() ) + te.shutdown(); // Print out the threading efficiency of this HMS, if state monitoring is enabled if ( threadEfficiencyMonitor != null ) { @@ -317,4 +297,115 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { throw new ReviewedStingException("Unable to unregister microscheduler with JMX", ex); } } + + /** + * Returns a traversal engine suitable for use in this thread. + * + * May create a new traversal engine for this thread, if this is the first + * time this thread ever asked for a TraversalEngine. + * + * @return a non-null TraversalEngine suitable for execution in this scheduler + */ + public TraversalEngine getTraversalEngine() { + return traversalEngineCreator.get(); + } + + /** + * ThreadLocal TraversalEngine creator + * + * TraversalEngines are thread local variables to the MicroScheduler. This is necessary + * because in the HMS case you have multiple threads executing a traversal engine independently, and + * these engines may need to create separate resources for efficiency or implementation reasons. 
For example, + * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific. + * So each HMS thread needs to have it's own distinct copy of the traversal engine if it wants to have + * N data threads x M nano threads => N * M threads total. + * + * This class also tracks all created traversal engines so this microscheduler can properly + * shut them all down when the scheduling is done. + */ + private class TraversalEngineCreator extends ThreadLocal { + final List createdEngines = new LinkedList(); + final Walker walker; + final ThreadAllocation threadAllocation; + + /** + * Creates an initialized TraversalEngine appropriate for walker and threadAllocation, + * and adds it to the list of created engines for later shutdown. + * + * @return a non-null traversal engine + */ + @Override + protected synchronized TraversalEngine initialValue() { + final TraversalEngine traversalEngine = createEngine(); + traversalEngine.initialize(engine, progressMeter); + createdEngines.add(traversalEngine); + return traversalEngine; + } + + /** + * Returns the traversal units for traversal engines created here. + * + * This (unfortunately) creates an uninitialized tmp. TraversalEngine so we can get + * it's traversal units, and then immediately shuts it down... 
+ * + * @return the traversal unit as returned by getTraversalUnits of TraversalEngines created here + */ + protected String getTraversalUnits() { + final TraversalEngine tmp = createEngine(); + final String units = tmp.getTraversalUnits(); + tmp.shutdown(); + return units; + } + + /** + * Really make us a traversal engine of the appropriate type for walker and thread allocation + * + * @return a non-null uninitialized traversal engine + */ + @Ensures("result != null") + protected TraversalEngine createEngine() { + if (walker instanceof ReadWalker) { + if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) + return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); + else + return new TraverseReads(); + } else if (walker instanceof LocusWalker) { + if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) + return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); + else + return new TraverseLociLinear(); + } else if (walker instanceof DuplicateWalker) { + return new TraverseDuplicates(); + } else if (walker instanceof ReadPairWalker) { + return new TraverseReadPairs(); + } else if (walker instanceof ActiveRegionWalker) { + return new TraverseActiveRegions(); + } else { + throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); + } + } + + /** + * Create a TraversalEngineCreator that makes TraversalEngines appropriate for walker and threadAllocation + * + * @param walker the walker we need traversal engines for + * @param threadAllocation what kind of threading will we use in the traversal? 
+ */ + @com.google.java.contract.Requires({"walker != null", "threadAllocation != null"}) + public TraversalEngineCreator(final Walker walker, final ThreadAllocation threadAllocation) { + super(); + this.walker = walker; + this.threadAllocation = threadAllocation; + } + + /** + * Get the list of all traversal engines we've created + * + * @return a non-null list of engines created so far + */ + @Ensures("result != null") + public List getCreatedEngines() { + return createdEngines; + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index 790c6b3ed..e8f15ebef 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -5,7 +5,6 @@ import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvide import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; -import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -29,7 +28,6 @@ public class ShardTraverser implements Callable { final private HierarchicalMicroScheduler microScheduler; final private Walker walker; final private Shard shard; - final private TraversalEngine traversalEngine; final private ThreadLocalOutputTracker outputTracker; private OutputMergeTask outputMergeTask; @@ -42,13 +40,11 @@ public class ShardTraverser implements Callable { private boolean complete = false; public ShardTraverser( HierarchicalMicroScheduler microScheduler, - TraversalEngine traversalEngine, Walker walker, Shard shard, ThreadLocalOutputTracker outputTracker) { this.microScheduler = microScheduler; 
this.walker = walker; - this.traversalEngine = traversalEngine; this.shard = shard; this.outputTracker = outputTracker; } @@ -65,7 +61,7 @@ public class ShardTraverser implements Callable { for(WindowMaker.WindowMakerIterator iterator: windowMaker) { final ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),microScheduler.getEngine().getGenomeLocParser(),iterator.getLocus(),iterator,microScheduler.reference,microScheduler.rods); - accumulator = traversalEngine.traverse( walker, dataProvider, accumulator ); + accumulator = microScheduler.getTraversalEngine().traverse(walker, dataProvider, accumulator); dataProvider.close(); } From 195cf6df7e8b687b097c444acd64a53ad48656e7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 10 Sep 2012 14:17:41 -0400 Subject: [PATCH 12/19] Attempting to fix out of memory errors with new traversal engine creator --- .../sting/gatk/executive/MicroScheduler.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 4024b247d..893548a9b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -81,7 +81,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ protected final GenomeAnalysisEngine engine; - private final TraversalEngineCreator traversalEngineCreator; + private TraversalEngineCreator traversalEngineCreator; protected final IndexedFastaSequenceFile reference; private final SAMDataSource reads; @@ -229,8 +229,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { progressMeter.notifyDone(engine.getCumulativeMetrics().getNumIterations()); printReadFilteringStats(); - for ( final TraversalEngine te : traversalEngineCreator.getCreatedEngines() ) 
- te.shutdown(); + traversalEngineCreator.shutdown(); + traversalEngineCreator = null; // Print out the threading efficiency of this HMS, if state monitoring is enabled if ( threadEfficiencyMonitor != null ) { @@ -399,13 +399,13 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } /** - * Get the list of all traversal engines we've created - * - * @return a non-null list of engines created so far + * Shutdown all of the created engines, and clear the list of created engines, dropping + * pointers to the traversal engines */ - @Ensures("result != null") - public List getCreatedEngines() { - return createdEngines; + public synchronized void shutdown() { + for ( final TraversalEngine te : traversalEngineCreator.createdEngines ) + te.shutdown(); + createdEngines.clear(); } } } From 641c6a361e944b28206f0466a3e67f3137d61ed6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 10 Sep 2012 16:16:19 -0400 Subject: [PATCH 13/19] Fix nasty memory leak in new data thread x cpu thread parallelism -- Basically you cannot safely use instance specific ThreadLocal variables, as these cannot be safely cleaned up. The old implementation kept pointers to old writers, with huge tribble block indexes, and eventually we crashed out of integration tests -- See http://weblogs.java.net/blog/jjviana/archive/2010/06/10/threadlocal-thread-pool-bad-idea-or-dealing-apparent-glassfish-memor for more information -- New implementation uses a borrow/return schedule with a list of N TraversalEngines managed by the MicroScheduler directly. 
--- .../gatk/executive/LinearMicroScheduler.java | 11 +- .../sting/gatk/executive/MicroScheduler.java | 212 +++++++++--------- .../sting/gatk/executive/ShardTraverser.java | 5 +- 3 files changed, 118 insertions(+), 110 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 60f7317ba..09b18bfe1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.SampleUtils; @@ -60,6 +61,7 @@ public class LinearMicroScheduler extends MicroScheduler { boolean done = walker.isDone(); int counter = 0; + final TraversalEngine traversalEngine = borrowTraversalEngine(); for (Shard shard : shardStrategy ) { if ( done || shard == null ) // we ran out of shards that aren't owned break; @@ -69,7 +71,7 @@ public class LinearMicroScheduler extends MicroScheduler { getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); for(WindowMaker.WindowMakerIterator iterator: windowMaker) { ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods); - Object result = getTraversalEngine().traverse(walker, dataProvider, accumulator.getReduceInit()); + Object result = traversalEngine.traverse(walker, 
dataProvider, accumulator.getReduceInit()); accumulator.accumulate(dataProvider,result); dataProvider.close(); if ( walker.isDone() ) break; @@ -78,7 +80,7 @@ public class LinearMicroScheduler extends MicroScheduler { } else { ShardDataProvider dataProvider = new ReadShardDataProvider(shard,engine.getGenomeLocParser(),getReadIterator(shard),reference,rods); - Object result = getTraversalEngine().traverse(walker, dataProvider, accumulator.getReduceInit()); + Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); accumulator.accumulate(dataProvider,result); dataProvider.close(); } @@ -87,14 +89,15 @@ public class LinearMicroScheduler extends MicroScheduler { } // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine - if( getTraversalEngine() instanceof TraverseActiveRegions ) { - final Object result = ((TraverseActiveRegions) getTraversalEngine()).endTraversal(walker, accumulator.getReduceInit()); + if( traversalEngine instanceof TraverseActiveRegions ) { + final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator } Object result = accumulator.finishTraversal(); outputTracker.close(); + returnTraversalEngine(traversalEngine); cleanup(); executionIsDone(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 893548a9b..030f8d0f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -63,14 +63,36 @@ import java.util.Map; * Time: 12:37:23 PM * * General base class for all scheduling algorithms + * Shards and schedules data in manageable chunks. 
+ * + * Creates N TraversalEngines for each data thread for the MicroScheduler. This is necessary + * because in the HMS case you have multiple threads executing a traversal engine independently, and + * these engines may need to create separate resources for efficiency or implementation reasons. For example, + * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific. + * So each HMS thread needs to have it's own distinct copy of the traversal engine if it wants to have + * N data threads x M nano threads => N * M threads total. These are borrowed from this microscheduler + * and returned when done. Also allows us to tracks all created traversal engines so this microscheduler + * can properly shut them all down when the scheduling is done. + * */ - -/** Shards and schedules data in manageable chunks. */ public abstract class MicroScheduler implements MicroSchedulerMBean { // TODO -- remove me and retire non nano scheduled versions of traversals private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true; protected static final Logger logger = Logger.getLogger(MicroScheduler.class); + /** + * The list of all Traversal engines we've created in this micro scheduler + */ + final List allCreatedTraversalEngines = new LinkedList(); + + /** + * All available engines. Engines are borrowed and returned when a subclass is actually + * going to execute the engine on some data. This allows us to have N copies for + * N data parallel executions, but without the dangerous code of having local + * ThreadLocal variables. + */ + final LinkedList availableTraversalEngines = new LinkedList(); + /** * Counts the number of instances of the class that are currently alive. 
*/ @@ -81,7 +103,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ protected final GenomeAnalysisEngine engine; - private TraversalEngineCreator traversalEngineCreator; protected final IndexedFastaSequenceFile reference; private final SAMDataSource reads; @@ -158,13 +179,27 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { this.reads = reads; this.reference = reference; this.rods = rods; - this.traversalEngineCreator = new TraversalEngineCreator(walker, threadAllocation); final File progressLogFile = engine.getArguments() == null ? null : engine.getArguments().performanceLog; + + // Creates uninitialized TraversalEngines appropriate for walker and threadAllocation, + // and adds it to the list of created engines for later shutdown. + for ( int i = 0; i < threadAllocation.getNumDataThreads(); i++ ) { + final TraversalEngine traversalEngine = createTraversalEngine(walker, threadAllocation); + allCreatedTraversalEngines.add(traversalEngine); + availableTraversalEngines.add(traversalEngine); + } + logger.info("Creating " + threadAllocation.getNumDataThreads() + " traversal engines"); + + // Create our progress meter this.progressMeter = new ProgressMeter(progressLogFile, - traversalEngineCreator.getTraversalUnits(), + availableTraversalEngines.peek().getTraversalUnits(), engine.getRegionsOfGenomeBeingProcessed()); + // Now that we have a progress meter, go through and initialize the traversal engines + for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines ) + traversalEngine.initialize(engine, progressMeter); + // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. // To get around this limitation and since we have no job identifier at this point, register a simple counter that // will count the number of instances of this object that have been created in this JVM. 
@@ -179,6 +214,35 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } } + /** + * Really make us a traversal engine of the appropriate type for walker and thread allocation + * + * @return a non-null uninitialized traversal engine + */ + @Ensures("result != null") + private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { + if (walker instanceof ReadWalker) { + if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) + return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); + else + return new TraverseReads(); + } else if (walker instanceof LocusWalker) { + if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) + return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); + else + return new TraverseLociLinear(); + } else if (walker instanceof DuplicateWalker) { + return new TraverseDuplicates(); + } else if (walker instanceof ReadPairWalker) { + return new TraverseReadPairs(); + } else if (walker instanceof ActiveRegionWalker) { + return new TraverseActiveRegions(); + } else { + throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); + } + } + + /** * Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one * @@ -228,9 +292,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { protected void executionIsDone() { progressMeter.notifyDone(engine.getCumulativeMetrics().getNumIterations()); printReadFilteringStats(); - - traversalEngineCreator.shutdown(); - traversalEngineCreator = null; + shutdownTraversalEngines(); // Print out the threading efficiency of this HMS, if state monitoring is enabled if ( threadEfficiencyMonitor != null ) { @@ -240,6 +302,23 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } } + /** + * Shutdown all of 
the created engines, and clear the list of created engines, dropping + * pointers to the traversal engines + */ + public synchronized void shutdownTraversalEngines() { + if ( availableTraversalEngines.size() != allCreatedTraversalEngines.size() ) + throw new IllegalStateException("Shutting down TraversalEngineCreator but not all engines " + + "have been returned. Expected " + allCreatedTraversalEngines.size() + " but only " + availableTraversalEngines.size() + + " have been returned"); + + for ( final TraversalEngine te : allCreatedTraversalEngines) + te.shutdown(); + + allCreatedTraversalEngines.clear(); + availableTraversalEngines.clear(); + } + /** * Prints out information about number of reads observed and filtering, if any reads were used in the traversal * @@ -301,111 +380,34 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { /** * Returns a traversal engine suitable for use in this thread. * - * May create a new traversal engine for this thread, if this is the first - * time this thread ever asked for a TraversalEngine. + * Pops the next available engine from the available ones maintained by this + * microscheduler. Note that it's a runtime error to pop a traversal engine + * from this scheduler if there are none available. 
Callers that + * once pop'd an engine for use must return it with returnTraversalEngine * * @return a non-null TraversalEngine suitable for execution in this scheduler */ - public TraversalEngine getTraversalEngine() { - return traversalEngineCreator.get(); + @Ensures("result != null") + protected synchronized TraversalEngine borrowTraversalEngine() { + if ( availableTraversalEngines.isEmpty() ) + throw new IllegalStateException("no traversal engines were available"); + else { + return availableTraversalEngines.pop(); + } } /** - * ThreadLocal TraversalEngine creator + * Return a borrowed traversal engine to this MicroScheduler, for later use + * in another traversal execution * - * TraversalEngines are thread local variables to the MicroScheduler. This is necessary - * because in the HMS case you have multiple threads executing a traversal engine independently, and - * these engines may need to create separate resources for efficiency or implementation reasons. For example, - * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific. - * So each HMS thread needs to have it's own distinct copy of the traversal engine if it wants to have - * N data threads x M nano threads => N * M threads total. - * - * This class also tracks all created traversal engines so this microscheduler can properly - * shut them all down when the scheduling is done. + * @param traversalEngine the borrowed traversal engine. Must have been previously borrowed. */ - private class TraversalEngineCreator extends ThreadLocal { - final List createdEngines = new LinkedList(); - final Walker walker; - final ThreadAllocation threadAllocation; + protected synchronized void returnTraversalEngine(final TraversalEngine traversalEngine) { + if ( traversalEngine == null ) + throw new IllegalArgumentException("Attempting to push a null traversal engine"); + if ( ! 
allCreatedTraversalEngines.contains(traversalEngine) ) + throw new IllegalArgumentException("Attempting to push a traversal engine not created by this MicroScheduler" + engine); - /** - * Creates an initialized TraversalEngine appropriate for walker and threadAllocation, - * and adds it to the list of created engines for later shutdown. - * - * @return a non-null traversal engine - */ - @Override - protected synchronized TraversalEngine initialValue() { - final TraversalEngine traversalEngine = createEngine(); - traversalEngine.initialize(engine, progressMeter); - createdEngines.add(traversalEngine); - return traversalEngine; - } - - /** - * Returns the traversal units for traversal engines created here. - * - * This (unfortunately) creates an uninitialized tmp. TraversalEngine so we can get - * it's traversal units, and then immediately shuts it down... - * - * @return the traversal unit as returned by getTraversalUnits of TraversalEngines created here - */ - protected String getTraversalUnits() { - final TraversalEngine tmp = createEngine(); - final String units = tmp.getTraversalUnits(); - tmp.shutdown(); - return units; - } - - /** - * Really make us a traversal engine of the appropriate type for walker and thread allocation - * - * @return a non-null uninitialized traversal engine - */ - @Ensures("result != null") - protected TraversalEngine createEngine() { - if (walker instanceof ReadWalker) { - if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) - return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); - else - return new TraverseReads(); - } else if (walker instanceof LocusWalker) { - if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) - return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); - else - return new TraverseLociLinear(); - } else if (walker instanceof DuplicateWalker) { - return new TraverseDuplicates(); - } 
else if (walker instanceof ReadPairWalker) { - return new TraverseReadPairs(); - } else if (walker instanceof ActiveRegionWalker) { - return new TraverseActiveRegions(); - } else { - throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); - } - } - - /** - * Create a TraversalEngineCreator that makes TraversalEngines appropriate for walker and threadAllocation - * - * @param walker the walker we need traversal engines for - * @param threadAllocation what kind of threading will we use in the traversal? - */ - @com.google.java.contract.Requires({"walker != null", "threadAllocation != null"}) - public TraversalEngineCreator(final Walker walker, final ThreadAllocation threadAllocation) { - super(); - this.walker = walker; - this.threadAllocation = threadAllocation; - } - - /** - * Shutdown all of the created engines, and clear the list of created engines, dropping - * pointers to the traversal engines - */ - public synchronized void shutdown() { - for ( final TraversalEngine te : traversalEngineCreator.createdEngines ) - te.shutdown(); - createdEngines.clear(); - } + availableTraversalEngines.push(traversalEngine); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index e8f15ebef..d632892d5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvide import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; +import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.walkers.Walker; import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -50,6 +51,7 @@ public class ShardTraverser implements Callable { } public Object call() { + final TraversalEngine traversalEngine = microScheduler.borrowTraversalEngine(); try { final long startTime = System.currentTimeMillis(); @@ -61,7 +63,7 @@ public class ShardTraverser implements Callable { for(WindowMaker.WindowMakerIterator iterator: windowMaker) { final ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),microScheduler.getEngine().getGenomeLocParser(),iterator.getLocus(),iterator,microScheduler.reference,microScheduler.rods); - accumulator = microScheduler.getTraversalEngine().traverse(walker, dataProvider, accumulator); + accumulator = traversalEngine.traverse(walker, dataProvider, accumulator); dataProvider.close(); } @@ -79,6 +81,7 @@ public class ShardTraverser implements Callable { } finally { synchronized(this) { complete = true; + microScheduler.returnTraversalEngine(traversalEngine); notifyAll(); } } From d6e42d839cde59bc815e4a151fce912c638c15c9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 10 Sep 2012 16:39:49 -0400 Subject: [PATCH 14/19] Fixes GSA-558 GATK ReadShards don't handle unmapped reads correctly. 
--- .../sting/gatk/datasources/reads/ReadShard.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index fd1ee9859..def27b20d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -8,7 +8,10 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; /** * @@ -149,7 +152,12 @@ public class ReadShard extends Shard { if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); } - return parser.createGenomeLoc(contig, start, stop); + assert contig != null; + + if ( contig.equals("*") ) // all reads are unmapped + return GenomeLoc.UNMAPPED; + else + return parser.createGenomeLoc(contig, start, stop); } } } From e25e617d1a47f050247b456a709dcbff13402243 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 11 Sep 2012 07:38:34 -0400 Subject: [PATCH 16/19] Fixes GSA-515 Nanoscheduler GSA-560 / Fix display of NanoScheduler and MonitoringEfficiency -- Now prints out a single combined NanoScheduler runtime profile report across all nano schedulers in use. So now if you run with -nt 4 you'll get one combined NanoScheduler profiler across all 4 instances of the NanoScheduler within TraverseXNano. 
--- .../sting/gatk/executive/MicroScheduler.java | 4 + .../sting/utils/SimpleTimer.java | 9 ++ .../utils/nanoScheduler/InputProducer.java | 16 ++- .../utils/nanoScheduler/NSRuntimeProfile.java | 69 +++++++++++ .../utils/nanoScheduler/NanoScheduler.java | 112 +++++++++--------- .../utils/nanoScheduler/ReducerThread.java | 5 +- .../nanoScheduler/InputProducerUnitTest.java | 3 +- .../nanoScheduler/NanoSchedulerUnitTest.java | 15 ++- .../nanoScheduler/ReducerThreadUnitTest.java | 3 +- 9 files changed, 170 insertions(+), 66 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 030f8d0f2..a78ab4375 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -42,6 +42,7 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; @@ -315,6 +316,9 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { for ( final TraversalEngine te : allCreatedTraversalEngines) te.shutdown(); + // horrible hack to print nano scheduling information across all nano schedulers, if any were used + NanoScheduler.printCombinedRuntimeProfile(); + allCreatedTraversalEngines.clear(); availableTraversalEngines.clear(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java 
index b3a9986c5..4c54d4126 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java +++ b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java @@ -145,4 +145,13 @@ public class SimpleTimer { public synchronized long getElapsedTimeNano() { return running ? (currentTimeNano() - startTimeNano + elapsedTimeNano) : elapsedTimeNano; } + + /** + * Add the elapsed time from toAdd to this elapsed time + * + * @param toAdd the timer whose elapsed time we want to add to this timer + */ + public synchronized void addElapsed(final SimpleTimer toAdd) { + elapsedTimeNano += toAdd.getElapsedTimeNano(); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java index 29dddbc49..f5eb53456 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java @@ -29,6 +29,7 @@ class InputProducer implements Runnable { final SimpleTimer inputTimer, final BlockingQueue outputQueue) { if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); + if ( inputTimer == null ) throw new IllegalArgumentException("inputTimer cannot be null"); if ( outputQueue == null ) throw new IllegalArgumentException("OutputQueue cannot be null"); this.inputReader = inputReader; @@ -38,11 +39,16 @@ class InputProducer implements Runnable { public void run() { try { - while ( inputReader.hasNext() ) { - if ( inputTimer != null ) inputTimer.restart(); - final InputType input = inputReader.next(); - if ( inputTimer != null ) inputTimer.stop(); - outputQueue.put(new InputValue(input)); + while ( true ) { + inputTimer.restart(); + if ( ! 
inputReader.hasNext() ) { + inputTimer.stop(); + break; + } else { + final InputType input = inputReader.next(); + inputTimer.stop(); + outputQueue.put(new InputValue(input)); + } } // add the EOF object so our consumer knows we are done in all inputs diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java new file mode 100644 index 000000000..874434eae --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java @@ -0,0 +1,69 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.broadinstitute.sting.utils.SimpleTimer; + +/** + * Holds runtime profile (input, read, map) times as tracked by NanoScheduler + * + * User: depristo + * Date: 9/10/12 + * Time: 8:31 PM + */ +public class NSRuntimeProfile { + final SimpleTimer outsideSchedulerTimer = new SimpleTimer("outside"); + final SimpleTimer inputTimer = new SimpleTimer("input"); + final SimpleTimer mapTimer = new SimpleTimer("map"); + final SimpleTimer reduceTimer = new SimpleTimer("reduce"); + + /** + * Combine the elapsed time information from other with this profile + * + * @param other a non-null profile + */ + public void combine(final NSRuntimeProfile other) { + outsideSchedulerTimer.addElapsed(other.outsideSchedulerTimer); + inputTimer.addElapsed(other.inputTimer); + mapTimer.addElapsed(other.mapTimer); + reduceTimer.addElapsed(other.reduceTimer); + } + + /** + * Print the runtime profiling to logger + * + * @param logger + */ + public void log(final Logger logger) { + log1(logger, "Input time", inputTimer); + log1(logger, "Map time", mapTimer); + log1(logger, "Reduce time", reduceTimer); + log1(logger, "Outside time", outsideSchedulerTimer); + } + + /** 
+ * @return the total runtime for all functions of this nano scheduler + */ + @Ensures("result >= 0.0") + public double totalRuntimeInSeconds() { + return inputTimer.getElapsedTime() + + mapTimer.getElapsedTime() + + reduceTimer.getElapsedTime() + + outsideSchedulerTimer.getElapsedTime(); + } + + /** + * Print to logger.info timing information from timer, with name label + * + * @param label the name of the timer to display. Should be human readable + * @param timer the timer whose elapsed time we will display + */ + @Requires({"label != null", "timer != null"}) + private void log1(final Logger logger, final String label, final SimpleTimer timer) { + final double myTimeInSec = timer.getElapsedTime(); + final double myTimePercent = myTimeInSec / totalRuntimeInSeconds() * 100; + logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 664fb7b9b..bb9afa879 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -3,8 +3,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.AutoFormattingTime; -import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.threading.NamedThreadFactory; @@ -46,7 +44,6 @@ public class NanoScheduler { private final static Logger logger = Logger.getLogger(NanoScheduler.class); private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; private final static boolean LOG_MAP_TIMES = false; - private final static boolean 
TIME_CALLS = true; private final static int MAP_BUFFER_SIZE_SCALE_FACTOR = 100; @@ -61,10 +58,15 @@ public class NanoScheduler { boolean debug = false; private NSProgressFunction progressFunction = null; - final SimpleTimer outsideSchedulerTimer = TIME_CALLS ? new SimpleTimer("outside") : null; - final SimpleTimer inputTimer = TIME_CALLS ? new SimpleTimer("input") : null; - final SimpleTimer mapTimer = TIME_CALLS ? new SimpleTimer("map") : null; - final SimpleTimer reduceTimer = TIME_CALLS ? new SimpleTimer("reduce") : null; + /** + * Tracks the combined runtime profiles across all created nano schedulers + */ + final static private NSRuntimeProfile combinedNSRuntimeProfiler = new NSRuntimeProfile(); + + /** + * The profile specific to this nano scheduler + */ + final private NSRuntimeProfile myNSRuntimeProfile = new NSRuntimeProfile(); /** * Create a new nanoscheduler with the desire characteristics requested by the argument @@ -92,7 +94,7 @@ public class NanoScheduler { } // start timing the time spent outside of the nanoScheduler - outsideSchedulerTimer.start(); + myNSRuntimeProfile.outsideSchedulerTimer.start(); } /** @@ -119,21 +121,31 @@ public class NanoScheduler { * After this call, execute cannot be invoked without throwing an error */ public void shutdown() { - outsideSchedulerTimer.stop(); + myNSRuntimeProfile.outsideSchedulerTimer.stop(); + + // add my timing information to the combined NS runtime profile + combinedNSRuntimeProfiler.combine(myNSRuntimeProfile); if ( nThreads > 1 ) { shutdownExecutor("inputExecutor", inputExecutor); shutdownExecutor("mapExecutor", mapExecutor); shutdownExecutor("reduceExecutor", reduceExecutor); } - shutdown = true; - if (TIME_CALLS) { - printTimerInfo("Input time", inputTimer); - printTimerInfo("Map time", mapTimer); - printTimerInfo("Reduce time", reduceTimer); - printTimerInfo("Outside time", outsideSchedulerTimer); - } + shutdown = true; + } + + public void printRuntimeProfile() { + myNSRuntimeProfile.log(logger); + 
} + + public static void printCombinedRuntimeProfile() { + if ( combinedNSRuntimeProfiler.totalRuntimeInSeconds() > 0.1 ) + combinedNSRuntimeProfiler.log(logger); + } + + protected double getTotalRuntime() { + return myNSRuntimeProfile.totalRuntimeInSeconds(); } /** @@ -154,21 +166,6 @@ public class NanoScheduler { throw new IllegalStateException(remaining.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!"); } - /** - * Print to logger.info timing information from timer, with name label - * - * @param label the name of the timer to display. Should be human readable - * @param timer the timer whose elapsed time we will display - */ - @Requires({"label != null", "timer != null"}) - private void printTimerInfo(final String label, final SimpleTimer timer) { - final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() - + reduceTimer.getElapsedTime() + outsideSchedulerTimer.getElapsedTime(); - final double myTimeInSec = timer.getElapsedTime(); - final double myTimePercent = myTimeInSec / total * 100; - logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); - } - /** * @return true if this nanoScheduler is shutdown, or false if its still open for business */ @@ -246,7 +243,7 @@ public class NanoScheduler { if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); - outsideSchedulerTimer.stop(); + myNSRuntimeProfile.outsideSchedulerTimer.stop(); ReduceType result; if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { @@ -255,7 +252,7 @@ public class NanoScheduler { result = executeMultiThreaded(inputReader, map, initialValue, reduce); } - outsideSchedulerTimer.restart(); + myNSRuntimeProfile.outsideSchedulerTimer.restart(); return result; } @@ -272,28 +269,31 @@ public class NanoScheduler { ReduceType sum = initialValue; int i = 0; - // start 
timer to ensure that both hasNext and next are caught by the timer - if ( TIME_CALLS ) inputTimer.restart(); - while ( inputReader.hasNext() ) { - final InputType input = inputReader.next(); - if ( TIME_CALLS ) inputTimer.stop(); + while ( true ) { + // start timer to ensure that both hasNext and next are caught by the timer + myNSRuntimeProfile.inputTimer.restart(); + if ( ! inputReader.hasNext() ) { + myNSRuntimeProfile.inputTimer.stop(); + break; + } else { + final InputType input = inputReader.next(); + myNSRuntimeProfile.inputTimer.stop(); - // map - if ( TIME_CALLS ) mapTimer.restart(); - final long preMapTime = LOG_MAP_TIMES ? 0 : mapTimer.currentTimeNano(); - final MapType mapValue = map.apply(input); - if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (mapTimer.currentTimeNano() - preMapTime)); - if ( TIME_CALLS ) mapTimer.stop(); + // map + myNSRuntimeProfile.mapTimer.restart(); + final long preMapTime = LOG_MAP_TIMES ? 0 : myNSRuntimeProfile.mapTimer.currentTimeNano(); + final MapType mapValue = map.apply(input); + if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (myNSRuntimeProfile.mapTimer.currentTimeNano() - preMapTime)); + myNSRuntimeProfile.mapTimer.stop(); - if ( i++ % inputBufferSize == 0 && progressFunction != null ) - progressFunction.progress(input); + if ( i++ % inputBufferSize == 0 && progressFunction != null ) + progressFunction.progress(input); - // reduce - if ( TIME_CALLS ) reduceTimer.restart(); - sum = reduce.apply(mapValue, sum); - if ( TIME_CALLS ) reduceTimer.stop(); - - if ( TIME_CALLS ) inputTimer.restart(); + // reduce + myNSRuntimeProfile.reduceTimer.restart(); + sum = reduce.apply(mapValue, sum); + myNSRuntimeProfile.reduceTimer.stop(); + } } return sum; @@ -321,11 +321,11 @@ public class NanoScheduler { new LinkedBlockingDeque>>(mapBufferSize); // Start running the input reader thread - inputExecutor.submit(new InputProducer(inputReader, inputTimer, inputQueue)); + inputExecutor.submit(new InputProducer(inputReader, 
myNSRuntimeProfile.inputTimer, inputQueue)); // Start running the reducer thread final ReducerThread reducer - = new ReducerThread(reduce, reduceTimer, initialValue, mapResultQueue); + = new ReducerThread(reduce, myNSRuntimeProfile.reduceTimer, initialValue, mapResultQueue); final Future reduceResult = reduceExecutor.submit(reducer); try { @@ -382,10 +382,10 @@ public class NanoScheduler { @Override public MapResult call() { - if ( TIME_CALLS ) mapTimer.restart(); if ( debug ) debugPrint("\t\tmap " + input); + myNSRuntimeProfile.mapTimer.restart(); final MapType result = map.apply(input); - if ( TIME_CALLS ) mapTimer.stop(); + myNSRuntimeProfile.mapTimer.stop(); return new MapResult(result, id); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java index 506e45453..dcdba3490 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java @@ -29,6 +29,7 @@ class ReducerThread implements Callable { final ReduceType sum, final BlockingQueue>> mapResultQueue) { if ( reduce == null ) throw new IllegalArgumentException("Reduce function cannot be null"); + if ( reduceTimer == null ) throw new IllegalArgumentException("reduceTimer cannot be null"); if ( mapResultQueue == null ) throw new IllegalArgumentException("mapResultQueue cannot be null"); this.reduce = reduce; @@ -51,9 +52,9 @@ class ReducerThread implements Callable { } else { lastJobID = result.getJobID(); // apply reduce, keeping track of sum - if ( reduceTimer != null ) reduceTimer.restart(); + reduceTimer.restart(); sum = reduce.apply(result.getValue(), sum); - if ( reduceTimer != null ) reduceTimer.stop(); + reduceTimer.stop(); } } } catch (ExecutionException ex) { diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java index b3365c13c..b3986e74e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.SimpleTimer; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -42,7 +43,7 @@ public class InputProducerUnitTest extends BaseTest { final LinkedBlockingDeque.InputValue> readQueue = new LinkedBlockingDeque.InputValue>(queueSize); - final InputProducer ip = new InputProducer(elements.iterator(), null, readQueue); + final InputProducer ip = new InputProducer(elements.iterator(), new SimpleTimer(), readQueue); final ExecutorService es = Executors.newSingleThreadExecutor(); es.submit(ip); diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 47dcc1d5e..a0ab493c1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; import org.apache.log4j.BasicConfigurator; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.SimpleTimer; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -86,7 +87,7 @@ public class NanoSchedulerUnitTest extends BaseTest { static NanoSchedulerBasicTest exampleTest = null; @DataProvider(name = "NanoSchedulerBasicTest") public Object[][] createNanoSchedulerBasicTest() { - for ( final int bufferSize : 
Arrays.asList(1, 10, 1000, 1000000) ) { + for ( final int bufferSize : Arrays.asList(1, 10, 1000, 1000000, 10000000) ) { for ( final int nt : Arrays.asList(1, 2, 4) ) { for ( final int start : Arrays.asList(0) ) { for ( final int end : Arrays.asList(0, 1, 2, 11, 10000, 100000) ) { @@ -114,6 +115,7 @@ public class NanoSchedulerUnitTest extends BaseTest { } private void testNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { + final SimpleTimer timer = new SimpleTimer().start(); final NanoScheduler nanoScheduler = new NanoScheduler(test.bufferSize, test.nThreads); @@ -129,6 +131,17 @@ public class NanoSchedulerUnitTest extends BaseTest { Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected. Expected at least " + test.nExpectedCallbacks() + " but saw only " + callback.callBacks); nanoScheduler.shutdown(); + + // TODO -- need to enable only in the case where there's serious time spent in + // TODO -- read / map / reduce, otherwise the "outside" timer doesn't add up + final double myTimeEstimate = timer.getElapsedTime(); + final double tolerance = 0.1; + if ( false && myTimeEstimate > 0.1 ) { + Assert.assertTrue(nanoScheduler.getTotalRuntime() > myTimeEstimate * tolerance, + "NanoScheduler said that the total runtime was " + nanoScheduler.getTotalRuntime() + + " but the overall test time was " + myTimeEstimate + ", beyond our tolerance factor of " + + tolerance); + } } @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java index 61d1330bc..08771e9ec 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.SimpleTimer; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -61,7 +62,7 @@ public class ReducerThreadUnitTest extends BaseTest { final ReduceSumTest reduce = new ReduceSumTest(mapResultsQueue); final ReducerThread thread - = new ReducerThread(reduce, null, 0, mapResultsQueue); + = new ReducerThread(reduce, new SimpleTimer(), 0, mapResultsQueue); final ExecutorService es = Executors.newSingleThreadExecutor(); final Future value = es.submit(thread); From 6fad0f25bb88fc201c7575ecf5a94f588eab33f7 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Tue, 11 Sep 2012 10:34:14 -0400 Subject: [PATCH 17/19] Merge Eric's LocusIteratorByStateUnitTest changes into LocusIteratorByStateExperimentalUnitTest --- ...usIteratorByStateExperimentalUnitTest.java | 269 ++++++++++++------ 1 file changed, 185 insertions(+), 84 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java index c148bcf84..9d592cd26 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java @@ -1,8 +1,6 @@ package org.broadinstitute.sting.gatk.iterators; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; +import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.ReadProperties; @@ -39,57 +37,10 @@ public class 
LocusIteratorByStateExperimentalUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } - private final LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { + private LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { return new LocusIteratorByStateExperimental(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups()); } - private static ReadProperties createTestReadProperties() { - return createTestReadProperties(null); - } - - private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { - return new ReadProperties( - Collections.emptyList(), - new SAMFileHeader(), - false, - SAMFileReader.ValidationStringency.STRICT, - downsamplingMethod, - new ValidationExclusion(), - Collections.emptyList(), - Collections.emptyList(), - false, - (byte) -1 - ); - } - - private static class FakeCloseableIterator implements CloseableIterator { - Iterator iterator; - - public FakeCloseableIterator(Iterator it) { - iterator = it; - } - - @Override - public void close() { - return; - } - - @Override - public boolean hasNext() { - return iterator.hasNext(); - } - - @Override - public T next() { - return iterator.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Don't remove!"); - } - } - @Test public void testXandEQOperators() { final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; @@ -308,45 +259,36 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { // comprehensive LIBS/PileupElement tests // //////////////////////////////////////////// - private static final int IS_BEFORE_DELETED_BASE_FLAG = 1; - private static final int IS_BEFORE_DELETION_START_FLAG = 2; - private static final int IS_AFTER_DELETED_BASE_FLAG = 4; - private static final int 
IS_AFTER_DELETION_END_FLAG = 8; - private static final int IS_BEFORE_INSERTION_FLAG = 16; - private static final int IS_AFTER_INSERTION_FLAG = 32; - private static final int IS_NEXT_TO_SOFTCLIP_FLAG = 64; - private static class LIBSTest { final String cigar; final int readLength; - final List offsets; - final List flags; - private LIBSTest(final String cigar, final int readLength, final List offsets, final List flags) { + private LIBSTest(final String cigar, final int readLength) { this.cigar = cigar; this.readLength = readLength; - this.offsets = offsets; - this.flags = flags; } } @DataProvider(name = "LIBSTest") public Object[][] createLIBSTestData() { + + //TODO -- when LIBS is fixed this should be replaced to provide all possible permutations of CIGAR strings + return new Object[][]{ - {new LIBSTest("1I", 1, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, - {new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, - {new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))}, - {new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("1I", 1)}, + {new LIBSTest("10I", 10)}, + {new LIBSTest("2M2I2M", 6)}, + {new LIBSTest("2M2I", 4)}, //TODO -- uncomment these when LIBS is fixed //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))}, - {new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))}, - {new LIBSTest("1S1M", 2, Arrays.asList(1), 
Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, - {new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, - {new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))} + //{new LIBSTest("1M2D2M", 3)}, + {new LIBSTest("1S1M", 2)}, + {new LIBSTest("1M1S", 2)}, + {new LIBSTest("1S1M1I", 3)} }; } @@ -361,26 +303,24 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { // create the iterator by state with the fake reads and fake records li = makeLTBS(Arrays.asList(read), createTestReadProperties()); + final LIBS_position tester = new LIBS_position(read); - int offset = 0; while ( li.hasNext() ) { AlignmentContext alignmentContext = li.next(); ReadBackedPileup p = alignmentContext.getBasePileup(); Assert.assertTrue(p.getNumberOfElements() == 1); PileupElement pe = p.iterator().next(); - final int flag = params.flags.get(offset); - Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0); - Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0); - Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0); - Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0); - Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0); - Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0); - Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0); + tester.stepForwardOnGenome(); - Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue()); - - offset++; + Assert.assertEquals(pe.isBeforeDeletedBase(), tester.isBeforeDeletedBase); + Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart); + Assert.assertEquals(pe.isAfterDeletedBase(), tester.isAfterDeletedBase); + Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd); + 
Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion); + Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion); + Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip); + Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset()); } } @@ -543,4 +483,165 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { test.run(); } + + /////////////////////////////////////// + // End Read State Manager Tests // + /////////////////////////////////////// + + + + /////////////////////////////////////// + // Helper methods / classes // + /////////////////////////////////////// + + private static ReadProperties createTestReadProperties() { + return createTestReadProperties(null); + } + + private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { + return new ReadProperties( + Collections.emptyList(), + new SAMFileHeader(), + false, + SAMFileReader.ValidationStringency.STRICT, + downsamplingMethod, + new ValidationExclusion(), + Collections.emptyList(), + Collections.emptyList(), + false, + (byte) -1 + ); + } + + private static class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; + + public FakeCloseableIterator(Iterator it) { + iterator = it; + } + + @Override + public void close() {} + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } + } + + private static final class LIBS_position { + + SAMRecord read; + + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; + + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + 
boolean isNextToSoftClip = false; + + boolean sawMop = false; + + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } + + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } + + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. + */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) + return false; + + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; + + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. 
the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; + break; + + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } + + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || (!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; + } + + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } + + private static boolean isAfterOp(final Cigar 
cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + } + } } From 13831106d570ce029fbc04ebab7aeb06a6ed1557 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 11 Sep 2012 11:01:26 -0400 Subject: [PATCH 18/19] Fix GSA-535: storing likelihoods in allele map was busted when running HaplotypeCaller, only the last likelihood of a haplotype was being stored, as opposed to the max likelihood of all haplotypes mapping to an allele --- .../LikelihoodCalculationEngine.java | 9 +++++---- .../HaplotypeCallerIntegrationTest.java | 14 +++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 69af66185..db289ecab 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -335,14 +335,15 @@ public class LikelihoodCalculationEngine { final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same! 
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { - final double likelihoods[] = new double[call.getFirst().getAlleles().size()]; - int count = 0; - for( final Allele a : call.getFirst().getAlleles() ) { + double maxLikelihood = Double.NEGATIVE_INFINITY; for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object) final double likelihood = h.getReadLikelihoods(sample.getKey())[iii]; - likelihoodMap.add(read, a, likelihood); + if( likelihood > maxLikelihood ) { + maxLikelihood = likelihood; + } } + likelihoodMap.add(read, a, maxLikelihood); } } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index b5359af46..b45c027a7 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,17 +21,17 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "e5b4a0627a1d69b9356f8a7cd2260e89"); + HCTest(CEUTRIO_BAM, "", "5b751474ad0aef4cdb53f094e605f97c"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "202d5b6edaf74f411c170099749f202f"); + HCTest(NA12878_BAM, "", "60efcd2d2722087e900f6365985d18bf"); } @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "561931ba3919808ec471e745cb3148c7"); + HCTest(CEUTRIO_BAM, "-gt_mode 
GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "71bec55320a2f07af0d54be9d7735322"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -42,7 +42,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(CEUTRIO_BAM, "", "3424b398a9f47c8ac606a5c56eb7d8a7"); + HCTestComplexVariants(CEUTRIO_BAM, "", "f5a809e3fbd9998f79b75bb2973209e1"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "b71cfaea9390136c584c9671b149d573"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "8043b0451a4384e678a93600b34afce7"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -64,13 +64,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "e1f88fac91424740c0eaac1de48b3970"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "ea6539e05faf10ffaf76f2d16907c47a"); } @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("000fd36d5cf8090386bb2ac15e3ab0b5")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8d092b25f40456e618eef91fdce8adf0")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } }