diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index a61614481..94f6ff649 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -56,11 +56,14 @@ public class AlleleBiasedDownsamplingUtils { for ( int i = 0; i < 4; i++ ) alleleStratifiedElements[i] = new ArrayList(); + // keep all of the reduced reads + final ArrayList reducedReadPileups = new ArrayList(); + // start by stratifying the reads by the alleles they represent at this position for( final PileupElement pe : pileup ) { - // abort if we have a reduced read - we do not want to remove it! + // we do not want to remove a reduced read if ( pe.getRead().isReducedRead() ) - return pileup; + reducedReadPileups.add(pe); final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); if ( baseIndex != -1 ) @@ -76,6 +79,7 @@ public class AlleleBiasedDownsamplingUtils { return difference != 0 ? 
difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); } }); + elementsToKeep.addAll(reducedReadPileups); // make a listing of allele counts final int[] alleleCounts = new int[4]; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 629a27c48..2061c5364 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -85,7 +85,7 @@ import java.util.*; */ @DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@PartitionBy(PartitionType.INTERVAL) +@PartitionBy(PartitionType.CONTIG) @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) public class ReduceReads extends ReadWalker, ReduceReadsStash> { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java index e9ed6b153..0a3512aa6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java @@ -240,8 +240,8 @@ public class AFCalcPerformanceTest { if ( a.isNonReference() ) { final String warningmeMLE = call.originalCall.getAlleleCountAtMLE(a) != result.getAlleleCountAtMLE(a) ? 
" DANGER-MLE-DIFFERENT" : ""; logger.info("\t\t MLE " + a + ": " + call.originalCall.getAlleleCountAtMLE(a) + " vs " + result.getAlleleCountAtMLE(a) + warningmeMLE); - final String warningmePost = call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) == 0 && result.getLog10PosteriorOfAFGt0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : ""; - logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFGt0ForAllele(a) + warningmePost); + final String warningmePost = call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) == 0 && result.getLog10PosteriorOfAFEq0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : ""; + logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFEq0ForAllele(a) + warningmePost); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index fee6c86f8..4d81d0010 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -33,7 +33,6 @@ import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -449,10 +448,10 @@ public class GenotypingEngine { final ArrayList undeterminedHaplotypes = new ArrayList(haplotypes.size()); for( final Haplotype h 
: haplotypes ) { - if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype - alleleMapper.get(refAllele).add(h); - } else if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) { + if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) { alleleMapper.get(h.getArtificialAllele()).add(h); + } else if( h.getEventMap().get(loc) == null ) { // no event at this location so let's investigate later + undeterminedHaplotypes.add(h); } else { boolean haplotypeIsDetermined = false; for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index 4f072d720..3c5a1f79c 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -282,10 +282,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final VariantContext compVC : activeAllelesToGenotype ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); - if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { - return returnHaplotypes; - //throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype); - } + addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, 
activeRegionStart, activeRegionStop, true ); } } @@ -293,7 +290,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); - if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { + if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ) ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present if( !activeAllelesToGenotype.isEmpty() ) { @@ -308,7 +305,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // falls into the bin for the 1bp deletion because we keep track of the artificial alleles). if( vcOnHaplotype == null ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ); + addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ); } } } @@ -332,7 +329,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { return returnHaplotypes; } - private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList haplotypeList, final int activeRegionStart, final int activeRegionStop ) { + private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) { if( haplotype == null ) { return false; } final 
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); @@ -387,7 +384,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { return false; } - if( !haplotypeList.contains(h) ) { + if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) { haplotypeList.add(h); return true; } else { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 73bc8fba6..f26194e00 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -80,11 +80,11 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","dd568dc30be90135a3a8957a45a7321c"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b"); } @Test(enabled = true) public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "bf793c43b635a931207170be8035b288"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 9e9c7e37e..c768f95ad 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("d20c7a143b899f0239bf64b652ad3edb")); + Arrays.asList("97df6c2a8d390d43b9bdf56c979d9b09")); executeTest("test Multiple SNP alleles", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("fb204e821a24d03bd3a671b6e01c449a")); + Arrays.asList("935ee705ffe8cc6bf1d9efcceea271c8")); executeTest("test mismatched PLs", spec); } @@ -96,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "5b8f477c287770b5769b05591e35bc2d"; + private final static String COMPRESSED_OUTPUT_MD5 = "3eba6c309514d1e9ee06a20a112b68e6"; @Test public void testCompressedOutput() { @@ -436,8 +436,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, - Arrays.asList("32f18ba50406cd8c8069ba07f2f89558")); + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, + Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6")); executeTest("test calling on reads with Ns in CIGAR", spec); } @@ -451,13 +451,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("c1077662411164182c5f75478344f83d")); + Arrays.asList("092e42a712afb660ec79ff11c55933e2")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "dee6590e3b7079890bc3a9cb372c297e"); + testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f"); } @Test diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index 96e055e92..ee5436264 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -83,8 +83,8 @@ public class AFCalcResultUnitTest extends BaseTest { List tests = new ArrayList(); final List pValues = new LinkedList(); - for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999) ) - for ( final double espilon : Arrays.asList(-1e-5, 0.0, 1e-5) ) + for 
( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999, 1 - 1e-4, 1 - 1e-5, 1 - 1e-6) ) + for ( final double espilon : Arrays.asList(-1e-7, 0.0, 1e-7) ) pValues.add(p + espilon); for ( final double pNonRef : pValues ) { @@ -106,16 +106,16 @@ public class AFCalcResultUnitTest extends BaseTest { alleles, MathUtils.normalizeFromLog10(new double[]{1 - pNonRef, pNonRef}, true, false), log10Even, - Collections.singletonMap(C, Math.log10(pNonRef))); + Collections.singletonMap(C, Math.log10(1 - pNonRef))); } @Test(enabled = true, dataProvider = "TestIsPolymorphic") private void testIsPolymorphic(final double pNonRef, final double pThreshold, final boolean shouldBePoly) { - final AFCalcResult result = makePolymorphicTestData(pNonRef); - final boolean actualIsPoly = result.isPolymorphic(C, Math.log10(pThreshold)); - Assert.assertEquals(actualIsPoly, shouldBePoly, - "isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned " - + actualIsPoly + " but the expected result is " + shouldBePoly); + final AFCalcResult result = makePolymorphicTestData(pNonRef); + final boolean actualIsPoly = result.isPolymorphic(C, Math.log10(1 - pThreshold)); + Assert.assertEquals(actualIsPoly, shouldBePoly, + "isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned " + + actualIsPoly + " but the expected result is " + shouldBePoly); } @Test(enabled = true, dataProvider = "TestIsPolymorphic") diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 2d346e548..7ee909fe0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -681,7 +681,7 @@ public class AFCalcUnitTest extends BaseTest { // must be getCalledChrCount because we cannot 
ensure that the VC made has our desired ACs Assert.assertEquals(result.getAlleleCountAtMLE(alt), vc.getCalledChrCount(alt)); - Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFGt0ForAllele(alt)); + Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFEq0ForAllele(alt)); } } } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java index 391c99990..663471106 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java @@ -148,7 +148,7 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { for ( int i = 0; i < log10LAlleles.size(); i++ ) { final double log10LAllele1 = log10LAlleles.get(i); final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true); - final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, 0.0)); + final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, -10000.0)); originalPriors.add(result1); pNonRefN.add(log10pNonRef*(i+1)); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index e9c1ec605..a80137c27 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -33,7 +33,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "e2b3bf420c45c677956a2e4a56d75ea2"); + "fe84caa79f59ecbd98fcbcd5b30ab164"); } private void HCTestComplexVariants(String bam, String args, String md5) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index b7000e0ee..1187039bb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -445,13 +445,17 @@ public class GenomeAnalysisEngine { protected DownsamplingMethod getDownsamplingMethod() { GATKArgumentCollection argCollection = this.getArguments(); - boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling; + + // Legacy downsampler can only be selected via the command line, not via walker annotations + boolean useLegacyDownsampler = argCollection.useLegacyDownsampler; DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); - DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling); - DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling); + DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useLegacyDownsampler); + DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, 
useLegacyDownsampler); - return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod); + DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod); + method.checkCompatibilityWithWalker(walker); + return method; } protected void setDownsamplingMethod(DownsamplingMethod method) { @@ -580,9 +584,9 @@ public class GenomeAnalysisEngine { throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); } - // Use the experimental ReadShardBalancer if experimental downsampling is enabled - ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useExperimentalDownsampling ? - new ExperimentalReadShardBalancer() : + // Use the legacy ReadShardBalancer if legacy downsampling is enabled + ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useLegacyDownsampler ? + new LegacyReadShardBalancer() : new ReadShardBalancer(); if(intervals == null) diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index 28b5f918d..1a9162e51 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -305,11 +305,23 @@ public class WalkerManager extends PluginManager { * Gets the type of downsampling method requested by the walker. If an alternative * downsampling method is specified on the command-line, the command-line version will * be used instead. - * @param walkerClass The class of the walker to interrogate. - * @param useExperimentalDownsampling If true, use the experimental downsampling implementation + * @param walker The walker to interrogate. 
+ * @param useLegacyDownsampler If true, use the legacy downsampling implementation * @return The downsampling method, as specified by the walker. Null if none exists. */ - public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useExperimentalDownsampling) { + public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useLegacyDownsampler) { + return getDownsamplingMethod(walker.getClass(), useLegacyDownsampler); + } + + /** + * Gets the type of downsampling method requested by the walker. If an alternative + * downsampling method is specified on the command-line, the command-line version will + * be used instead. + * @param walkerClass The class of the walker to interrogate. + * @param useLegacyDownsampler If true, use the legacy downsampling implementation + * @return The downsampling method, as specified by the walker. Null if none exists. + */ + public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useLegacyDownsampler) { DownsamplingMethod downsamplingMethod = null; if( walkerClass.isAnnotationPresent(Downsample.class) ) { @@ -317,7 +329,7 @@ public class WalkerManager extends PluginManager { DownsampleType type = downsampleParameters.by(); Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; - downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling); + downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useLegacyDownsampler); } return downsamplingMethod; @@ -331,18 +343,6 @@ public class WalkerManager extends PluginManager { return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime(); } - /** - * Gets the type of downsampling method requested by the walker. 
If an alternative - * downsampling method is specified on the command-line, the command-line version will - * be used instead. - * @param walker The walker to interrogate. - * @param useExperimentalDownsampling If true, use the experimental downsampling implementation - * @return The downsampling method, as specified by the walker. Null if none exists. - */ - public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) { - return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling); - } - /** * Create a name for this type of walker. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index d0f3e91e0..d9c7c9008 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -162,12 +162,11 @@ public class GATKArgumentCollection { @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) public Double downsampleFraction = null; - @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false) + @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus. 
For non-locus-based traversals (eg., ReadWalkers), this sets the maximum number of reads at each alignment start position.", required = false) public Integer downsampleCoverage = null; - @Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false) - @Hidden - public boolean enableExperimentalDownsampling = false; + @Argument(fullName = "use_legacy_downsampler", shortName = "use_legacy_downsampler", doc = "Use the legacy downsampling implementation instead of the newer, less-tested implementation", required = false) + public boolean useLegacyDownsampler = false; /** * Gets the downsampling method explicitly specified by the user. If the user didn't specify @@ -178,7 +177,7 @@ public class GATKArgumentCollection { if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) return null; - return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling); + return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, useLegacyDownsampler); } /** @@ -192,7 +191,7 @@ public class GATKArgumentCollection { downsamplingType = method.type; downsampleCoverage = method.toCoverage; downsampleFraction = method.toFraction; - enableExperimentalDownsampling = method.useExperimentalDownsampling; + useLegacyDownsampler = method.useLegacyDownsampler; } // -------------------------------------------------------------------------------------------------------------- diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index cd3403f2f..c12bce208 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ 
-136,11 +136,12 @@ public abstract class LocusView extends LocusIterator implements View { // Cache the current and apply filtering. AlignmentContext current = nextLocus; - // The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling: - if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling && - sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) { + // The old ALL_READS downsampling implementation -- use only if legacy downsampling was requested: + if ( sourceInfo.getDownsamplingMethod().useLegacyDownsampler && + sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && + sourceInfo.getDownsamplingMethod().toCoverage != null ) { - current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage ); + current.downsampleToCoverage(sourceInfo.getDownsamplingMethod().toCoverage); } // Indicate that the next operation will need to advance. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 8ee7e0439..cb33c5ab8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -134,12 +134,11 @@ public class BAMScheduler implements Iterator { // Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling // TODO: clean this up once the experimental downsampling engine fork collapses - if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useExperimentalDownsampling ) { - currentPosition = dataSource.getInitialReaderPositions(); + if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useLegacyDownsampler ) { + currentPosition = dataSource.getCurrentPosition(); } else { - currentPosition = dataSource.getCurrentPosition(); - + currentPosition = dataSource.getInitialReaderPositions(); } for(SAMReaderID reader: dataSource.getReaderIDs()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java deleted file mode 100644 index 0440c7eae..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, 
and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/** - * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards. - * - * When processing FilePointers, our strategy is to aggregate all FilePointers for each contig - * together into one monolithic FilePointer, create one persistent set of read iterators over - * that monolithic FilePointer, and repeatedly use that persistent set of read iterators to - * fill read shards with reads. - * - * This strategy has several important advantages: - * - * 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole - * contig will have regions that overlap with other FilePointers on the same contig, due - * to the limited granularity of BAM index data. 
By creating only one FilePointer per contig, - * we avoid having to track how much of each file region we've visited (as we did in the - * former implementation), we avoid expensive non-sequential access patterns in the files, - * and we avoid having to repeatedly re-create our iterator chain for every small region - * of interest. - * - * 2. We avoid boundary issues with the engine-level downsampling. Since we create a single - * persistent set of read iterators (which include the downsampling iterator(s)) per contig, - * the downsampling process is never interrupted by FilePointer or Shard boundaries, and never - * loses crucial state information while downsampling within a contig. - * - * TODO: There is also at least one important disadvantage: - * - * 1. We load more BAM index data into memory at once, and this work is done upfront before processing - * the next contig, creating a delay before traversal of each contig. This delay may be - * compensated for by the gains listed in #1 above, and we may be no worse off overall in - * terms of total runtime, but we need to verify this empirically. - * - * @author David Roazen - */ -public class ExperimentalReadShardBalancer extends ShardBalancer { - - private static Logger logger = Logger.getLogger(ExperimentalReadShardBalancer.class); - - /** - * Convert iterators of file pointers into balanced iterators of shards. - * @return An iterator over balanced shards. - */ - public Iterator iterator() { - return new Iterator() { - /** - * The cached shard to be returned next. Prefetched in the peekable iterator style. - */ - private Shard nextShard = null; - - /** - * The file pointer currently being processed. - */ - private FilePointer currentContigFilePointer = null; - - /** - * Iterator over the reads from the current contig's file pointer. 
The same iterator will be - * used to fill all shards associated with a given file pointer - */ - private PeekableIterator currentContigReadsIterator = null; - - /** - * How many FilePointers have we pulled from the filePointers iterator? - */ - private int totalFilePointersConsumed = 0; - - /** - * Have we encountered a monolithic FilePointer? - */ - private boolean encounteredMonolithicFilePointer = false; - - - { - createNextContigFilePointer(); - advance(); - } - - public boolean hasNext() { - return nextShard != null; - } - - public Shard next() { - if ( ! hasNext() ) - throw new NoSuchElementException("No next read shard available"); - Shard currentShard = nextShard; - advance(); - return currentShard; - } - - private void advance() { - nextShard = null; - - // May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away - while ( nextShard == null && currentContigFilePointer != null ) { - - // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one): - if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) { - - // Close the old, exhausted chain of iterators to release resources - currentContigReadsIterator.close(); - - // Advance to the FilePointer for the next contig - createNextContigFilePointer(); - - // We'll need to create a fresh iterator for this file pointer when we create the first - // shard for it below. - currentContigReadsIterator = null; - } - - // At this point our currentContigReadsIterator may be null or non-null depending on whether or not - // this is our first shard for this file pointer. - if ( currentContigFilePointer != null ) { - Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped); - - // Create a new reads iterator only when we've just advanced to the file pointer for the next - // contig. 
It's essential that the iterators persist across all shards that share the same contig - // to allow the downsampling to work properly. - if ( currentContigReadsIterator == null ) { - currentContigReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard)); - } - - if ( currentContigReadsIterator.hasNext() ) { - shard.fill(currentContigReadsIterator); - nextShard = shard; - } - } - } - } - - /** - * Aggregate all FilePointers for the next contig together into one monolithic FilePointer - * to avoid boundary issues with visiting the same file regions more than once (since more - * granular FilePointers will have regions that overlap with other nearby FilePointers due - * to the nature of BAM indices). - * - * By creating one persistent set of iterators per contig we also avoid boundary artifacts - * in the engine-level downsampling. - * - * TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for - * TODO: read traversals, as there's little point in the BAMSchedule emitting extremely - * TODO: granular FilePointers if we're just going to union them. The BAMSchedule should - * TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for - * TODO: locus traversals). 
- */ - private void createNextContigFilePointer() { - currentContigFilePointer = null; - List nextContigFilePointers = new ArrayList(); - - logger.info("Loading BAM index data for next contig"); - - while ( filePointers.hasNext() ) { - - // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that - // it is the ONLY FilePointer we ever encounter - if ( encounteredMonolithicFilePointer ) { - throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer"); - } - if ( filePointers.peek().isMonolithic() ) { - if ( totalFilePointersConsumed > 0 ) { - throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer"); - } - encounteredMonolithicFilePointer = true; - logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek())); - } - - // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the - // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge - if ( nextContigFilePointers.isEmpty() || - (! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped && - nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) || - (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { - - nextContigFilePointers.add(filePointers.next()); - totalFilePointersConsumed++; - } - else { - break; // next FilePointer is on a different contig or has different mapped/unmapped status, - // save it for next time - } - } - - if ( ! 
nextContigFilePointers.isEmpty() ) { - currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser); - } - - if ( currentContigFilePointer != null ) { - logger.info("Done loading BAM index data for next contig"); - logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer)); - } - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); - } - }; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java new file mode 100644 index 000000000..f5b4fba8e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.SAMFileSpan; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; + +/** + * Divide up large file pointers containing reads into more manageable subcomponents. + * + * TODO: delete this class once the experimental downsampling engine fork collapses + */ +public class LegacyReadShardBalancer extends ShardBalancer { + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. + */ + public Iterator iterator() { + return new Iterator() { + /** + * The cached shard to be returned next. Prefetched in the peekable iterator style. + */ + private Shard nextShard = null; + + /** + * The file pointer currently being processed. + */ + private FilePointer currentFilePointer; + + /** + * Ending position of the last shard in the file. + */ + private Map position = readsDataSource.getCurrentPosition(); + + { + if(filePointers.hasNext()) + currentFilePointer = filePointers.next(); + advance(); + } + + public boolean hasNext() { + return nextShard != null; + } + + public Shard next() { + if(!hasNext()) + throw new NoSuchElementException("No next read shard available"); + Shard currentShard = nextShard; + advance(); + return currentShard; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + + private void advance() { + Map shardPosition; + nextShard = null; + + Map selectedReaders = new HashMap(); + while(selectedReaders.size() == 0 && currentFilePointer != null) { + shardPosition = currentFilePointer.fileSpans; + + for(SAMReaderID id: shardPosition.keySet()) { + SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); + selectedReaders.put(id,fileSpan); + } + + if(!isEmpty(selectedReaders)) { + 
Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); + readsDataSource.fillShard(shard); + + if(!shard.isBufferEmpty()) { + nextShard = shard; + break; + } + } + + selectedReaders.clear(); + currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; + } + + position = readsDataSource.getCurrentPosition(); + } + + /** + * Detects whether the list of file spans contain any read data. + * @param selectedSpans Mapping of readers to file spans. + * @return True if file spans are completely empty; false otherwise. + */ + private boolean isEmpty(Map selectedSpans) { + for(SAMFileSpan fileSpan: selectedSpans.values()) { + if(!fileSpan.isEmpty()) + return false; + } + return true; + } + }; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java index 18fafb95d..8cee535b0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -24,20 +24,49 @@ package org.broadinstitute.sting.gatk.datasources.reads; -import net.sf.samtools.GATKBAMFileSpan; -import net.sf.samtools.SAMFileSpan; +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.NoSuchElementException; +import java.util.*; /** - * Divide up large file pointers containing reads into more manageable 
subcomponents. + * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards. * - * TODO: delete this class once the experimental downsampling engine fork collapses + * When processing FilePointers, our strategy is to aggregate all FilePointers for each contig + * together into one monolithic FilePointer, create one persistent set of read iterators over + * that monolithic FilePointer, and repeatedly use that persistent set of read iterators to + * fill read shards with reads. + * + * This strategy has several important advantages: + * + * 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole + * contig will have regions that overlap with other FilePointers on the same contig, due + * to the limited granularity of BAM index data. By creating only one FilePointer per contig, + * we avoid having to track how much of each file region we've visited (as we did in the + * former implementation), we avoid expensive non-sequential access patterns in the files, + * and we avoid having to repeatedly re-create our iterator chain for every small region + * of interest. + * + * 2. We avoid boundary issues with the engine-level downsampling. Since we create a single + * persistent set of read iterators (which include the downsampling iterator(s)) per contig, + * the downsampling process is never interrupted by FilePointer or Shard boundaries, and never + * loses crucial state information while downsampling within a contig. + * + * TODO: There is also at least one important disadvantage: + * + * 1. We load more BAM index data into memory at once, and this work is done upfront before processing + * the next contig, creating a delay before traversal of each contig. This delay may be + * compensated for by the gains listed in #1 above, and we may be no worse off overall in + * terms of total runtime, but we need to verify this empirically. 
+ * + * @author David Roazen */ public class ReadShardBalancer extends ShardBalancer { + + private static Logger logger = Logger.getLogger(ReadShardBalancer.class); + /** * Convert iterators of file pointers into balanced iterators of shards. * @return An iterator over balanced shards. @@ -52,16 +81,27 @@ public class ReadShardBalancer extends ShardBalancer { /** * The file pointer currently being processed. */ - private FilePointer currentFilePointer; + private FilePointer currentContigFilePointer = null; /** - * Ending position of the last shard in the file. + * Iterator over the reads from the current contig's file pointer. The same iterator will be + * used to fill all shards associated with a given file pointer */ - private Map position = readsDataSource.getCurrentPosition(); + private PeekableIterator currentContigReadsIterator = null; + + /** + * How many FilePointers have we pulled from the filePointers iterator? + */ + private int totalFilePointersConsumed = 0; + + /** + * Have we encountered a monolithic FilePointer? + */ + private boolean encounteredMonolithicFilePointer = false; + { - if(filePointers.hasNext()) - currentFilePointer = filePointers.next(); + createNextContigFilePointer(); advance(); } @@ -70,58 +110,117 @@ public class ReadShardBalancer extends ShardBalancer { } public Shard next() { - if(!hasNext()) + if ( ! 
hasNext() ) throw new NoSuchElementException("No next read shard available"); Shard currentShard = nextShard; advance(); return currentShard; } - public void remove() { - throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); - } - private void advance() { - Map shardPosition; nextShard = null; - Map selectedReaders = new HashMap(); - while(selectedReaders.size() == 0 && currentFilePointer != null) { - shardPosition = currentFilePointer.fileSpans; + // May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away + while ( nextShard == null && currentContigFilePointer != null ) { - for(SAMReaderID id: shardPosition.keySet()) { - SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); - selectedReaders.put(id,fileSpan); + // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one): + if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) { + + // Close the old, exhausted chain of iterators to release resources + currentContigReadsIterator.close(); + + // Advance to the FilePointer for the next contig + createNextContigFilePointer(); + + // We'll need to create a fresh iterator for this file pointer when we create the first + // shard for it below. + currentContigReadsIterator = null; } - if(!isEmpty(selectedReaders)) { - Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); - readsDataSource.fillShard(shard); + // At this point our currentContigReadsIterator may be null or non-null depending on whether or not + // this is our first shard for this file pointer. 
+ if ( currentContigFilePointer != null ) { + Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped); - if(!shard.isBufferEmpty()) { + // Create a new reads iterator only when we've just advanced to the file pointer for the next + // contig. It's essential that the iterators persist across all shards that share the same contig + // to allow the downsampling to work properly. + if ( currentContigReadsIterator == null ) { + currentContigReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard)); + } + + if ( currentContigReadsIterator.hasNext() ) { + shard.fill(currentContigReadsIterator); nextShard = shard; - break; } } - - selectedReaders.clear(); - currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; } - - position = readsDataSource.getCurrentPosition(); } /** - * Detects whether the list of file spans contain any read data. - * @param selectedSpans Mapping of readers to file spans. - * @return True if file spans are completely empty; false otherwise. + * Aggregate all FilePointers for the next contig together into one monolithic FilePointer + * to avoid boundary issues with visiting the same file regions more than once (since more + * granular FilePointers will have regions that overlap with other nearby FilePointers due + * to the nature of BAM indices). + * + * By creating one persistent set of iterators per contig we also avoid boundary artifacts + * in the engine-level downsampling. + * + * TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for + * TODO: read traversals, as there's little point in the BAMSchedule emitting extremely + * TODO: granular FilePointers if we're just going to union them. The BAMSchedule should + * TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for + * TODO: locus traversals). 
*/ - private boolean isEmpty(Map selectedSpans) { - for(SAMFileSpan fileSpan: selectedSpans.values()) { - if(!fileSpan.isEmpty()) - return false; + private void createNextContigFilePointer() { + currentContigFilePointer = null; + List nextContigFilePointers = new ArrayList(); + + logger.info("Loading BAM index data for next contig"); + + while ( filePointers.hasNext() ) { + + // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that + // it is the ONLY FilePointer we ever encounter + if ( encounteredMonolithicFilePointer ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer"); + } + if ( filePointers.peek().isMonolithic() ) { + if ( totalFilePointersConsumed > 0 ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer"); + } + encounteredMonolithicFilePointer = true; + logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek())); + } + + // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the + // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge + if ( nextContigFilePointers.isEmpty() || + (! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped && + nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) || + (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { + + nextContigFilePointers.add(filePointers.next()); + totalFilePointersConsumed++; + } + else { + break; // next FilePointer is on a different contig or has different mapped/unmapped status, + // save it for next time + } } - return true; + + if ( ! 
nextContigFilePointers.isEmpty() ) { + currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser); + } + + if ( currentContigFilePointer != null ) { + logger.info("Done loading BAM index data for next contig"); + logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer)); + } + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); } }; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 88de3ac9b..e99814278 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -466,7 +466,7 @@ public class SAMDataSource { /** * Legacy method to fill the given buffering shard with reads. * - * Shard.fill() is used instead of this method when experimental downsampling is enabled + * Shard.fill() is used instead of this method unless legacy downsampling is enabled * * TODO: delete this method once the experimental downsampling engine fork collapses * @@ -638,7 +638,8 @@ public class SAMDataSource { readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), readProperties.getSupplementalFilters(), readProperties.getReadTransformers(), - readProperties.defaultBaseQualities()); + readProperties.defaultBaseQualities(), + shard instanceof LocusShard); } private class BAMCodecIterator implements CloseableIterator { @@ -695,6 +696,7 @@ public class SAMDataSource { * @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this. * @param supplementalFilters additional filters to apply to the reads. * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. 
+ * @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null. */ protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics, @@ -705,7 +707,8 @@ public class SAMDataSource { Boolean noValidationOfReadOrder, Collection supplementalFilters, List readTransformers, - byte defaultBaseQualities) { + byte defaultBaseQualities, + boolean isLocusBasedTraversal ) { // ************************************************************************************************ // // * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * // @@ -714,12 +717,26 @@ public class SAMDataSource { wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); - if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) { - wrappedIterator = applyDownsamplingIterator(wrappedIterator); + // If we're using the new downsampling implementation, apply downsampling iterators at this + // point in the read stream for most (but not all) cases + if ( ! readProperties.getDownsamplingMethod().useLegacyDownsampler ) { + + // For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers + // will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding + // splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling + // of individual reads. + boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal && + readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readProperties.getDownsamplingMethod().toCoverage != null; + + if ( ! 
assumeDownstreamLIBSDownsampling ) { + wrappedIterator = applyDownsamplingIterator(wrappedIterator); + } } - // Use the old fractional downsampler only if we're not using experimental downsampling: - if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null ) + // Use the old fractional downsampler only if we're using legacy downsampling: + // TODO: remove this statement (and associated classes) once the downsampling engine fork collapses + if ( readProperties.getDownsamplingMethod().useLegacyDownsampler && downsamplingFraction != null ) wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction); // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, @@ -741,19 +758,37 @@ public class SAMDataSource { } protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) { - if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { - ReadsDownsamplerFactory downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ? - new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage) : - new FractionalDownsamplerFactory(readProperties.getDownsamplingMethod().toFraction); - - return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory); + if ( readProperties.getDownsamplingMethod() == null || + readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) { + return wrappedIterator; } - else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { - ReadsDownsampler downsampler = readProperties.getDownsamplingMethod().toCoverage != null ? 
- new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage) : - new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction); - return new DownsamplingReadsIterator(wrappedIterator, downsampler); + if ( readProperties.getDownsamplingMethod().toFraction != null ) { + + // If we're downsampling to a fraction of reads, there's no point in paying the cost of + // splitting/re-assembling the read stream by sample to run the FractionalDownsampler on + // reads from each sample separately, since the result would be the same as running the + // FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator + // rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling + // was requested. + + return new DownsamplingReadsIterator(wrappedIterator, + new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction)); + } + else if ( readProperties.getDownsamplingMethod().toCoverage != null ) { + + // If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling + // the read stream to run the downsampler on the reads for each individual sample separately if + // BY_SAMPLE downsampling was requested. 
+ + if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { + return new PerSampleDownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage)); + } + else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { + return new DownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage)); + } } return wrappedIterator; diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java index ae1d98ce0..b3f636fd6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java @@ -50,9 +50,9 @@ public class DownsamplingMethod { public final Double toFraction; /** - * Use the new experimental downsampling? + * Use the legacy downsampling implementation instead of the newer implementation? */ - public final boolean useExperimentalDownsampling; + public final boolean useLegacyDownsampler; /** * Expresses no downsampling applied at all. @@ -69,11 +69,11 @@ public class DownsamplingMethod { */ public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000; - public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) { + public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useLegacyDownsampler ) { this.type = type != null ? 
type : DEFAULT_DOWNSAMPLING_TYPE; this.toCoverage = toCoverage; this.toFraction = toFraction; - this.useExperimentalDownsampling = useExperimentalDownsampling; + this.useLegacyDownsampler = useLegacyDownsampler; if ( type == DownsampleType.NONE ) { toCoverage = null; @@ -101,19 +101,19 @@ public class DownsamplingMethod { if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); } + } - // Some restrictions only exist for the old downsampling implementation: - if ( ! useExperimentalDownsampling ) { - // By sample downsampling does not work with a fraction of reads in the old downsampling implementation - if( type == DownsampleType.BY_SAMPLE && toFraction != null ) - throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method"); + public void checkCompatibilityWithWalker( Walker walker ) { + boolean isLocusTraversal = walker instanceof LocusWalker || walker instanceof ActiveRegionWalker; + + if ( ! isLocusTraversal && useLegacyDownsampler && toCoverage != null ) { + throw new UserException.CommandLineException("Downsampling to coverage for read-based traversals (eg., ReadWalkers) is not supported in the legacy downsampling implementation. " + + "The newer downsampling implementation does not have this limitation."); } - // Some restrictions only exist for the new downsampling implementation: - if ( useExperimentalDownsampling ) { - if ( type == DownsampleType.ALL_READS && toCoverage != null ) { - throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation"); - } + if ( isLocusTraversal && ! 
useLegacyDownsampler && type == DownsampleType.ALL_READS && toCoverage != null ) { + throw new UserException.CommandLineException("Downsampling to coverage with the ALL_READS method for locus-based traversals (eg., LocusWalkers) is not yet supported in the new downsampling implementation (though it is supported for ReadWalkers). " + + "You can run with --use_legacy_downsampler for a broken and poorly-maintained implementation of ALL_READS to-coverage downsampling, but this is not recommended."); } } @@ -124,30 +124,34 @@ public class DownsamplingMethod { builder.append("No downsampling"); } else { - builder.append(String.format("Method: %s ", type)); + builder.append(String.format("Method: %s, ", type)); if ( toCoverage != null ) { - builder.append(String.format("Target Coverage: %d ", toCoverage)); + builder.append(String.format("Target Coverage: %d, ", toCoverage)); } else { - builder.append(String.format("Target Fraction: %.2f ", toFraction)); + builder.append(String.format("Target Fraction: %.2f, ", toFraction)); } - if ( useExperimentalDownsampling ) { - builder.append("Using Experimental Downsampling"); + if ( useLegacyDownsampler ) { + builder.append("Using the legacy downsampling implementation"); + } + else { + builder.append("Using the new downsampling implementation"); } } return builder.toString(); } - public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) { + public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useLegacyDownsampler ) { if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) { return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE, - null, useExperimentalDownsampling); + null, useLegacyDownsampler); } else { - return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling); + // Downsampling is off by default for non-locus-based traversals + 
return new DownsamplingMethod(DownsampleType.NONE, null, null, useLegacyDownsampler); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java new file mode 100644 index 000000000..008ffde3b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Pass-Through Downsampler: Implementation of the ReadsDownsampler interface that does no + * downsampling whatsoever, and instead simply "passes-through" all the reads it's given. 
+ * Useful for situations where you want to disable downsampling, but still need to use + * the downsampler interface. + * + * @author David Roazen + */ +public class PassThroughDownsampler implements ReadsDownsampler { + + private ArrayList selectedReads; + + public PassThroughDownsampler() { + clear(); + } + + public void submit( T newRead ) { + // All reads pass-through, no reads get downsampled + selectedReads.add(newRead); + } + + public void submit( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasFinalizedItems() { + return selectedReads.size() > 0; + } + + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed + List downsampledItems = selectedReads; + clear(); + return downsampledItems; + } + + public boolean hasPendingItems() { + return false; + } + + public T peekFinalized() { + return selectedReads.isEmpty() ? null : selectedReads.get(0); + } + + public T peekPending() { + return null; + } + + public int getNumberOfDiscardedItems() { + return 0; + } + + public void signalEndOfInput() { + // NO-OP + } + + public void clear() { + selectedReads = new ArrayList(); + } + + public void reset() { + // NO-OP + } + + public boolean requiresCoordinateSortOrder() { + return false; + } + + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 38170040a..8d0cefaa4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -43,7 +43,6 @@ import org.broadinstitute.sting.utils.AutoFormattingTime; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; 
-import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; @@ -346,9 +345,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { for ( final TraversalEngine te : allCreatedTraversalEngines) te.shutdown(); - // horrible hack to print nano scheduling information across all nano schedulers, if any were used - NanoScheduler.printCombinedRuntimeProfile(); - allCreatedTraversalEngines.clear(); availableTraversalEngines.clear(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index 6c0dc9769..735a4dce5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -4,9 +4,9 @@ import net.sf.picard.util.PeekableIterator; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -83,17 +83,18 @@ public class WindowMaker implements Iterable, I this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - // Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested: - this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ? 
- new PeekableIterator(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames)) + // Use the legacy version of LocusIteratorByState if legacy downsampling was requested: + this.sourceIterator = sourceInfo.getDownsamplingMethod().useLegacyDownsampler ? + new PeekableIterator(new LegacyLocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames)) : - new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); + new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames)); + this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; } public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals ) { - this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + this(shard, genomeLocParser, iterator, intervals, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); } public Iterator iterator() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java similarity index 61% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java index 557cbd009..5c833de4c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java @@ -31,14 +31,14 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import 
org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.Downsampler; -import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.LegacyReservoirDownsampler; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -50,11 +50,11 @@ import java.util.*; /** * Iterator that traverses a SAM File, accumulating information on a per-locus basis */ -public class LocusIteratorByStateExperimental extends LocusIterator { +public class LegacyLocusIteratorByState extends LocusIterator { /** * our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class); // ----------------------------------------------------------------------------------------------------------------- // @@ -69,7 +69,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator { private final ArrayList samples; private final ReadStateManager readStates; - protected static class SAMRecordState { + static private class SAMRecordState { SAMRecord read; int readOffset = -1; // how far are we offset from the start of the read bases? int genomeOffset = -1; // how far are we offset from the alignment start on the genome? 
@@ -213,7 +213,6 @@ public class LocusIteratorByStateExperimental extends LocusIterator { //final boolean DEBUG2 = false && DEBUG; private ReadProperties readInfo; private AlignmentContext nextAlignmentContext; - private boolean performLevelingDownsampling; // ----------------------------------------------------------------------------------------------------------------- // @@ -221,15 +220,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator { // // ----------------------------------------------------------------------------------------------------------------- - public LocusIteratorByStateExperimental(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { + public LegacyLocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; this.samples = new ArrayList(samples); - this.readStates = new ReadStateManager(samIterator); - - this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null && - readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && - readInfo.getDownsamplingMethod().toCoverage != null; + this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod()); // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when // there's no read data. So we need to throw this error only when samIterator.hasNext() is true @@ -290,13 +285,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); - - // TODO: How can you determine here whether the current pileup has been downsampled? 
boolean hasBeenSampled = false; - for (final String sample : samples) { final Iterator iterator = readStates.iterator(sample); final List pile = new ArrayList(readStates.size(sample)); + hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); int size = 0; // number of elements in this sample's pileup int nDeletions = 0; // number of deletions in this sample's pileup @@ -405,20 +398,34 @@ public class LocusIteratorByStateExperimental extends LocusIterator { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - protected class ReadStateManager { + private class ReadStateManager { private final PeekableIterator iterator; + private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; private final Map readStatesBySample = new HashMap(); + private final int targetCoverage; private int totalReadStates = 0; - public ReadStateManager(Iterator source) { + public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) { this.iterator = new PeekableIterator(source); - - for (final String sample : samples) { - readStatesBySample.put(sample, new PerSampleReadStateManager()); + this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE; + switch (this.downsamplingMethod.type) { + case BY_SAMPLE: + if (downsamplingMethod.toCoverage == null) + throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample"); + this.targetCoverage = downsamplingMethod.toCoverage; + break; + default: + this.targetCoverage = Integer.MAX_VALUE; } - samplePartitioner = new SamplePartitioner(); + Map readSelectors = new HashMap(); + for (final String sample : samples) { + readStatesBySample.put(sample, new PerSampleReadStateManager()); + readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? 
new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector()); + } + + samplePartitioner = new SamplePartitioner(readSelectors); } /** @@ -442,6 +449,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator { public void remove() { wrappedIterator.remove(); + totalReadStates--; } }; } @@ -469,6 +477,17 @@ public class LocusIteratorByStateExperimental extends LocusIterator { return readStatesBySample.get(sample).size(); } + /** + * The extent of downsampling; basically, the furthest base out which has 'fallen + * victim' to the downsampler. + * + * @param sample Sample, downsampled independently. + * @return Integer stop of the furthest undownsampled region. + */ + public int getDownsamplingExtent(final String sample) { + return readStatesBySample.get(sample).getDownsamplingExtent(); + } + public SAMRecordState getFirst() { for (final String sample : samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); @@ -501,13 +520,61 @@ public class LocusIteratorByStateExperimental extends LocusIterator { samplePartitioner.submitRead(iterator.next()); } } + samplePartitioner.complete(); for (final String sample : samples) { - Collection newReads = samplePartitioner.getReadsForSample(sample); - PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); - addReadsToSample(statesBySample, newReads); - } + ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); + Collection newReads = new ArrayList(aggregator.getSelectedReads()); + + PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); + int numReads = statesBySample.size(); + int downsamplingExtent = aggregator.getDownsamplingExtent(); + + if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) { + long readLimit = aggregator.getNumReadsSeen(); + addReadsToSample(statesBySample, newReads, readLimit); + statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); + } else { + int[] 
counts = statesBySample.getCountsPerAlignmentStart(); + int[] updatedCounts = new int[counts.length]; + System.arraycopy(counts, 0, updatedCounts, 0, counts.length); + + boolean readPruned = true; + while (numReads + newReads.size() > targetCoverage && readPruned) { + readPruned = false; + for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) { + if (updatedCounts[alignmentStart] > 1) { + updatedCounts[alignmentStart]--; + numReads--; + readPruned = true; + } + } + } + + if (numReads == targetCoverage) { + updatedCounts[0]--; + numReads--; + } + + BitSet toPurge = new BitSet(readStates.size()); + int readOffset = 0; + + for (int i = 0; i < updatedCounts.length; i++) { + int n = counts[i]; + int k = updatedCounts[i]; + + for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k)) + toPurge.set(readOffset + purgedElement); + + readOffset += counts[i]; + } + downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge)); + + addReadsToSample(statesBySample, newReads, targetCoverage - numReads); + statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); + } + } samplePartitioner.reset(); } @@ -516,134 +583,380 @@ public class LocusIteratorByStateExperimental extends LocusIterator { * * @param readStates The list of read states to add this collection of reads. * @param reads Reads to add. Selected reads will be pulled from this source. + * @param maxReads Maximum number of reads to add. 
*/ - private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) { if (reads.isEmpty()) return; Collection newReadStates = new LinkedList(); - + int readCount = 0; for (SAMRecord read : reads) { - SAMRecordState state = new SAMRecordState(read); - state.stepForwardOnGenome(); - newReadStates.add(state); + if (readCount < maxReads) { + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); + readCount++; + } } - readStates.addStatesAtNextAlignmentStart(newReadStates); } - protected class PerSampleReadStateManager implements Iterable { - private List> readStatesByAlignmentStart = new LinkedList>(); - private int thisSampleReadStates = 0; - private Downsampler> levelingDownsampler = - performLevelingDownsampling ? - new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : - null; + private class PerSampleReadStateManager implements Iterable { + private final Queue readStates = new LinkedList(); + private final Deque readStateCounter = new LinkedList(); + private int downsamplingExtent = 0; public void addStatesAtNextAlignmentStart(Collection states) { - if ( states.isEmpty() ) { - return; - } - - readStatesByAlignmentStart.add(new LinkedList(states)); - thisSampleReadStates += states.size(); + readStates.addAll(states); + readStateCounter.add(new Counter(states.size())); totalReadStates += states.size(); - - if ( levelingDownsampler != null ) { - levelingDownsampler.submit(readStatesByAlignmentStart); - levelingDownsampler.signalEndOfInput(); - - thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - - // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = 
levelingDownsampler.consumeFinalizedItems(); - levelingDownsampler.reset(); - } } public boolean isEmpty() { - return readStatesByAlignmentStart.isEmpty(); + return readStates.isEmpty(); } public SAMRecordState peek() { - return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); + return readStates.peek(); } public int size() { - return thisSampleReadStates; + return readStates.size(); + } + + public void specifyNewDownsamplingExtent(int downsamplingExtent) { + this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent); + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public int[] getCountsPerAlignmentStart() { + int[] counts = new int[readStateCounter.size()]; + int index = 0; + for (Counter counter : readStateCounter) + counts[index++] = counter.getCount(); + return counts; } public Iterator iterator() { return new Iterator() { - private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates = null; - private Iterator currentPositionReadStatesIterator = null; + private Iterator wrappedIterator = readStates.iterator(); public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); + return wrappedIterator.hasNext(); } public SAMRecordState next() { - if ( currentPositionReadStatesIterator == null || ! 
currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); + return wrappedIterator.next(); } public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - totalReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } + wrappedIterator.remove(); + Counter counter = readStateCounter.peek(); + counter.decrement(); + if (counter.getCount() == 0) + readStateCounter.remove(); } }; } + + /** + * Purge the given elements from the bitset. If an element in the bitset is true, purge + * the corresponding read state. + * + * @param elements bits from the set to purge. + * @return the extent of the final downsampled read. + */ + public int purge(final BitSet elements) { + int downsamplingExtent = 0; + + if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; + + Iterator readStateIterator = readStates.iterator(); + + Iterator counterIterator = readStateCounter.iterator(); + Counter currentCounter = counterIterator.next(); + + int readIndex = 0; + long alignmentStartCounter = currentCounter.getCount(); + + int toPurge = elements.nextSetBit(0); + int removedCount = 0; + + while (readStateIterator.hasNext() && toPurge >= 0) { + SAMRecordState state = readStateIterator.next(); + downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd()); + + if (readIndex == toPurge) { + readStateIterator.remove(); + currentCounter.decrement(); + if (currentCounter.getCount() == 0) + counterIterator.remove(); + removedCount++; + toPurge = elements.nextSetBit(toPurge + 1); + } + + readIndex++; + alignmentStartCounter--; + if (alignmentStartCounter == 0 && counterIterator.hasNext()) { + currentCounter = counterIterator.next(); + alignmentStartCounter = currentCounter.getCount(); + } + } + + 
totalReadStates -= removedCount; + + return downsamplingExtent; + } } } /** - * Note: stores reads by sample ID string, not by sample object + * Note: assuming that, whenever we downsample, we downsample to an integer capacity. */ - private class SamplePartitioner { - private Map> readsBySample; - private long readsSeen = 0; + static private class Counter { + private int count; - public SamplePartitioner() { - readsBySample = new HashMap>(); - - for ( String sample : samples ) { - readsBySample.put(sample, new ArrayList()); - } + public Counter(int count) { + this.count = count; } - public void submitRead(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).add(read); - readsSeen++; + public int getCount() { + return count; } - public long getNumReadsSeen() { - return readsSeen; - } - - public Collection getReadsForSample(String sampleName) { - if ( ! readsBySample.containsKey(sampleName) ) - throw new NoSuchElementException("Sample name not found"); - return readsBySample.get(sampleName); - } - - public void reset() { - for ( Collection perSampleReads : readsBySample.values() ) - perSampleReads.clear(); - readsSeen = 0; + public void decrement() { + count--; } } -} \ No newline at end of file +} + +/** + * Selects reads passed to it based on a criteria decided through inheritance. + * TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this. + */ +interface ReadSelector { + /** + * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration. + * + * @param read the read to evaluate. + */ + public void submitRead(SAMRecord read); + + /** + * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid. + * + * @param read the read previously rejected. 
+ */ + public void notifyReadRejected(SAMRecord read); + + /** + * Signal the selector that read additions are complete. + */ + public void complete(); + + /** + * Retrieve the number of reads seen by this selector so far. + * + * @return number of reads seen. + */ + public long getNumReadsSeen(); + + /** + * Return the number of reads accepted by this selector so far. + * + * @return number of reads selected. + */ + public long getNumReadsSelected(); + + /** + * Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the + * last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at + * position 3 whose cigar string is 76M, the value of this parameter will be 78. + * + * @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0. + */ + public int getDownsamplingExtent(); + + /** + * Get the reads selected by this selector. + * + * @return collection of reads selected by this selector. + */ + public Collection getSelectedReads(); + + /** + * Reset this collection to its pre-gathered state. + */ + public void reset(); +} + +/** + * Select every read passed in. + */ +class AllReadsSelector implements ReadSelector { + private Collection reads = new LinkedList(); + private long readsSeen = 0; + private int downsamplingExtent = 0; + + public void submitRead(SAMRecord read) { + reads.add(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + readsSeen++; + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); + } + + public void complete() { + // NO-OP. 
+ } + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return readsSeen; + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public Collection getSelectedReads() { + return reads; + } + + public void reset() { + reads.clear(); + readsSeen = 0; + downsamplingExtent = 0; + } +} + + +/** + * Select N reads randomly from the input stream. + */ +class NRandomReadSelector implements ReadSelector { + private final LegacyReservoirDownsampler reservoir; + private final ReadSelector chainedSelector; + private long readsSeen = 0; + private int downsamplingExtent = 0; + + public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) { + this.reservoir = new LegacyReservoirDownsampler((int) readLimit); + this.chainedSelector = chainedSelector; + } + + public void submitRead(SAMRecord read) { + SAMRecord displaced = reservoir.add(read); + if (displaced != null && chainedSelector != null) { + chainedSelector.notifyReadRejected(read); + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); + } + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + readsSeen++; + } + + public void complete() { + for (SAMRecord read : reservoir.getDownsampledContents()) + chainedSelector.submitRead(read); + if (chainedSelector != null) + chainedSelector.complete(); + } + + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return reservoir.size(); + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public Collection getSelectedReads() { + return reservoir.getDownsampledContents(); + } + + public void reset() { + reservoir.clear(); + downsamplingExtent = 0; + if (chainedSelector != null) + chainedSelector.reset(); + } +} + +/** + * Note: stores reads by sample ID string, not by sample object + */ +class SamplePartitioner implements ReadSelector { + private final Map readsBySample; + private 
long readsSeen = 0; + + public SamplePartitioner(Map readSelectors) { + readsBySample = readSelectors; + } + + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).submitRead(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).notifyReadRejected(read); + readsSeen++; + } + + public void complete() { + // NO-OP. + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return readsSeen; + } + + public int getDownsamplingExtent() { + int downsamplingExtent = 0; + for (ReadSelector storage : readsBySample.values()) + downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent()); + return downsamplingExtent; + } + + public Collection getSelectedReads() { + throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner."); + } + + public ReadSelector getSelectedReads(String sampleName) { + if (!readsBySample.containsKey(sampleName)) + throw new NoSuchElementException("Sample name not found"); + return readsBySample.get(sampleName); + } + + public void reset() { + for (ReadSelector storage : readsBySample.values()) + storage.reset(); + readsSeen = 0; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 46e84798a..4f8d7d3f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -31,14 +31,11 @@ import net.sf.samtools.CigarElement; import 
net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.downsampling.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.ReservoirDownsampler; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -54,7 +51,7 @@ public class LocusIteratorByState extends LocusIterator { /** * our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class); // ----------------------------------------------------------------------------------------------------------------- // @@ -69,7 +66,7 @@ public class LocusIteratorByState extends LocusIterator { private final ArrayList samples; private final ReadStateManager readStates; - static private class SAMRecordState { + protected static class SAMRecordState { SAMRecord read; int readOffset = -1; // how far are we offset from the start of the read bases? int genomeOffset = -1; // how far are we offset from the alignment start on the genome? 
@@ -213,6 +210,7 @@ public class LocusIteratorByState extends LocusIterator { //final boolean DEBUG2 = false && DEBUG; private ReadProperties readInfo; private AlignmentContext nextAlignmentContext; + private boolean performDownsampling; // ----------------------------------------------------------------------------------------------------------------- // @@ -224,7 +222,18 @@ public class LocusIteratorByState extends LocusIterator { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; this.samples = new ArrayList(samples); - this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod()); + + // LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're + // downsampling to coverage by sample. SAMDataSource will have refrained from applying + // any downsamplers to the read stream in this case, in the expectation that LIBS will + // manage the downsampling. The reason for this is twofold: performance (don't have to + // split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling + // of reads (e.g., using half of a read, and throwing the rest away). + this.performDownsampling = readInfo.getDownsamplingMethod() != null && + readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readInfo.getDownsamplingMethod().toCoverage != null; + + this.readStates = new ReadStateManager(samIterator); // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when // there's no read data. So we need to throw this error only when samIterator.hasNext() is true @@ -285,11 +294,13 @@ final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); + + // TODO: How can you determine here whether the current pileup has been downsampled?
boolean hasBeenSampled = false; + for (final String sample : samples) { final Iterator iterator = readStates.iterator(sample); final List pile = new ArrayList(readStates.size(sample)); - hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); int size = 0; // number of elements in this sample's pileup int nDeletions = 0; // number of deletions in this sample's pileup @@ -398,34 +409,20 @@ public class LocusIteratorByState extends LocusIterator { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - private class ReadStateManager { + protected class ReadStateManager { private final PeekableIterator iterator; - private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; private final Map readStatesBySample = new HashMap(); - private final int targetCoverage; private int totalReadStates = 0; - public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) { + public ReadStateManager(Iterator source) { this.iterator = new PeekableIterator(source); - this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE; - switch (this.downsamplingMethod.type) { - case BY_SAMPLE: - if (downsamplingMethod.toCoverage == null) - throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample"); - this.targetCoverage = downsamplingMethod.toCoverage; - break; - default: - this.targetCoverage = Integer.MAX_VALUE; - } - Map readSelectors = new HashMap(); for (final String sample : samples) { readStatesBySample.put(sample, new PerSampleReadStateManager()); - readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? 
new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector()); } - samplePartitioner = new SamplePartitioner(readSelectors); + samplePartitioner = new SamplePartitioner(performDownsampling); } /** @@ -449,7 +446,6 @@ public class LocusIteratorByState extends LocusIterator { public void remove() { wrappedIterator.remove(); - totalReadStates--; } }; } @@ -477,17 +473,6 @@ public class LocusIteratorByState extends LocusIterator { return readStatesBySample.get(sample).size(); } - /** - * The extent of downsampling; basically, the furthest base out which has 'fallen - * victim' to the downsampler. - * - * @param sample Sample, downsampled independently. - * @return Integer stop of the furthest undownsampled region. - */ - public int getDownsamplingExtent(final String sample) { - return readStatesBySample.get(sample).getDownsamplingExtent(); - } - public SAMRecordState getFirst() { for (final String sample : samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); @@ -520,61 +505,15 @@ public class LocusIteratorByState extends LocusIterator { samplePartitioner.submitRead(iterator.next()); } } - samplePartitioner.complete(); + + samplePartitioner.doneSubmittingReads(); for (final String sample : samples) { - ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); - - Collection newReads = new ArrayList(aggregator.getSelectedReads()); - + Collection newReads = samplePartitioner.getReadsForSample(sample); PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); - int numReads = statesBySample.size(); - int downsamplingExtent = aggregator.getDownsamplingExtent(); - - if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) { - long readLimit = aggregator.getNumReadsSeen(); - addReadsToSample(statesBySample, newReads, readLimit); - statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); - } else { - int[] counts = statesBySample.getCountsPerAlignmentStart(); - 
int[] updatedCounts = new int[counts.length]; - System.arraycopy(counts, 0, updatedCounts, 0, counts.length); - - boolean readPruned = true; - while (numReads + newReads.size() > targetCoverage && readPruned) { - readPruned = false; - for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) { - if (updatedCounts[alignmentStart] > 1) { - updatedCounts[alignmentStart]--; - numReads--; - readPruned = true; - } - } - } - - if (numReads == targetCoverage) { - updatedCounts[0]--; - numReads--; - } - - BitSet toPurge = new BitSet(readStates.size()); - int readOffset = 0; - - for (int i = 0; i < updatedCounts.length; i++) { - int n = counts[i]; - int k = updatedCounts[i]; - - for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k)) - toPurge.set(readOffset + purgedElement); - - readOffset += counts[i]; - } - downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge)); - - addReadsToSample(statesBySample, newReads, targetCoverage - numReads); - statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); - } + addReadsToSample(statesBySample, newReads); } + samplePartitioner.reset(); } @@ -583,380 +522,140 @@ public class LocusIteratorByState extends LocusIterator { * * @param readStates The list of read states to add this collection of reads. * @param reads Reads to add. Selected reads will be pulled from this source. - * @param maxReads Maximum number of reads to add. 
*/ - private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) { + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { if (reads.isEmpty()) return; Collection newReadStates = new LinkedList(); - int readCount = 0; + for (SAMRecord read : reads) { - if (readCount < maxReads) { - SAMRecordState state = new SAMRecordState(read); - state.stepForwardOnGenome(); - newReadStates.add(state); - readCount++; - } + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); } + readStates.addStatesAtNextAlignmentStart(newReadStates); } - private class PerSampleReadStateManager implements Iterable { - private final Queue readStates = new LinkedList(); - private final Deque readStateCounter = new LinkedList(); - private int downsamplingExtent = 0; + protected class PerSampleReadStateManager implements Iterable { + private List> readStatesByAlignmentStart = new LinkedList>(); + private int thisSampleReadStates = 0; + private Downsampler> levelingDownsampler = + performDownsampling ? 
+ new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : + null; public void addStatesAtNextAlignmentStart(Collection states) { - readStates.addAll(states); - readStateCounter.add(new Counter(states.size())); + if ( states.isEmpty() ) { + return; + } + + readStatesByAlignmentStart.add(new LinkedList(states)); + thisSampleReadStates += states.size(); totalReadStates += states.size(); + + if ( levelingDownsampler != null ) { + levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.signalEndOfInput(); + + thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + levelingDownsampler.reset(); + } } public boolean isEmpty() { - return readStates.isEmpty(); + return readStatesByAlignmentStart.isEmpty(); } public SAMRecordState peek() { - return readStates.peek(); + return isEmpty() ? 
null : readStatesByAlignmentStart.get(0).peek(); } public int size() { - return readStates.size(); - } - - public void specifyNewDownsamplingExtent(int downsamplingExtent) { - this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent); - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public int[] getCountsPerAlignmentStart() { - int[] counts = new int[readStateCounter.size()]; - int index = 0; - for (Counter counter : readStateCounter) - counts[index++] = counter.getCount(); - return counts; + return thisSampleReadStates; } public Iterator iterator() { return new Iterator() { - private Iterator wrappedIterator = readStates.iterator(); + private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates = null; + private Iterator currentPositionReadStatesIterator = null; public boolean hasNext() { - return wrappedIterator.hasNext(); + return alignmentStartIterator.hasNext() || + (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); } public SAMRecordState next() { - return wrappedIterator.next(); + if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { + currentPositionReadStates = alignmentStartIterator.next(); + currentPositionReadStatesIterator = currentPositionReadStates.iterator(); + } + + return currentPositionReadStatesIterator.next(); } public void remove() { - wrappedIterator.remove(); - Counter counter = readStateCounter.peek(); - counter.decrement(); - if (counter.getCount() == 0) - readStateCounter.remove(); + currentPositionReadStatesIterator.remove(); + thisSampleReadStates--; + totalReadStates--; + + if ( currentPositionReadStates.isEmpty() ) { + alignmentStartIterator.remove(); + } } }; } + } + } - /** - * Purge the given elements from the bitset. If an element in the bitset is true, purge - * the corresponding read state. 
- * - * @param elements bits from the set to purge. - * @return the extent of the final downsampled read. - */ - public int purge(final BitSet elements) { - int downsamplingExtent = 0; + /** + * Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler. + * + * Note: stores reads by sample ID string, not by sample object + */ + private class SamplePartitioner { + private Map> readsBySample; - if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; + public SamplePartitioner( boolean downsampleReads ) { + readsBySample = new HashMap>(); - Iterator readStateIterator = readStates.iterator(); + for ( String sample : samples ) { + readsBySample.put(sample, + downsampleReads ? new ReservoirDownsampler(readInfo.getDownsamplingMethod().toCoverage) : + new PassThroughDownsampler()); + } + } - Iterator counterIterator = readStateCounter.iterator(); - Counter currentCounter = counterIterator.next(); + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).submit(read); + } - int readIndex = 0; - long alignmentStartCounter = currentCounter.getCount(); + public void doneSubmittingReads() { + for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { + perSampleReads.getValue().signalEndOfInput(); + } + } - int toPurge = elements.nextSetBit(0); - int removedCount = 0; + public Collection getReadsForSample(String sampleName) { + if ( ! 
readsBySample.containsKey(sampleName) ) + throw new NoSuchElementException("Sample name not found"); - while (readStateIterator.hasNext() && toPurge >= 0) { - SAMRecordState state = readStateIterator.next(); - downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd()); + return readsBySample.get(sampleName).consumeFinalizedItems(); + } - if (readIndex == toPurge) { - readStateIterator.remove(); - currentCounter.decrement(); - if (currentCounter.getCount() == 0) - counterIterator.remove(); - removedCount++; - toPurge = elements.nextSetBit(toPurge + 1); - } - - readIndex++; - alignmentStartCounter--; - if (alignmentStartCounter == 0 && counterIterator.hasNext()) { - currentCounter = counterIterator.next(); - alignmentStartCounter = currentCounter.getCount(); - } - } - - totalReadStates -= removedCount; - - return downsamplingExtent; + public void reset() { + for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { + perSampleReads.getValue().clear(); + perSampleReads.getValue().reset(); } } } - - /** - * Note: assuming that, whenever we downsample, we downsample to an integer capacity. - */ - static private class Counter { - private int count; - - public Counter(int count) { - this.count = count; - } - - public int getCount() { - return count; - } - - public void decrement() { - count--; - } - } -} - -/** - * Selects reads passed to it based on a criteria decided through inheritance. - * TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this. - */ -interface ReadSelector { - /** - * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration. - * - * @param read the read to evaluate. - */ - public void submitRead(SAMRecord read); - - /** - * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid. - * - * @param read the read previously rejected. 
- */ - public void notifyReadRejected(SAMRecord read); - - /** - * Signal the selector that read additions are complete. - */ - public void complete(); - - /** - * Retrieve the number of reads seen by this selector so far. - * - * @return number of reads seen. - */ - public long getNumReadsSeen(); - - /** - * Return the number of reads accepted by this selector so far. - * - * @return number of reads selected. - */ - public long getNumReadsSelected(); - - /** - * Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the - * last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at - * position 3 whose cigar string is 76M, the value of this parameter will be 78. - * - * @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0. - */ - public int getDownsamplingExtent(); - - /** - * Get the reads selected by this selector. - * - * @return collection of reads selected by this selector. - */ - public Collection getSelectedReads(); - - /** - * Reset this collection to its pre-gathered state. - */ - public void reset(); -} - -/** - * Select every read passed in. - */ -class AllReadsSelector implements ReadSelector { - private Collection reads = new LinkedList(); - private long readsSeen = 0; - private int downsamplingExtent = 0; - - public void submitRead(SAMRecord read) { - reads.add(read); - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - readsSeen++; - downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); - } - - public void complete() { - // NO-OP. 
- } - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return readsSeen; - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public Collection getSelectedReads() { - return reads; - } - - public void reset() { - reads.clear(); - readsSeen = 0; - downsamplingExtent = 0; - } -} - - -/** - * Select N reads randomly from the input stream. - */ -class NRandomReadSelector implements ReadSelector { - private final ReservoirDownsampler reservoir; - private final ReadSelector chainedSelector; - private long readsSeen = 0; - private int downsamplingExtent = 0; - - public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) { - this.reservoir = new ReservoirDownsampler((int) readLimit); - this.chainedSelector = chainedSelector; - } - - public void submitRead(SAMRecord read) { - SAMRecord displaced = reservoir.add(read); - if (displaced != null && chainedSelector != null) { - chainedSelector.notifyReadRejected(read); - downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); - } - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - readsSeen++; - } - - public void complete() { - for (SAMRecord read : reservoir.getDownsampledContents()) - chainedSelector.submitRead(read); - if (chainedSelector != null) - chainedSelector.complete(); - } - - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return reservoir.size(); - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public Collection getSelectedReads() { - return reservoir.getDownsampledContents(); - } - - public void reset() { - reservoir.clear(); - downsamplingExtent = 0; - if (chainedSelector != null) - chainedSelector.reset(); - } -} - -/** - * Note: stores reads by sample ID string, not by sample object - */ -class SamplePartitioner implements ReadSelector { - private final Map readsBySample; - private long readsSeen 
= 0; - - public SamplePartitioner(Map readSelectors) { - readsBySample = readSelectors; - } - - public void submitRead(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).submitRead(read); - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).notifyReadRejected(read); - readsSeen++; - } - - public void complete() { - // NO-OP. - } - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return readsSeen; - } - - public int getDownsamplingExtent() { - int downsamplingExtent = 0; - for (ReadSelector storage : readsBySample.values()) - downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent()); - return downsamplingExtent; - } - - public Collection getSelectedReads() { - throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner."); - } - - public ReadSelector getSelectedReads(String sampleName) { - if (!readsBySample.containsKey(sampleName)) - throw new NoSuchElementException("Sample name not found"); - return readsBySample.get(sampleName); - } - - public void reset() { - for (ReadSelector storage : readsBySample.values()) - storage.reset(); - readsSeen = 0; - } - -} +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index 7ae2bb453..605a6680f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -343,7 +343,7 @@ public class GATKReport { GATKReportTable table = tables.firstEntry().getValue(); if ( 
table.getNumColumns() != values.length ) - throw new ReviewedStingException("The number of arguments in writeRow() " + values.length + " must match the number of columns in the table" + table.getNumColumns()); + throw new ReviewedStingException("The number of arguments in writeRow (" + values.length + ") must match the number of columns in the table (" + table.getNumColumns() + ")" ); final int rowIndex = table.getNumRows(); for ( int i = 0; i < values.length; i++ ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index ae9b01f2d..a8ee4afde 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -59,8 +59,8 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { public enum Model { SNP, INDEL, - GeneralPloidySNP, - GeneralPloidyINDEL, + GENERALPLOIDYSNP, + GENERALPLOIDYINDEL, BOTH } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index cc086b148..8f2588679 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -52,7 +52,7 @@ import java.util.*; public class UnifiedGenotyperEngine { public static final String LOW_QUAL_FILTER_NAME = "LowQual"; - private static final String GPSTRING = "GeneralPloidy"; + private static final String GPSTRING = "GENERALPLOIDY"; public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; @@ -79,6 +79,7 @@ public class UnifiedGenotyperEngine { // the model used 
for calculating genotypes private ThreadLocal> glcm = new ThreadLocal>(); + private final List modelsToUse = new ArrayList(2); // the model used for calculating p(non-ref) private ThreadLocal afcm = new ThreadLocal(); @@ -134,6 +135,8 @@ public class UnifiedGenotyperEngine { computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); filter.add(LOW_QUAL_FILTER_NAME); + + determineGLModelsToUse(); } /** @@ -286,7 +289,7 @@ public class UnifiedGenotyperEngine { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - return glcm.get().get(model.name().toUpperCase()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); + return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { @@ -634,48 +637,51 @@ public class UnifiedGenotyperEngine { (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && QualityUtils.phredScaleErrorRate(PofF) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING); } + private void determineGLModelsToUse() { + + String modelPrefix = ""; + if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY ) + modelPrefix = GPSTRING; + + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { + modelPrefix += UAC.GLmodel.name().toUpperCase().replaceAll("BOTH",""); + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); + } + else { + 
modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase())); + } + } + // decide whether we are currently processing SNPs, indels, neither, or both private List getGLModelsToUse(final RefMetaDataTracker tracker, final ReferenceContext refContext, final AlignmentContext rawContext) { - final List models = new ArrayList(2); - String modelPrefix = ""; - if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) - modelPrefix = UAC.GLmodel.name().toUpperCase().replaceAll("BOTH",""); + if ( UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) + return modelsToUse; - if (!UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) - modelPrefix = GPSTRING + modelPrefix; + // if we're genotyping given alleles then we need to choose the model corresponding to the variant type requested + final List GGAmodel = new ArrayList(1); + final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); + if ( vcInput == null ) + return GGAmodel; // no work to be done - // if we're genotyping given alleles and we have a requested SNP at this position, do SNP - if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); - if ( vcInput == null ) - return models; - - if ( vcInput.isSNP() ) { - // ignore SNPs if the user chose INDEL mode only - if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") || UAC.GLmodel.name().toUpperCase().contains("SNP") ) - models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); - } - else if ( vcInput.isIndel() || vcInput.isMixed() ) { - // ignore INDELs if the user chose SNP mode only - if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") || 
UAC.GLmodel.name().toUpperCase().contains("INDEL") ) - models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); - } - // No support for other types yet + if ( vcInput.isSNP() ) { + // use the SNP model unless the user chose INDEL mode only + if ( modelsToUse.size() == 2 || modelsToUse.get(0).name().endsWith("SNP") ) + GGAmodel.add(modelsToUse.get(0)); } - else { - if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { - models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); - models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); - } - else { - models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase())); - } + else if ( vcInput.isIndel() || vcInput.isMixed() ) { + // use the INDEL model unless the user chose SNP mode only + if ( modelsToUse.size() == 2 ) + GGAmodel.add(modelsToUse.get(1)); + else if ( modelsToUse.get(0).name().endsWith("INDEL") ) + GGAmodel.add(modelsToUse.get(0)); } + // No support for other types yet - return models; + return GGAmodel; } public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index dbb0e8cdd..142469077 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import 
org.broadinstitute.sting.utils.variantcontext.Allele; @@ -52,7 +51,7 @@ public class AFCalcResult { private final double[] log10PriorsOfAC; private final double[] log10PosteriorsOfAC; - private final Map log10pNonRefByAllele; + private final Map log10pRefByAllele; /** * The AC values for all ALT alleles at the MLE @@ -74,16 +73,16 @@ public class AFCalcResult { final List allelesUsedInGenotyping, final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC, - final Map log10pNonRefByAllele) { + final Map log10pRefByAllele) { if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.size() < 1 ) throw new IllegalArgumentException("allelesUsedInGenotyping must be non-null list of at least 1 value " + allelesUsedInGenotyping); if ( alleleCountsOfMLE == null ) throw new IllegalArgumentException("alleleCountsOfMLE cannot be null"); if ( alleleCountsOfMLE.length != allelesUsedInGenotyping.size() - 1) throw new IllegalArgumentException("alleleCountsOfMLE.length " + alleleCountsOfMLE.length + " != allelesUsedInGenotyping.size() " + allelesUsedInGenotyping.size()); if ( nEvaluations < 0 ) throw new IllegalArgumentException("nEvaluations must be >= 0 but saw " + nEvaluations); if ( log10LikelihoodsOfAC.length != 2 ) throw new IllegalArgumentException("log10LikelihoodsOfAC must have length equal 2"); if ( log10PriorsOfAC.length != 2 ) throw new IllegalArgumentException("log10PriorsOfAC must have length equal 2"); - if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null"); - if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); - if ( ! 
allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( log10pRefByAllele == null ) throw new IllegalArgumentException("log10pRefByAllele cannot be null"); + if ( log10pRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pRefByAllele has the wrong number of elements: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( ! allelesUsedInGenotyping.containsAll(log10pRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pRefByAllele doesn't contain all of the alleles used in genotyping: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); if ( ! 
MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); @@ -94,7 +93,7 @@ public class AFCalcResult { this.log10LikelihoodsOfAC = Arrays.copyOf(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES); this.log10PriorsOfAC = Arrays.copyOf(log10PriorsOfAC, LOG_10_ARRAY_SIZES); this.log10PosteriorsOfAC = computePosteriors(log10LikelihoodsOfAC, log10PriorsOfAC); - this.log10pNonRefByAllele = new HashMap(log10pNonRefByAllele); + this.log10pRefByAllele = new HashMap(log10pRefByAllele); } /** @@ -104,7 +103,7 @@ public class AFCalcResult { * @return */ public AFCalcResult withNewPriors(final double[] log10PriorsOfAC) { - return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); } /** @@ -219,7 +218,7 @@ public class AFCalcResult { public String toString() { final List byAllele = new LinkedList(); for ( final Allele a : getAllelesUsedInGenotyping() ) - if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFGt0ForAllele(a))); + if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFEq0ForAllele(a))); return String.format("AFCalc%n\t\tlog10PosteriorOfAFGT0=%.2f%n\t\t%s", getLog10LikelihoodOfAFGT0(), Utils.join("\n\t\t", byAllele)); } @@ -231,13 +230,16 @@ public class AFCalcResult { * And that log10minPNonRef is -3. * We are considered polymorphic since 10^-5 < 10^-3 => -5 < -3 * + * Note that log10minPNonRef is really the minimum confidence, scaled as an error rate, so + * if you want to be 99% confidence, then log10PNonRef should be log10(0.01) = -2. 
+ * * @param log10minPNonRef the log10 scaled min pr of being non-ref to be considered polymorphic * * @return true if there's enough confidence (relative to log10minPNonRef) to reject AF == 0 */ @Requires("MathUtils.goodLog10Probability(log10minPNonRef)") public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) { - return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; + return getLog10PosteriorOfAFEq0ForAllele(allele) < log10minPNonRef; } /** @@ -245,7 +247,7 @@ public class AFCalcResult { */ public boolean isPolymorphicPhredScaledQual(final Allele allele, final double minPNonRefPhredScaledQual) { if ( minPNonRefPhredScaledQual < 0 ) throw new IllegalArgumentException("phredScaledQual " + minPNonRefPhredScaledQual + " < 0 "); - final double log10Threshold = Math.log10(QualityUtils.qualToProb(minPNonRefPhredScaledQual)); + final double log10Threshold = minPNonRefPhredScaledQual / -10; return isPolymorphic(allele, log10Threshold); } @@ -263,7 +265,16 @@ public class AFCalcResult { } /** - * Returns the log10 probability that allele is segregating + * Returns the log10 probability that allele is not segregating + * + * Note that this function is p not segregating so that we can store + * internally the log10 value of AF == 0, which grows very quickly + * negative and yet has sufficient resolution for high confidence tests. + * For example, if log10pRef == -100, not an unreasonably high number, + * if we tried to store log10pNonRef we'd be looking at 1 - 10^-100, which + * quickly underflows to 1. 
So the logic here is backward from what + * you really want (the p of segregating) but we do that for numerical + * reasons * * Unlike the sites-level annotation, this calculation is specific to allele, and can be * used to separately determine how much evidence there is that allele is independently @@ -272,11 +283,11 @@ public class AFCalcResult { * evidence for one allele but not so much for any other allele * * @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping - * @return the log10 probability that allele is segregating at this site + * @return the log10 probability that allele is not segregating at this site */ @Ensures("MathUtils.goodLog10Probability(result)") - public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) { - final Double log10pNonRef = log10pNonRefByAllele.get(allele); + public double getLog10PosteriorOfAFEq0ForAllele(final Allele allele) { + final Double log10pNonRef = log10pRefByAllele.get(allele); if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele); return log10pNonRef; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java index f13fe4429..b138ddf70 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java @@ -77,7 +77,7 @@ public class ExactCallLogger implements Cloneable { for ( final Allele allele : result.getAllelesUsedInGenotyping() ) { if ( allele.isNonReference() ) { printCallElement(vc, "MLE", allele, result.getAlleleCountAtMLE(allele)); - printCallElement(vc, "pNonRefByAllele", allele, result.getLog10PosteriorOfAFGt0ForAllele(allele)); + printCallElement(vc, "pRefByAllele", allele, result.getLog10PosteriorOfAFEq0ForAllele(allele)); } } @@ -123,7 +123,7 @@ public 
class ExactCallLogger implements Cloneable { final double[] posteriors = new double[2]; final double[] priors = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true); final List mle = new ArrayList(); - final Map log10pNonRefByAllele = new HashMap(); + final Map log10pRefByAllele = new HashMap(); long runtimeNano = -1; GenomeLoc currentLoc = null; @@ -148,7 +148,7 @@ public class ExactCallLogger implements Cloneable { builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); builder.genotypes(genotypes); final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[]{})); - final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); + final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pRefByAllele); calls.add(new ExactCall(builder.make(), runtimeNano, result)); } break; @@ -165,9 +165,9 @@ public class ExactCallLogger implements Cloneable { posteriors[1] = Double.valueOf(value); } else if (variable.equals("MLE")) { mle.add(Integer.valueOf(value)); - } else if (variable.equals("pNonRefByAllele")) { + } else if (variable.equals("pRefByAllele")) { final Allele a = Allele.create(key); - log10pNonRefByAllele.put(a, Double.valueOf(value)); + log10pRefByAllele.put(a, Double.valueOf(value)); } else if (variable.equals("runtime.nano")) { runtimeNano = Long.valueOf(value); } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index d0b801a20..937ef2ffc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -125,8 +125,8 @@ import java.util.*; */ final List supporting; - 
private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pNonRefByAllele, List supporting) { - super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pRefByAllele, List supporting) { + super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); this.supporting = supporting; } } @@ -323,7 +323,7 @@ import java.util.*; final int nAltAlleles = sortedResultsWithThetaNPriors.size(); final int[] alleleCountsOfMLE = new int[nAltAlleles]; final double[] log10PriorsOfAC = new double[2]; - final Map log10pNonRefByAllele = new HashMap(nAltAlleles); + final Map log10pRefByAllele = new HashMap(nAltAlleles); // the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs double log10PosteriorOfACEq0Sum = 0.0; @@ -348,7 +348,7 @@ import java.util.*; log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior - log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); + log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0()); // trivial -- update the number of evaluations nEvaluations += sortedResultWithThetaNPriors.nEvaluations; @@ -384,6 +384,6 @@ import java.util.*; MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // priors incorporate multiple alt alleles, must be normalized MathUtils.normalizeFromLog10(log10PriorsOfAC, true), - log10pNonRefByAllele, sortedResultsWithThetaNPriors); + log10pRefByAllele, sortedResultsWithThetaNPriors); } } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index fc26111e0..67cc79646 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -30,13 +30,13 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)}; final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors); - final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO; - final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef); + final double log10PRef = log10Posteriors[1] > log10Posteriors[0] ? 
MathUtils.LOG10_P_OF_ZERO : 0.0; + final Map log10pRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PRef); return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(), MathUtils.normalizeFromLog10(log10Likelihoods, true), MathUtils.normalizeFromLog10(log10Priors, true), - log10pNonRefByAllele); + log10pRefByAllele); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index b82ec1d29..ad6361a3f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -165,14 +165,14 @@ final class StateTracker { final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true); final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); + final Map log10pRefByAllele = new HashMap(allelesUsedInGenotyping.size()); for ( int i = 0; i < subACOfMLE.length; i++ ) { final Allele allele = allelesUsedInGenotyping.get(i+1); - final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was - log10pNonRefByAllele.put(allele, log10PNonRef); + final double log10PRef = alleleCountsOfMAP[i] > 0 ? 
-10000 : 0; // TODO -- a total hack but in effect what the old behavior was + log10pRefByAllele.put(allele, log10PRef); } - return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); + return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pRefByAllele); } // -------------------------------------------------------------------------------- diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 7b797432d..848aaf8a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -287,6 +287,9 @@ public class PairHMMIndelErrorModel { if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) { startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes; } + else if (startLocationInRefForHaplotypes > ref.getWindow().getStop()) { + startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely; + } if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context @@ -338,6 +341,8 @@ public class PairHMMIndelErrorModel { if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) startLocationInRefForHaplotypes = haplotype.getStartPosition(); + else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) + startLocationInRefForHaplotypes = haplotype.getStopPosition(); final long indStart = 
startLocationInRefForHaplotypes - haplotype.getStartPosition(); final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); @@ -347,8 +352,6 @@ public class PairHMMIndelErrorModel { System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); - - final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index eda43e6a5..7d848d0d4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -135,6 +135,9 @@ public class ReadBackedPhasing extends RodWalkerInput - *

- * The original and reduced BAM files. - *

- * - *

Output

- *

- * A list of intervals present in one bam but not the other. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -I:original original.bam \
- *   -I:reduced reduced.bam \
- *   -R ref.fasta \
- *   -T AssessReducedCoverage \
- *   -o output.intervals
- * 
- * - * @author ebanks - */ -@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) -@Hidden -public class AssessReducedCoverage extends LocusWalker implements TreeReducible { - - private static final String original = "original"; - private static final String reduced = "reduced"; - - @Output - protected PrintStream out; - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - @Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false) - public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false; - - public void initialize() {} - - public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - if ( tracker == null ) - return null; - - Set tags = getAllTags(context.getBasePileup()); - return (tags.contains(original) && !tags.contains(reduced)) || - (OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? 
ref.getLocus() : null; - } - - private Set getAllTags(final ReadBackedPileup pileup) { - - final Set tags = new HashSet(10); - - for ( final PileupElement p : pileup ) { - if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() ) - tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags()); - } - - return tags; - } - - public void onTraversalDone(GenomeLoc sum) { - if ( sum != null ) - out.println(sum); - } - - public GenomeLoc reduceInit() { - return null; - } - - public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { - if ( lhs == null ) - return rhs; - - if ( rhs == null ) - return lhs; - - // if contiguous, just merge them - if ( lhs.contiguousP(rhs) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); - - // otherwise, print the lhs and start over with the rhs - out.println(lhs); - return rhs; - } - - public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { - if ( value == null ) - return sum; - - if ( sum == null ) - return value; - - // if contiguous, just merge them - if ( sum.contiguousP(value) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); - - // otherwise, print the sum and start over with the value - out.println(sum); - return value; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java deleted file mode 100644 index 78bcf1228..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java +++ /dev/null @@ -1,173 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.List; - -/** - * Emits intervals in which the differences between the original and reduced bam quals are bigger epsilon (unless the quals of - * the reduced bam are above sufficient threshold) - * - *

Input

- *

- * The original and reduced BAM files. - *

- * - *

Output

- *

- * A list of intervals in which the differences between the original and reduced bam quals are bigger epsilon. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -I:original original.bam \
- *   -I:reduced reduced.bam \
- *   -R ref.fasta \
- *   -T AssessReducedQuals \
- *   -o output.intervals
- * 
- * - * @author ami - */ - -public class AssessReducedQuals extends LocusWalker implements TreeReducible { - - private static final String reduced = "reduced"; - private static final String original = "original"; - private static final int originalQualsIndex = 0; - private static final int reducedQualsIndex = 1; - - @Argument(fullName = "sufficientQualSum", shortName = "sufficientQualSum", doc = "When a reduced bam qual sum is above this threshold, it passes even without comparing to the non-reduced bam ", required = false) - public int sufficientQualSum = 600; - - @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > epsilon*Quals_original_bam we output this interval", required = false) - public int qual_epsilon = 0; - - @Argument(fullName = "debugLevel", shortName = "debug", doc = "debug mode on") // TODO -- best to make this optional - public int debugLevel = 0; // TODO -- best to make this an enum or boolean - - @Output - protected PrintStream out; - - public void initialize() { - if (debugLevel != 0) - out.println(" Debug mode" + - "Debug:\tsufficientQualSum: "+sufficientQualSum+ "\n " + - "Debug:\tqual_epsilon: "+qual_epsilon); - } - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - @Override - public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return null; - - boolean reportLocus; - final int[] quals = getPileupQuals(context.getBasePileup()); - if (debugLevel != 0) - out.println("Debug:\tLocus Quals\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]); - final int epsilon = MathUtils.fastRound(quals[originalQualsIndex]*qual_epsilon); - final int calcOriginalQuals = Math.min(quals[originalQualsIndex],sufficientQualSum); - final int calcReducedQuals = Math.min(quals[reducedQualsIndex],sufficientQualSum); - final int OriginalReducedQualDiff = 
calcOriginalQuals - calcReducedQuals; - reportLocus = OriginalReducedQualDiff > epsilon || OriginalReducedQualDiff < -1*epsilon; - if(debugLevel != 0 && reportLocus) - out.println("Debug:\tEmited Locus\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]+"\tepsilon\t"+epsilon+"\tdiff\t"+OriginalReducedQualDiff); - - return reportLocus ? ref.getLocus() : null; - } - - private final int[] getPileupQuals(final ReadBackedPileup readPileup) { - - final int[] quals = new int[2]; - String[] printPileup = {"Debug 2:\toriginal pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n", - "Debug 2:\t reduced pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n"}; - - for( PileupElement p : readPileup ){ - final List tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags(); - if ( isGoodRead(p,tags) ){ - final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount(); - final int tagIndex = getTagIndex(tags); - quals[tagIndex] += tempQual; - if(debugLevel == 2) - printPileup[tagIndex] += "\tDebug 2: ("+p+")\tMQ="+p.getMappingQual()+":QU="+p.getQual()+":RC="+p.getRepresentativeCount()+":OS="+p.getOffset()+"\n"; - } - } - if(debugLevel == 2){ - out.println(printPileup[originalQualsIndex]); - out.println(printPileup[reducedQualsIndex]); - } - return quals; - } - - // TODO -- arguments/variables should be final, not method declaration - private final boolean isGoodRead(PileupElement p, List tags){ - // TODO -- this isn't quite right. You don't need the tags here. Instead, you want to check whether the read itself (which - // TODO -- you can get from the PileupElement) is a reduced read (not all reads from the reduced bam are reduced) and only - // TODO -- for them do you want to ignore that min mapping quality cutoff (but you *do* still want the min base cutoff). 
- return !p.isDeletion() && (tags.contains(reduced) || (tags.contains(original) && (int)p.getQual() >= 20 && p.getMappingQual() >= 20)); - } - - private final int getTagIndex(List tags){ - return tags.contains(reduced) ? 1 : 0; - } - - @Override - public void onTraversalDone(GenomeLoc sum) { - if ( sum != null ) - out.println(sum); - } - - @Override - public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { - if ( lhs == null ) - return rhs; - - if ( rhs == null ) - return lhs; - - // if contiguous, just merge them - if ( lhs.contiguousP(rhs) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); - - // otherwise, print the lhs and start over with the rhs - out.println(lhs); - return rhs; - } - - @Override - public GenomeLoc reduceInit() { - return null; - } - - @Override - public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { - if ( value == null ) - return sum; - - if ( sum == null ) - return value; - - // if contiguous, just merge them - if ( sum.contiguousP(value) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); - - // otherwise, print the sum and start over with the value - out.println(sum); - return value; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index b1d8dc91d..68fac7631 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -134,7 +134,7 @@ public class CombineVariants extends RodWalker implements Tree protected VariantContextWriter vcfWriter = null; @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false) - public 
VariantContextUtils.GenotypeMergeType genotypeMergeOption = VariantContextUtils.GenotypeMergeType.PRIORITIZE; + public VariantContextUtils.GenotypeMergeType genotypeMergeOption = null; @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; @@ -200,13 +200,13 @@ public class CombineVariants extends RodWalker implements Tree } else logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); - if ( PRIORITY_STRING == null ) { + validateAnnotateUnionArguments(); + if ( PRIORITY_STRING == null && genotypeMergeOption == null) { genotypeMergeOption = VariantContextUtils.GenotypeMergeType.UNSORTED; //PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); Deleted by Ami (7/10/12) - logger.info("Priority string not provided, using arbitrary genotyping order: " + PRIORITY_STRING); + logger.info("Priority string not provided, using arbitrary genotyping order: "+priority); } - validateAnnotateUnionArguments(); samples = sitesOnlyVCF ? 
Collections.emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption); if ( SET_KEY.toLowerCase().equals("null") ) @@ -228,22 +228,22 @@ public class CombineVariants extends RodWalker implements Tree if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); - if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE ) + if ( PRIORITY_STRING != null){ priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); - else - priority = new ArrayList(rodNames); + if ( rodNames.size() != priority.size() ) + throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); - if ( rodNames.size() != priority.size() ) - throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); + if ( ! rodNames.containsAll(priority) ) + throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING); + } - if ( ! 
rodNames.containsAll(priority) ) - throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) // RodWalkers can make funky map calls return 0; + Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); // get all of the vcf rods at this locus // Need to provide reference bases to simpleMerge starting at current locus Collection vcs = tracker.getValues(variants, context.getLocation()); @@ -290,13 +290,13 @@ public class CombineVariants extends RodWalker implements Tree for (VariantContext.Type type : VariantContext.Type.values()) { if (VCsByType.containsKey(type)) mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), - priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + priority, rodNames.size() , filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } } else if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcs, - priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 4d2c26a79..ec82cdef2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -495,4 +495,14 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome public long sizeOfOverlap( final 
GenomeLoc that ) { return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) + 1L : 0L ); } + + /** + * Returns the maximum GenomeLoc of this and other + * @param other another non-null genome loc + * @return the max of this and other + */ + public GenomeLoc max(final GenomeLoc other) { + final int cmp = this.compareTo(other); + return cmp == -1 ? other : this; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java similarity index 94% rename from public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java rename to public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java index a758df431..ba863310d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java @@ -8,6 +8,8 @@ import java.util.Collection; import java.util.Iterator; /** + * THIS IMPLEMENTATION IS BROKEN AND WILL BE REMOVED ONCE THE DOWNSAMPLING ENGINE FORK COLLAPSES + * * Randomly downsample from a stream of elements. This algorithm is a direct, * naive implementation of reservoir downsampling as described in "Random Downsampling * with a Reservoir" (Vitter 1985). At time of writing, this paper is located here: @@ -16,7 +18,7 @@ import java.util.Iterator; * @author mhanna * @version 0.1 */ -public class ReservoirDownsampler { +public class LegacyReservoirDownsampler { /** * The reservoir of elements tracked by this downsampler. */ @@ -31,7 +33,7 @@ public class ReservoirDownsampler { * Create a new downsampler with the given source iterator and given comparator. 
* @param maxElements What is the maximum number of reads that can be returned in any call of this */ - public ReservoirDownsampler(final int maxElements) { + public LegacyReservoirDownsampler(final int maxElements) { if(maxElements < 0) throw new ReviewedStingException("Unable to work with an negative size collection of elements"); this.reservoir = new ArrayList(maxElements); diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java index bd99a9266..0e0237412 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; -import org.broadinstitute.sting.utils.SimpleTimer; import java.util.Iterator; import java.util.concurrent.BlockingQueue; @@ -19,11 +18,6 @@ class InputProducer implements Runnable { */ final Iterator inputReader; - /** - * Our timer (may be null) that we use to track our input costs - */ - final SimpleTimer inputTimer; - /** * Where we put our input values for consumption */ @@ -51,16 +45,13 @@ class InputProducer implements Runnable { public InputProducer(final Iterator inputReader, final MultiThreadedErrorTracker errorTracker, - final SimpleTimer inputTimer, final BlockingQueue outputQueue) { if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); if ( errorTracker == null ) throw new IllegalArgumentException("errorTracker cannot be null"); - if ( inputTimer == null ) throw new IllegalArgumentException("inputTimer cannot be null"); if ( outputQueue == null ) throw new IllegalArgumentException("OutputQueue cannot be null"); this.inputReader = inputReader; this.errorTracker = errorTracker; - this.inputTimer = 
inputTimer; this.outputQueue = outputQueue; } @@ -94,16 +85,15 @@ class InputProducer implements Runnable { * @throws InterruptedException */ private synchronized InputType readNextItem() throws InterruptedException { - inputTimer.restart(); if ( ! inputReader.hasNext() ) { // we are done, mark ourselves as such and return null readLastValue = true; - inputTimer.stop(); return null; } else { // get the next value, and return it final InputType input = inputReader.next(); - inputTimer.stop(); + if ( input == null ) + throw new IllegalStateException("inputReader.next() returned a null value, breaking our contract"); nRead++; return input; } @@ -121,6 +111,9 @@ class InputProducer implements Runnable { final InputType value = readNextItem(); if ( value == null ) { + if ( ! readLastValue ) + throw new IllegalStateException("value == null but readLastValue is false!"); + // add the EOF object so our consumer knows we are done in all inputs // note that we do not increase inputID here, so that variable indicates the ID // of the last real value read from the queue @@ -133,8 +126,10 @@ class InputProducer implements Runnable { } latch.countDown(); - } catch (Exception ex) { + } catch (Throwable ex) { errorTracker.notifyOfError(ex); + } finally { +// logger.info("Exiting input thread readLastValue = " + readLastValue); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java deleted file mode 100644 index 0926b4c50..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java +++ /dev/null @@ -1,67 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.AutoFormattingTime; -import org.broadinstitute.sting.utils.SimpleTimer; - -/** - * Holds runtime profile (input, read, map) times as tracked by NanoScheduler - * - * User: depristo 
- * Date: 9/10/12 - * Time: 8:31 PM - */ -public class NSRuntimeProfile { - final SimpleTimer outsideSchedulerTimer = new SimpleTimer("outside"); - final SimpleTimer inputTimer = new SimpleTimer("input"); - final SimpleTimer mapTimer = new SimpleTimer("map"); - final SimpleTimer reduceTimer = new SimpleTimer("reduce"); - - /** - * Combine the elapsed time information from other with this profile - * - * @param other a non-null profile - */ - public void combine(final NSRuntimeProfile other) { - outsideSchedulerTimer.addElapsed(other.outsideSchedulerTimer); - inputTimer.addElapsed(other.inputTimer); - mapTimer.addElapsed(other.mapTimer); - reduceTimer.addElapsed(other.reduceTimer); - } - - /** - * Print the runtime profiling to logger - * - * @param logger - */ - public void log(final Logger logger) { - log1(logger, "Input time", inputTimer); - log1(logger, "Map time", mapTimer); - log1(logger, "Reduce time", reduceTimer); - log1(logger, "Outside time", outsideSchedulerTimer); - } - - /** - * @return the total runtime for all functions of this nano scheduler - */ - //@Ensures("result >= 0.0") - public double totalRuntimeInSeconds() { - return inputTimer.getElapsedTime() - + mapTimer.getElapsedTime() - + reduceTimer.getElapsedTime() - + outsideSchedulerTimer.getElapsedTime(); - } - - /** - * Print to logger.info timing information from timer, with name label - * - * @param label the name of the timer to display. 
Should be human readable - * @param timer the timer whose elapsed time we will display - */ - //@Requires({"label != null", "timer != null"}) - private void log1(final Logger logger, final String label, final SimpleTimer timer) { - final double myTimeInSec = timer.getElapsedTime(); - final double myTimePercent = myTimeInSec / totalRuntimeInSeconds() * 100; - logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index d83a23c0f..4cc91faa4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -57,16 +57,6 @@ public class NanoScheduler { boolean debug = false; private NSProgressFunction progressFunction = null; - /** - * Tracks the combined runtime profiles across all created nano schedulers - */ - final static private NSRuntimeProfile combinedNSRuntimeProfiler = new NSRuntimeProfile(); - - /** - * The profile specific to this nano scheduler - */ - final private NSRuntimeProfile myNSRuntimeProfile = new NSRuntimeProfile(); - /** * Create a new nanoscheduler with the desire characteristics requested by the argument * @@ -94,9 +84,6 @@ public class NanoScheduler { this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); this.masterExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-master-thread-%d")); } - - // start timing the time spent outside of the nanoScheduler - myNSRuntimeProfile.outsideSchedulerTimer.start(); } /** @@ -123,11 +110,6 @@ public class NanoScheduler { * After this call, execute cannot be invoked without throwing an error */ public void shutdown() { - myNSRuntimeProfile.outsideSchedulerTimer.stop(); - - // add my timing 
information to the combined NS runtime profile - combinedNSRuntimeProfiler.combine(myNSRuntimeProfile); - if ( nThreads > 1 ) { shutdownExecutor("inputExecutor", inputExecutor); shutdownExecutor("mapExecutor", mapExecutor); @@ -137,19 +119,6 @@ public class NanoScheduler { shutdown = true; } - public void printRuntimeProfile() { - myNSRuntimeProfile.log(logger); - } - - public static void printCombinedRuntimeProfile() { - if ( combinedNSRuntimeProfiler.totalRuntimeInSeconds() > 0.1 ) - combinedNSRuntimeProfiler.log(logger); - } - - protected double getTotalRuntime() { - return myNSRuntimeProfile.totalRuntimeInSeconds(); - } - /** * Helper function to cleanly shutdown an execution service, checking that the execution * state is clean when it's done. @@ -245,8 +214,6 @@ public class NanoScheduler { if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); - myNSRuntimeProfile.outsideSchedulerTimer.stop(); - ReduceType result; if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { result = executeSingleThreaded(inputReader, map, initialValue, reduce); @@ -254,7 +221,6 @@ public class NanoScheduler { result = executeMultiThreaded(inputReader, map, initialValue, reduce); } - myNSRuntimeProfile.outsideSchedulerTimer.restart(); return result; } @@ -273,28 +239,19 @@ public class NanoScheduler { while ( true ) { // start timer to ensure that both hasNext and next are caught by the timer - myNSRuntimeProfile.inputTimer.restart(); if ( ! inputReader.hasNext() ) { - myNSRuntimeProfile.inputTimer.stop(); break; } else { final InputType input = inputReader.next(); - myNSRuntimeProfile.inputTimer.stop(); // map - myNSRuntimeProfile.mapTimer.restart(); - final long preMapTime = LOG_MAP_TIMES ? 
0 : myNSRuntimeProfile.mapTimer.currentTimeNano(); final MapType mapValue = map.apply(input); - if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (myNSRuntimeProfile.mapTimer.currentTimeNano() - preMapTime)); - myNSRuntimeProfile.mapTimer.stop(); - if ( i++ % this.bufferSize == 0 && progressFunction != null ) + if ( progressFunction != null ) progressFunction.progress(input); // reduce - myNSRuntimeProfile.reduceTimer.restart(); sum = reduce.apply(mapValue, sum); - myNSRuntimeProfile.reduceTimer.stop(); } } @@ -320,6 +277,7 @@ public class NanoScheduler { while ( true ) { // check that no errors occurred while we were waiting handleErrors(); +// checkForDeadlocks(); try { final ReduceType result = reduceResult.get(100, TimeUnit.MILLISECONDS); @@ -341,6 +299,26 @@ public class NanoScheduler { } } +// private void checkForDeadlocks() { +// if ( deadLockCheckCounter++ % 100 == 0 ) { +// logger.info("Checking for deadlocks..."); +// final ThreadMXBean bean = ManagementFactory.getThreadMXBean(); +// final long[] threadIds = bean.findDeadlockedThreads(); // Returns null if no threads are deadlocked. +// +// if (threadIds != null) { +// final ThreadInfo[] infos = bean.getThreadInfo(threadIds); +// +// logger.error("!!! 
Deadlock detected !!!!"); +// for (final ThreadInfo info : infos) { +// logger.error("Thread " + info); +// for ( final StackTraceElement elt : info.getStackTrace() ) { +// logger.error("\t" + elt.toString()); +// } +// } +// } +// } +// } + private void handleErrors() { if ( errorTracker.hasAnErrorOccurred() ) { masterExecutor.shutdownNow(); @@ -380,7 +358,7 @@ public class NanoScheduler { // Create the input producer and start it running final InputProducer inputProducer = - new InputProducer(inputReader, errorTracker, myNSRuntimeProfile.inputTimer, inputQueue); + new InputProducer(inputReader, errorTracker, inputQueue); inputExecutor.submit(inputProducer); // a priority queue that stores up to bufferSize elements @@ -389,7 +367,7 @@ public class NanoScheduler { new PriorityBlockingQueue>(); final Reducer reducer - = new Reducer(reduce, errorTracker, myNSRuntimeProfile.reduceTimer, initialValue); + = new Reducer(reduce, errorTracker, initialValue); try { int nSubmittedJobs = 0; @@ -408,7 +386,8 @@ public class NanoScheduler { // wait for all of the input and map threads to finish return waitForCompletion(inputProducer, reducer); - } catch (Exception ex) { + } catch (Throwable ex) { +// logger.warn("Reduce job got exception " + ex); errorTracker.notifyOfError(ex); return initialValue; } @@ -486,16 +465,12 @@ public class NanoScheduler { final InputType input = inputWrapper.getValue(); // map - myNSRuntimeProfile.mapTimer.restart(); - final long preMapTime = LOG_MAP_TIMES ? 
0 : myNSRuntimeProfile.mapTimer.currentTimeNano(); final MapType mapValue = map.apply(input); - if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (myNSRuntimeProfile.mapTimer.currentTimeNano() - preMapTime)); - myNSRuntimeProfile.mapTimer.stop(); // enqueue the result into the mapResultQueue result = new MapResult(mapValue, jobID); - if ( jobID % bufferSize == 0 && progressFunction != null ) + if ( progressFunction != null ) progressFunction.progress(input); } else { // push back the EOF marker so other waiting threads can read it @@ -508,7 +483,8 @@ public class NanoScheduler { mapResultQueue.put(result); final int nReduced = reducer.reduceAsMuchAsPossible(mapResultQueue); - } catch (Exception ex) { + } catch (Throwable ex) { +// logger.warn("Map job got exception " + ex); errorTracker.notifyOfError(ex); } finally { // we finished a map job, release the job queue semaphore diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java index 92c1018eb..5cae28187 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java @@ -4,7 +4,6 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; -import org.broadinstitute.sting.utils.SimpleTimer; import java.util.concurrent.CountDownLatch; import java.util.concurrent.PriorityBlockingQueue; @@ -34,7 +33,6 @@ class Reducer { final CountDownLatch countDownLatch = new CountDownLatch(1); final NSReduceFunction reduce; - final SimpleTimer reduceTimer; final MultiThreadedErrorTracker errorTracker; /** @@ -61,20 +59,16 @@ class Reducer { * reduceTimer * * @param reduce the reduce function to apply - * @param reduceTimer the timer to time the reduce function call * @param initialSum the initial reduce sum 
*/ public Reducer(final NSReduceFunction reduce, final MultiThreadedErrorTracker errorTracker, - final SimpleTimer reduceTimer, final ReduceType initialSum) { if ( errorTracker == null ) throw new IllegalArgumentException("Error tracker cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("Reduce function cannot be null"); - if ( reduceTimer == null ) throw new IllegalArgumentException("reduceTimer cannot be null"); this.errorTracker = errorTracker; this.reduce = reduce; - this.reduceTimer = reduceTimer; this.sum = initialSum; } @@ -125,10 +119,7 @@ class Reducer { nReducesNow++; // apply reduce, keeping track of sum - reduceTimer.restart(); sum = reduce.apply(result.getValue(), sum); - reduceTimer.stop(); - } numJobsReduced++; diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java index a8715e242..b69283b9d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.progressmeter; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -143,6 +144,12 @@ public class ProgressMeter { /** We use the SimpleTimer to time our run */ private final SimpleTimer timer = new SimpleTimer(); + private GenomeLoc maxGenomeLoc = null; + private String positionMessage = "starting"; + private long nTotalRecordsProcessed = 0; + + final ProgressMeterDaemon progressMeterDaemon; + /** * Create a new ProgressMeter * @@ -177,21 +184,15 @@ public class ProgressMeter { targetSizeInBP = processingIntervals.coveredSize(); // start up the timer + progressMeterDaemon = new 
ProgressMeterDaemon(this); start(); } /** - * Forward request to notifyOfProgress - * - * Assumes that one cycle has been completed - * - * @param loc our current location. Null means "in unmapped reads" - * @param nTotalRecordsProcessed the total number of records we've processed + * Start up the progress meter, printing initialization message and starting up the + * daemon thread for periodic printing. */ - public void notifyOfProgress(final GenomeLoc loc, final long nTotalRecordsProcessed) { - notifyOfProgress(loc, false, nTotalRecordsProcessed); - } - + @Requires("progressMeterDaemon != null") private synchronized void start() { timer.start(); lastProgressPrintTime = timer.currentTime(); @@ -199,6 +200,8 @@ public class ProgressMeter { logger.info("[INITIALIZATION COMPLETE; STARTING PROCESSING]"); logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", "Location", processingUnitName, processingUnitName)); + + progressMeterDaemon.start(); } /** @@ -216,19 +219,41 @@ public class ProgressMeter { * Synchronized to ensure that even with multiple threads calling notifyOfProgress we still * get one clean stream of meter logs. * + * Note this thread doesn't actually print progress, unless must print is true, but just registers + * the progress itself. 
A separate printing daemon periodically polls the meter to print out + * progress + * * @param loc Current location, can be null if you are at the end of the processing unit - * @param mustPrint If true, will print out info, regardless of time interval * @param nTotalRecordsProcessed the total number of records we've processed */ - private synchronized void notifyOfProgress(final GenomeLoc loc, boolean mustPrint, final long nTotalRecordsProcessed) { + public synchronized void notifyOfProgress(final GenomeLoc loc, final long nTotalRecordsProcessed) { if ( nTotalRecordsProcessed < 0 ) throw new IllegalArgumentException("nTotalRecordsProcessed must be >= 0"); + // weird comparison to ensure that loc == null (in unmapped reads) is keep before maxGenomeLoc == null (on startup) + this.maxGenomeLoc = loc == null ? loc : (maxGenomeLoc == null ? loc : loc.max(maxGenomeLoc)); + this.nTotalRecordsProcessed = Math.max(this.nTotalRecordsProcessed, nTotalRecordsProcessed); + + // a pretty name for our position + this.positionMessage = maxGenomeLoc == null + ? "unmapped reads" + : String.format("%s:%d", maxGenomeLoc.getContig(), maxGenomeLoc.getStart()); + } + + /** + * Actually try to print out progress + * + * This function may print out if the progress print is due, but if not enough time has elapsed + * since the last print we will not print out information. 
+ * + * @param mustPrint if true, progress will be printed regardless of the last time we printed progress + */ + protected synchronized void printProgress(final boolean mustPrint) { final long curTime = timer.currentTime(); final boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency); final boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); if ( printProgress || printLog ) { - final ProgressMeterData progressData = takeProgressSnapshot(loc, nTotalRecordsProcessed); + final ProgressMeterData progressData = takeProgressSnapshot(maxGenomeLoc, nTotalRecordsProcessed); final AutoFormattingTime elapsed = new AutoFormattingTime(progressData.getElapsedSeconds()); final AutoFormattingTime bpRate = new AutoFormattingTime(progressData.secondsPerMillionBP()); @@ -241,13 +266,8 @@ public class ProgressMeter { lastProgressPrintTime = curTime; updateLoggerPrintFrequency(estTotalRuntime.getTimeInSeconds()); - // a pretty name for our position - final String posName = loc == null - ? (mustPrint ? 
"done" : "unmapped reads") - : String.format("%s:%d", loc.getContig(), loc.getStart()); - logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", - posName, progressData.getUnitsProcessed()*1.0, elapsed, unitRate, + positionMessage, progressData.getUnitsProcessed()*1.0, elapsed, unitRate, 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); } @@ -296,13 +316,18 @@ public class ProgressMeter { */ public void notifyDone(final long nTotalRecordsProcessed) { // print out the progress meter - notifyOfProgress(null, true, nTotalRecordsProcessed); + this.nTotalRecordsProcessed = nTotalRecordsProcessed; + this.positionMessage = "done"; + printProgress(true); logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours", timer.getElapsedTime(), timer.getElapsedTime() / 60, timer.getElapsedTime() / 3600)); if ( performanceLog != null ) performanceLog.close(); + + // shutdown our daemon thread + progressMeterDaemon.done(); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java new file mode 100644 index 000000000..16887400a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java @@ -0,0 +1,60 @@ +package org.broadinstitute.sting.utils.progressmeter; + +/** + * Daemon thread that periodically prints the progress of the progress meter + * + * User: depristo + * Date: 12/4/12 + * Time: 9:16 PM + */ +public final class ProgressMeterDaemon extends Thread { + /** + * How frequently should we poll and print progress? + */ + private final static long POLL_FREQUENCY_MILLISECONDS = 10 * 1000; + + /** + * Are we to continue periodically printing status, or should we shut down? 
+ */ + boolean done = false; + + /** + * The meter we will call print on + */ + final ProgressMeter meter; + + /** + * Create a new ProgressMeterDaemon printing progress for meter + * @param meter the progress meter to print progress of + */ + public ProgressMeterDaemon(final ProgressMeter meter) { + if ( meter == null ) throw new IllegalArgumentException("meter cannot be null"); + this.meter = meter; + setDaemon(true); + setName("ProgressMeterDaemon"); + } + + /** + * Tells this daemon thread to shutdown at the next opportunity, as the progress + * metering is complete. + */ + public final void done() { + this.done = true; + } + + /** + * Start up the ProgressMeterDaemon, polling every tens of seconds to print, if + * necessary, the provided progress meter. Never exits until the JVM is complete, + * or done() is called, as the thread is a daemon thread + */ + public void run() { + while (! done) { + meter.printProgress(false); + try { + Thread.sleep(POLL_FREQUENCY_MILLISECONDS); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index b3e3cf8df..8b360eb5e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -448,11 +448,47 @@ public class VariantContextUtils { final String setKey, final boolean filteredAreUncalled, final boolean mergeInfoWithMaxAC ) { + int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); + return simpleMerge(genomeLocParser,unsortedVCs,priorityListOfVCs,originalNumOfVCs,filteredRecordMergeType,genotypeMergeOptions,annotateOrigin,printMessages,setKey,filteredAreUncalled,mergeInfoWithMaxAC); + } + + /** + * Merges VariantContexts into a single hybrid. 
Takes genotypes for common samples in priority order, if provided. + * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name + * + * @param genomeLocParser loc parser + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @return new VariantContext representing the merge of unsortedVCs + */ + public static VariantContext simpleMerge(final GenomeLocParser genomeLocParser, + final Collection unsortedVCs, + final List priorityListOfVCs, + final int originalNumOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { + if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; - if ( annotateOrigin && priorityListOfVCs == null ) - throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts"); + if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) + throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); + + if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) + throw new IllegalArgumentException("Cannot merge calls and annotate 
their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE ) verifyUniqueSampleNames(unsortedVCs); @@ -597,7 +633,7 @@ public class VariantContextUtils { if ( annotateOrigin ) { // we care about where the call came from String setValue; - if ( nFiltered == 0 && variantSources.size() == priorityListOfVCs.size() ) // nothing was unfiltered + if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // nothing was unfiltered setValue = MERGE_INTERSECTION; else if ( nFiltered == VCs.size() ) // everything was filtered out setValue = MERGE_FILTER_IN_ALL; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 9a987f161..974e50ced 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -248,7 +248,8 @@ class VCFWriter extends IndexingVariantContextWriter { } mWriter.write("\n"); - mWriter.flush(); // necessary so that writing to an output stream will work + // note that we cannot call flush here if we want block gzipping to work properly + // calling flush results in all gzipped blocks for each variant } catch (IOException e) { throw new RuntimeException("Unable to write the VCF object to " + getStreamName(), e); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 61c1c51b4..8186f69cf 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -32,11 +32,10 @@ import net.sf.samtools.SAMRecord; 
import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; +import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.walkers.qc.CountLoci; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -85,7 +84,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? 
Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); - LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + LegacyLocusIteratorByState locusIteratorByState = new LegacyLocusIteratorByState(readIterator,readProperties,genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); while(locusIteratorByState.hasNext()) { locusIteratorByState.next().getLocation(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java similarity index 97% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java index 0807f36dc..73664141f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java @@ -45,10 +45,10 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -public class ExperimentalReadShardBalancerUnitTest extends BaseTest { +public class ReadShardBalancerUnitTest extends BaseTest { /** - * Tests to ensure that ExperimentalReadShardBalancer works as expected and does not place shard boundaries + * Tests to ensure that ReadShardBalancer works as expected and does not place shard boundaries * at inappropriate places, such as within an alignment start position */ private static class ExperimentalReadShardBalancerTest extends TestDataProvider { @@ -74,7 +74,7 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest { this.stackSize = stackSize; this.numUnmappedReads = 
numUnmappedReads; - this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true); + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, false); this.expectedReadCount = Math.min(stackSize, downsamplingTargetCoverage) * numStacksPerContig * numContigs + numUnmappedReads; setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", @@ -96,7 +96,7 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest { new ArrayList(), false); - Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ExperimentalReadShardBalancer()); + Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); SAMRecord readAtEndOfLastShard = null; int totalReadsSeen = 0; diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java similarity index 50% rename from public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java index a49a602c6..44d182661 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java @@ -7,10 +7,8 @@ import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import 
org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -21,14 +19,17 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; /** - * testing of the experimental version of LocusIteratorByState + * testing of the LEGACY version of LocusIteratorByState */ -public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { +public class LegacyLocusIteratorByStateUnitTest extends BaseTest { private static SAMFileHeader header; - private LocusIteratorByStateExperimental li; + private LegacyLocusIteratorByState li; private GenomeLocParser genomeLocParser; @BeforeClass @@ -37,8 +38,8 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } - private LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { - return new LocusIteratorByStateExperimental(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups()); + private LegacyLocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { + return new LegacyLocusIteratorByState(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); } @Test @@ -328,184 +329,14 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { // End comprehensive LIBS/PileupElement tests // //////////////////////////////////////////////// - - /////////////////////////////////////// - // Read State Manager Tests // - 
/////////////////////////////////////// - - private class PerSampleReadStateManagerTest extends TestDataProvider { - private List readCountsPerAlignmentStart; - private List reads; - private List> recordStatesByAlignmentStart; - private int removalInterval; - - public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { - super(PerSampleReadStateManagerTest.class); - - this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; - this.removalInterval = removalInterval; - - reads = new ArrayList(); - recordStatesByAlignmentStart = new ArrayList>(); - - setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", - getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); - } - - public void run() { - LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList(), createTestReadProperties()); - LocusIteratorByStateExperimental.ReadStateManager readStateManager = - libs.new ReadStateManager(new ArrayList().iterator()); - LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = - readStateManager.new PerSampleReadStateManager(); - - makeReads(); - - for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { - perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); - } - - // read state manager should have the right number of reads - Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); - - Iterator originalReadsIterator = reads.iterator(); - Iterator recordStateIterator = perSampleReadStateManager.iterator(); - int recordStateCount = 0; - int numReadStatesRemoved = 0; - - // Do a first-pass validation of the record state iteration by making sure we get back everything we - // put in, in the same order, doing any requested removals of read states along the way - while ( recordStateIterator.hasNext() ) { - LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); - 
recordStateCount++; - SAMRecord readFromPerSampleReadStateManager = readState.getRead(); - - Assert.assertTrue(originalReadsIterator.hasNext()); - SAMRecord originalRead = originalReadsIterator.next(); - - // The read we get back should be literally the same read in memory as we put in - Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); - - // If requested, remove a read state every removalInterval states - if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { - recordStateIterator.remove(); - numReadStatesRemoved++; - } - } - - Assert.assertFalse(originalReadsIterator.hasNext()); - - // If we removed any read states, do a second pass through the read states to make sure the right - // states were removed - if ( numReadStatesRemoved > 0 ) { - Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); - - originalReadsIterator = reads.iterator(); - recordStateIterator = perSampleReadStateManager.iterator(); - int readCount = 0; - int readStateCount = 0; - - // Match record states with the reads that should remain after removal - while ( recordStateIterator.hasNext() ) { - LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); - readStateCount++; - SAMRecord readFromPerSampleReadStateManager = readState.getRead(); - - Assert.assertTrue(originalReadsIterator.hasNext()); - - SAMRecord originalRead = originalReadsIterator.next(); - readCount++; - - if ( readCount % removalInterval == 0 ) { - originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded - readCount++; - } - - // The read we get back should be literally the same read in memory as we put in (after accounting for removals) - Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); - } - - Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); - } - - // Allow memory used by this test to be reclaimed - 
readCountsPerAlignmentStart = null; - reads = null; - recordStatesByAlignmentStart = null; - } - - private void makeReads() { - int alignmentStart = 1; - - for ( int readsThisStack : readCountsPerAlignmentStart ) { - ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); - ArrayList stackRecordStates = new ArrayList(); - - for ( SAMRecord read : stackReads ) { - stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read)); - } - - reads.addAll(stackReads); - recordStatesByAlignmentStart.add(stackRecordStates); - } - } - } - - @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") - public Object[][] createPerSampleReadStateManagerTests() { - for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), - Arrays.asList(2), - Arrays.asList(10), - Arrays.asList(1, 1), - Arrays.asList(2, 2), - Arrays.asList(10, 10), - Arrays.asList(1, 10), - Arrays.asList(10, 1), - Arrays.asList(1, 1, 1), - Arrays.asList(2, 2, 2), - Arrays.asList(10, 10, 10), - Arrays.asList(1, 1, 1, 1, 1, 1), - Arrays.asList(10, 10, 10, 10, 10, 10), - Arrays.asList(1, 2, 10, 1, 2, 10) - ) ) { - - for ( int removalInterval : Arrays.asList(0, 2, 3) ) { - new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); - } - } - - return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); - } - - @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") - public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { - logger.warn("Running test: " + test); - - test.run(); - } - - /////////////////////////////////////// - // End Read State Manager Tests // - /////////////////////////////////////// - - - - /////////////////////////////////////// - // Helper methods / classes // - /////////////////////////////////////// - private static ReadProperties 
createTestReadProperties() { - return createTestReadProperties(null); - } - - private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { return new ReadProperties( Collections.emptyList(), new SAMFileHeader(), SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.STRICT, - downsamplingMethod, + null, new ValidationExclusion(), Collections.emptyList(), Collections.emptyList(), @@ -513,136 +344,137 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { (byte) -1 ); } +} - private static class FakeCloseableIterator implements CloseableIterator { - Iterator iterator; +class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; - public FakeCloseableIterator(Iterator it) { - iterator = it; - } - - @Override - public void close() {} - - @Override - public boolean hasNext() { - return iterator.hasNext(); - } - - @Override - public T next() { - return iterator.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Don't remove!"); - } + public FakeCloseableIterator(Iterator it) { + iterator = it; } - private static final class LIBS_position { + @Override + public void close() {} - SAMRecord read; + @Override + public boolean hasNext() { + return iterator.hasNext(); + } - final int numOperators; - int currentOperatorIndex = 0; - int currentPositionOnOperator = 0; - int currentReadOffset = 0; + @Override + public T next() { + return iterator.next(); + } - boolean isBeforeDeletionStart = false; - boolean isBeforeDeletedBase = false; - boolean isAfterDeletionEnd = false; - boolean isAfterDeletedBase = false; - boolean isBeforeInsertion = false; - boolean isAfterInsertion = false; - boolean isNextToSoftClip = false; - - boolean sawMop = false; - - public LIBS_position(final SAMRecord read) { - this.read = read; - numOperators = read.getCigar().numCigarElements(); - } - - public int getCurrentReadOffset() { - return Math.max(0, 
currentReadOffset - 1); - } - - /** - * Steps forward on the genome. Returns false when done reading the read, true otherwise. - */ - public boolean stepForwardOnGenome() { - if ( currentOperatorIndex == numOperators ) - return false; - - CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); - if ( currentPositionOnOperator >= curElement.getLength() ) { - if ( ++currentOperatorIndex == numOperators ) - return false; - - curElement = read.getCigar().getCigarElement(currentOperatorIndex); - currentPositionOnOperator = 0; - } - - switch ( curElement.getOperator() ) { - case I: // insertion w.r.t. the reference - if ( !sawMop ) - break; - case S: // soft clip - currentReadOffset += curElement.getLength(); - case H: // hard clip - case P: // padding - currentOperatorIndex++; - return stepForwardOnGenome(); - - case D: // deletion w.r.t. the reference - case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - currentPositionOnOperator++; - break; - - case M: - case EQ: - case X: - sawMop = true; - currentReadOffset++; - currentPositionOnOperator++; - break; - default: - throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); - } - - final boolean isFirstOp = currentOperatorIndex == 0; - final boolean isLastOp = currentOperatorIndex == numOperators - 1; - final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; - final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); - - isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); - isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); - isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); - isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); 
- isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) - || (!sawMop && curElement.getOperator() == CigarOperator.I); - isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); - isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) - || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); - - return true; - } - - private static boolean isBeforeOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isLastOp, - final boolean isLastBaseOfOp) { - return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; - } - - private static boolean isAfterOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isFirstOp, - final boolean isFirstBaseOfOp) { - return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; - } + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } +} + + +final class LIBS_position { + + SAMRecord read; + + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; + + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + + boolean sawMop = false; + + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } + + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } + + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. 
+ */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) + return false; + + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; + + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; + break; + + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } + + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || 
(!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; + } + + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } + + private static boolean isAfterOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 83913fa76..4e4126ad4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -7,8 +7,10 @@ import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import 
org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -19,13 +21,10 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.Arrays; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; +import java.util.*; /** - * testing of the LocusIteratorByState + * testing of the new (non-legacy) version of LocusIteratorByState */ public class LocusIteratorByStateUnitTest extends BaseTest { private static SAMFileHeader header; @@ -329,14 +328,184 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // End comprehensive LIBS/PileupElement tests // //////////////////////////////////////////////// + + /////////////////////////////////////// + // Read State Manager Tests // + /////////////////////////////////////// + + private class PerSampleReadStateManagerTest extends TestDataProvider { + private List readCountsPerAlignmentStart; + private List reads; + private List> recordStatesByAlignmentStart; + private int removalInterval; + + public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { + super(PerSampleReadStateManagerTest.class); + + this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; + this.removalInterval = removalInterval; + + reads = new ArrayList(); + recordStatesByAlignmentStart = new ArrayList>(); + + setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", + getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); + } + + public void run() { + LocusIteratorByState libs = makeLTBS(new ArrayList(), createTestReadProperties()); + LocusIteratorByState.ReadStateManager readStateManager = + libs.new ReadStateManager(new ArrayList().iterator()); + LocusIteratorByState.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = + readStateManager.new PerSampleReadStateManager(); + + 
makeReads(); + + for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { + perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + } + + // read state manager should have the right number of reads + Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); + + Iterator originalReadsIterator = reads.iterator(); + Iterator recordStateIterator = perSampleReadStateManager.iterator(); + int recordStateCount = 0; + int numReadStatesRemoved = 0; + + // Do a first-pass validation of the record state iteration by making sure we get back everything we + // put in, in the same order, doing any requested removals of read states along the way + while ( recordStateIterator.hasNext() ) { + LocusIteratorByState.SAMRecordState readState = recordStateIterator.next(); + recordStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + SAMRecord originalRead = originalReadsIterator.next(); + + // The read we get back should be literally the same read in memory as we put in + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + + // If requested, remove a read state every removalInterval states + if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { + recordStateIterator.remove(); + numReadStatesRemoved++; + } + } + + Assert.assertFalse(originalReadsIterator.hasNext()); + + // If we removed any read states, do a second pass through the read states to make sure the right + // states were removed + if ( numReadStatesRemoved > 0 ) { + Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); + + originalReadsIterator = reads.iterator(); + recordStateIterator = perSampleReadStateManager.iterator(); + int readCount = 0; + int readStateCount = 0; + + // Match record states with the reads that should remain after removal + while ( recordStateIterator.hasNext() ) { + 
LocusIteratorByState.SAMRecordState readState = recordStateIterator.next(); + readStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + + SAMRecord originalRead = originalReadsIterator.next(); + readCount++; + + if ( readCount % removalInterval == 0 ) { + originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded + readCount++; + } + + // The read we get back should be literally the same read in memory as we put in (after accounting for removals) + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + } + + Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); + } + + // Allow memory used by this test to be reclaimed + readCountsPerAlignmentStart = null; + reads = null; + recordStatesByAlignmentStart = null; + } + + private void makeReads() { + int alignmentStart = 1; + + for ( int readsThisStack : readCountsPerAlignmentStart ) { + ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); + ArrayList stackRecordStates = new ArrayList(); + + for ( SAMRecord read : stackReads ) { + stackRecordStates.add(new LocusIteratorByState.SAMRecordState(read)); + } + + reads.addAll(stackReads); + recordStatesByAlignmentStart.add(stackRecordStates); + } + } + } + + @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") + public Object[][] createPerSampleReadStateManagerTests() { + for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), + Arrays.asList(2), + Arrays.asList(10), + Arrays.asList(1, 1), + Arrays.asList(2, 2), + Arrays.asList(10, 10), + Arrays.asList(1, 10), + Arrays.asList(10, 1), + Arrays.asList(1, 1, 1), + Arrays.asList(2, 2, 2), + Arrays.asList(10, 10, 10), + Arrays.asList(1, 1, 1, 1, 1, 1), + Arrays.asList(10, 10, 10, 10, 10, 
10), + Arrays.asList(1, 2, 10, 1, 2, 10) + ) ) { + + for ( int removalInterval : Arrays.asList(0, 2, 3) ) { + new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); + } + } + + return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); + } + + @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") + public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } + + /////////////////////////////////////// + // End Read State Manager Tests // + /////////////////////////////////////// + + + + /////////////////////////////////////// + // Helper methods / classes // + /////////////////////////////////////// + private static ReadProperties createTestReadProperties() { + return createTestReadProperties(null); + } + + private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { return new ReadProperties( Collections.emptyList(), new SAMFileHeader(), SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.STRICT, - null, + downsamplingMethod, new ValidationExclusion(), Collections.emptyList(), Collections.emptyList(), @@ -344,137 +513,136 @@ public class LocusIteratorByStateUnitTest extends BaseTest { (byte) -1 ); } -} -class FakeCloseableIterator implements CloseableIterator { - Iterator iterator; + private static class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; - public FakeCloseableIterator(Iterator it) { - iterator = it; + public FakeCloseableIterator(Iterator it) { + iterator = it; + } + + @Override + public void close() {} + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } } - @Override - public void close() {} + private static final class LIBS_position 
{ - @Override - public boolean hasNext() { - return iterator.hasNext(); - } + SAMRecord read; - @Override - public T next() { - return iterator.next(); - } + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; - @Override - public void remove() { - throw new UnsupportedOperationException("Don't remove!"); - } -} + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + boolean sawMop = false; -final class LIBS_position { + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } - SAMRecord read; + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } - final int numOperators; - int currentOperatorIndex = 0; - int currentPositionOnOperator = 0; - int currentReadOffset = 0; - - boolean isBeforeDeletionStart = false; - boolean isBeforeDeletedBase = false; - boolean isAfterDeletionEnd = false; - boolean isAfterDeletedBase = false; - boolean isBeforeInsertion = false; - boolean isAfterInsertion = false; - boolean isNextToSoftClip = false; - - boolean sawMop = false; - - public LIBS_position(final SAMRecord read) { - this.read = read; - numOperators = read.getCigar().numCigarElements(); - } - - public int getCurrentReadOffset() { - return Math.max(0, currentReadOffset - 1); - } - - /** - * Steps forward on the genome. Returns false when done reading the read, true otherwise. 
- */ - public boolean stepForwardOnGenome() { - if ( currentOperatorIndex == numOperators ) - return false; - - CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); - if ( currentPositionOnOperator >= curElement.getLength() ) { - if ( ++currentOperatorIndex == numOperators ) + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. + */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) return false; - curElement = read.getCigar().getCigarElement(currentOperatorIndex); - currentPositionOnOperator = 0; - } + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; - switch ( curElement.getOperator() ) { - case I: // insertion w.r.t. the reference - if ( !sawMop ) + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; break; - case S: // soft clip - currentReadOffset += curElement.getLength(); - case H: // hard clip - case P: // padding - currentOperatorIndex++; - return stepForwardOnGenome(); - case D: // deletion w.r.t. 
the reference - case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - currentPositionOnOperator++; - break; + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } - case M: - case EQ: - case X: - sawMop = true; - currentReadOffset++; - currentPositionOnOperator++; - break; - default: - throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || (!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; } - final boolean isFirstOp = currentOperatorIndex == 0; - final boolean isLastOp = currentOperatorIndex == numOperators - 1; - final 
boolean isFirstBaseOfOp = currentPositionOnOperator == 1; - final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } - isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); - isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); - isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); - isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); - isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) - || (!sawMop && curElement.getOperator() == CigarOperator.I); - isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); - isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) - || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); - - return true; - } - - private static boolean isBeforeOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isLastOp, - final boolean isLastBaseOfOp) { - return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; - } - - private static boolean isAfterOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isFirstOp, - final boolean isFirstBaseOfOp) { - return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; 
+ private static boolean isAfterOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index a65b0cb45..69907d485 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -109,12 +109,16 @@ public class TraverseActiveRegionsTest extends BaseTest { dictionary = reference.getSequenceDictionary(); genomeLocParser = new GenomeLocParser(dictionary); + // TODO: test shard boundaries + intervals = new ArrayList(); intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + intervals.add(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); @@ -126,6 +130,7 @@ public class TraverseActiveRegionsTest extends BaseTest { reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + reads.add(buildSAMRecord("end_of_chr1", "1", 249250600, 249250700)); reads.add(buildSAMRecord("simple20", "20", 10025, 
10075)); // required by LocusIteratorByState, and I prefer to list them in test case order above @@ -139,8 +144,6 @@ public class TraverseActiveRegionsTest extends BaseTest { List activeIntervals = getIsActiveIntervals(walker, intervals); // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call verifyEqualIntervals(intervals, activeIntervals); - - // TODO: more tests and edge cases } private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { @@ -171,8 +174,6 @@ public class TraverseActiveRegionsTest extends BaseTest { Collection activeRegions = getActiveRegions(walker, intervals).values(); verifyActiveRegionCoverage(intervals, activeRegions); - - // TODO: more tests and edge cases } private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { @@ -231,6 +232,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -245,6 +247,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -256,6 +259,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = 
activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -267,6 +271,19 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -278,9 +295,8 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); - - // TODO: more tests and edge cases } @Test @@ -300,6 +316,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -314,6 +331,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); 
verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -325,6 +343,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -336,6 +355,19 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -347,9 +379,8 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); - - // TODO: more tests and edge cases } @Test @@ -370,6 +401,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // 
end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -384,6 +416,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -395,6 +428,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -406,6 +440,19 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -417,9 +464,13 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + 
verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); + } - // TODO: more tests and edge cases + @Test + public void testUnmappedReads() { + // TODO } private void verifyReadNotPlaced(ActiveRegion region, String readName) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index bf1fc9e65..d9af2ea7e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; +import org.broadinstitute.sting.gatk.datasources.reads.LegacyReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; @@ -114,7 +114,7 @@ public class TraverseReadsUnitTest extends BaseTest { @Test public void testUnmappedReadCount() { SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser); - Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new LegacyReadShardBalancer()); countReadWalker.initialize(); Object accumulator = countReadWalker.reduceInit(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsLargeScaleTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsLargeScaleTest.java new file mode 100755 index 000000000..ad7ac56f9 --- /dev/null +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsLargeScaleTest.java @@ -0,0 +1,20 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.ArrayList; + +public class PrintReadsLargeScaleTest extends WalkerTest { + @Test( timeOut = 1000 * 60 * 60 * 20 ) // 60 seconds * 60 seconds / minute * 20 minutes + public void testRealignerTargetCreator() { + WalkerTestSpec spec = new WalkerTestSpec( + "-R " + b37KGReference + + " -T PrintReads" + + " -I " + evaluationDataLocation + "CEUTrio.HiSeq.WEx.b37.NA12892.clean.dedup.recal.1.bam" + + " -o /dev/null", + 0, + new ArrayList(0)); + executeTest("testPrintReadsWholeExomeChr1", spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index c32d77f82..28b176d4b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -137,7 +137,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + " -genotypeMergeOptions UNIQUIFY -L 1"), 1, - Arrays.asList("e5f0e7a80cd392172ebf5ddb06b91a00")); + Arrays.asList("58e6281df108c361e99673a501ee4749")); cvExecuteTest("threeWayWithRefs", spec, true); } diff --git a/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java index 5b052454a..47096c73a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java 
@@ -23,14 +23,14 @@ public class LegacyReservoirDownsamplerUnitTest { @Test public void testEmptyIterator() { - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); Assert.assertTrue(downsampler.isEmpty(),"Downsampler is not empty but should be."); } @Test public void testOneElementWithPoolSizeOne() { List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -42,7 +42,7 @@ public class LegacyReservoirDownsamplerUnitTest { @Test public void testOneElementWithPoolSizeGreaterThanOne() { List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -58,7 +58,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -78,7 +78,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); 
reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -99,7 +99,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(0); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(0); downsampler.addAll(reads); Assert.assertTrue(downsampler.isEmpty(),"Downsampler isn't empty but should be"); @@ -115,7 +115,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -128,7 +128,7 @@ public class LegacyReservoirDownsamplerUnitTest { public void testFillingAcrossLoci() { List reads = new ArrayList(); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); diff --git 
a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java index 6c59f1585..489adab6b 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; -import org.broadinstitute.sting.utils.SimpleTimer; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -46,7 +45,7 @@ public class InputProducerUnitTest extends BaseTest { final LinkedBlockingDeque.InputValue> readQueue = new LinkedBlockingDeque.InputValue>(queueSize); - final InputProducer ip = new InputProducer(elements.iterator(), new MultiThreadedErrorTracker(), new SimpleTimer(), readQueue); + final InputProducer ip = new InputProducer(elements.iterator(), new MultiThreadedErrorTracker(), readQueue); final ExecutorService es = Executors.newSingleThreadExecutor(); @@ -94,7 +93,7 @@ public class InputProducerUnitTest extends BaseTest { final LinkedBlockingDeque.InputValue> readQueue = new LinkedBlockingDeque.InputValue>(); - final InputProducer ip = new InputProducer(elements.iterator(), new MultiThreadedErrorTracker(), new SimpleTimer(), readQueue); + final InputProducer ip = new InputProducer(elements.iterator(), new MultiThreadedErrorTracker(), readQueue); final ExecutorService es = Executors.newSingleThreadExecutor(); es.submit(ip); diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index af2e18ad9..61e8ec0a1 100644 --- 
a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -188,17 +188,6 @@ public class NanoSchedulerUnitTest extends BaseTest { Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected. Expected at least " + test.nExpectedCallbacks() + " but saw only " + callback.callBacks); nanoScheduler.shutdown(); - - // TODO -- need to enable only in the case where there's serious time spend in - // TODO -- read /map / reduce, otherwise the "outside" timer doesn't add up - final double myTimeEstimate = timer.getElapsedTime(); - final double tolerance = 0.1; - if ( false && myTimeEstimate > 0.1 ) { - Assert.assertTrue(nanoScheduler.getTotalRuntime() > myTimeEstimate * tolerance, - "NanoScheduler said that the total runtime was " + nanoScheduler.getTotalRuntime() - + " but the overall test time was " + myTimeEstimate + ", beyond our tolerance factor of " - + tolerance); - } } @Test(enabled = true && ! DEBUG, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) @@ -243,7 +232,7 @@ public class NanoSchedulerUnitTest extends BaseTest { for ( final int nThreads : Arrays.asList(8) ) { for ( final boolean addDelays : Arrays.asList(true, false) ) { final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(bufSize, nThreads, 1, 1000000, false); - final int maxN = addDelays ? 10000 : 100000; + final int maxN = addDelays ? 
1000 : 10000; for ( int nElementsBeforeError = 0; nElementsBeforeError < maxN; nElementsBeforeError += Math.max(nElementsBeforeError / 10, 1) ) { tests.add(new Object[]{nElementsBeforeError, test, addDelays}); } @@ -259,17 +248,22 @@ public class NanoSchedulerUnitTest extends BaseTest { executeTestErrorThrowingInput(10, new NullPointerException(), exampleTest, false); } - @Test(enabled = true, expectedExceptions = ReviewedStingException.class, timeOut = 10000) + @Test(enabled = true, expectedExceptions = ReviewedStingException.class, timeOut = 1000) public void testInputErrorIsThrown_RSE() throws InterruptedException { executeTestErrorThrowingInput(10, new ReviewedStingException("test"), exampleTest, false); } - @Test(enabled = true, expectedExceptions = NullPointerException.class, dataProvider = "NanoSchedulerInputExceptionTest", timeOut = 10000, invocationCount = 1) - public void testInputErrorDoesntDeadlock(final int nElementsBeforeError, final NanoSchedulerBasicTest test, final boolean addDelays ) throws InterruptedException { + @Test(enabled = true, expectedExceptions = NullPointerException.class, dataProvider = "NanoSchedulerInputExceptionTest", timeOut = 1000, invocationCount = 1) + public void testInputRuntimeExceptionDoesntDeadlock(final int nElementsBeforeError, final NanoSchedulerBasicTest test, final boolean addDelays ) throws InterruptedException { executeTestErrorThrowingInput(nElementsBeforeError, new NullPointerException(), test, addDelays); } - private void executeTestErrorThrowingInput(final int nElementsBeforeError, final RuntimeException ex, final NanoSchedulerBasicTest test, final boolean addDelays) { + @Test(enabled = true, expectedExceptions = ReviewedStingException.class, dataProvider = "NanoSchedulerInputExceptionTest", timeOut = 1000, invocationCount = 1) + public void testInputErrorDoesntDeadlock(final int nElementsBeforeError, final NanoSchedulerBasicTest test, final boolean addDelays ) throws InterruptedException { + 
executeTestErrorThrowingInput(nElementsBeforeError, new Error(), test, addDelays); + } + + private void executeTestErrorThrowingInput(final int nElementsBeforeError, final Throwable ex, final NanoSchedulerBasicTest test, final boolean addDelays) { logger.warn("executeTestErrorThrowingInput " + nElementsBeforeError + " ex=" + ex + " test=" + test + " addInputDelays=" + addDelays); final NanoScheduler nanoScheduler = test.makeScheduler(); nanoScheduler.execute(new ErrorThrowingIterator(nElementsBeforeError, ex, addDelays), test.makeMap(), test.initReduce(), test.makeReduce()); @@ -279,9 +273,9 @@ public class NanoSchedulerUnitTest extends BaseTest { final int nElementsBeforeError; final boolean addDelays; int i = 0; - final RuntimeException ex; + final Throwable ex; - private ErrorThrowingIterator(final int nElementsBeforeError, RuntimeException ex, boolean addDelays) { + private ErrorThrowingIterator(final int nElementsBeforeError, Throwable ex, boolean addDelays) { this.nElementsBeforeError = nElementsBeforeError; this.ex = ex; this.addDelays = addDelays; @@ -290,7 +284,12 @@ public class NanoSchedulerUnitTest extends BaseTest { @Override public boolean hasNext() { return true; } @Override public Integer next() { if ( i++ > nElementsBeforeError ) { - throw ex; + if ( ex instanceof Error ) + throw (Error)ex; + else if ( ex instanceof RuntimeException ) + throw (RuntimeException)ex; + else + throw new RuntimeException("Bad exception " + ex); } else if ( addDelays ) { maybeDelayMe(i); return i; diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java index 39133d1ed..6c17aa78d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; import 
org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; -import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -93,7 +92,7 @@ public class ReducerUnitTest extends BaseTest { final List>> jobGroups = Utils.groupList(allJobs, groupSize); final ReduceSumTest reduce = new ReduceSumTest(); - final Reducer reducer = new Reducer(reduce, new MultiThreadedErrorTracker(), new SimpleTimer(), 0); + final Reducer reducer = new Reducer(reduce, new MultiThreadedErrorTracker(), 0); final TestWaitingForFinalReduce waitingThread = new TestWaitingForFinalReduce(reducer, expectedSum(allJobs)); final ExecutorService es = Executors.newSingleThreadExecutor(); @@ -155,7 +154,7 @@ public class ReducerUnitTest extends BaseTest { private void runSettingJobIDTwice() throws Exception { final PriorityBlockingQueue> mapResultsQueue = new PriorityBlockingQueue>(); - final Reducer reducer = new Reducer(new ReduceSumTest(), new MultiThreadedErrorTracker(), new SimpleTimer(), 0); + final Reducer reducer = new Reducer(new ReduceSumTest(), new MultiThreadedErrorTracker(), 0); reducer.setTotalJobCount(10); reducer.setTotalJobCount(15);