diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index a61614481..94f6ff649 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -56,11 +56,14 @@ public class AlleleBiasedDownsamplingUtils { for ( int i = 0; i < 4; i++ ) alleleStratifiedElements[i] = new ArrayList(); + // keep all of the reduced reads + final ArrayList reducedReadPileups = new ArrayList(); + // start by stratifying the reads by the alleles they represent at this position for( final PileupElement pe : pileup ) { - // abort if we have a reduced read - we do not want to remove it! + // we do not want to remove a reduced read if ( pe.getRead().isReducedRead() ) - return pileup; + reducedReadPileups.add(pe); final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); if ( baseIndex != -1 ) @@ -76,6 +79,7 @@ public class AlleleBiasedDownsamplingUtils { return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); } }); + elementsToKeep.addAll(reducedReadPileups); // make a listing of allele counts final int[] alleleCounts = new int[4]; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 629a27c48..2061c5364 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -85,7 +85,7 @@ import java.util.*; */ @DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@PartitionBy(PartitionType.INTERVAL) +@PartitionBy(PartitionType.CONTIG) @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) public class ReduceReads extends ReadWalker, ReduceReadsStash> { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java index e9ed6b153..0a3512aa6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java @@ -240,8 +240,8 @@ public class AFCalcPerformanceTest { if ( a.isNonReference() ) { final String warningmeMLE = call.originalCall.getAlleleCountAtMLE(a) != result.getAlleleCountAtMLE(a) ? " DANGER-MLE-DIFFERENT" : ""; logger.info("\t\t MLE " + a + ": " + call.originalCall.getAlleleCountAtMLE(a) + " vs " + result.getAlleleCountAtMLE(a) + warningmeMLE); - final String warningmePost = call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) == 0 && result.getLog10PosteriorOfAFGt0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : ""; - logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFGt0ForAllele(a) + warningmePost); + final String warningmePost = call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) == 0 && result.getLog10PosteriorOfAFEq0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : ""; + logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFEq0ForAllele(a) + warningmePost); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index fee6c86f8..4d81d0010 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -33,7 +33,6 @@ import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -449,10 +448,10 @@ public class GenotypingEngine { final ArrayList undeterminedHaplotypes = new ArrayList(haplotypes.size()); for( final Haplotype h : haplotypes ) { - if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype - alleleMapper.get(refAllele).add(h); - } else if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) { + if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) { alleleMapper.get(h.getArtificialAllele()).add(h); + } else if( h.getEventMap().get(loc) == null ) { // no event at this location so let's investigate later + undeterminedHaplotypes.add(h); } else { boolean haplotypeIsDetermined = false; for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 73bc8fba6..f26194e00 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -80,11 +80,11 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","dd568dc30be90135a3a8957a45a7321c"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b"); } @Test(enabled = true) public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "bf793c43b635a931207170be8035b288"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 7459d131b..f2b2dfb7d 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("d20c7a143b899f0239bf64b652ad3edb")); + Arrays.asList("97df6c2a8d390d43b9bdf56c979d9b09")); executeTest("test Multiple SNP alleles", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("fb204e821a24d03bd3a671b6e01c449a")); + Arrays.asList("935ee705ffe8cc6bf1d9efcceea271c8")); executeTest("test mismatched PLs", spec); } @@ -437,7 +437,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, - Arrays.asList("32f18ba50406cd8c8069ba07f2f89558")); + Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6")); executeTest("test calling on reads with Ns in CIGAR", spec); } @@ -451,13 +451,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("c1077662411164182c5f75478344f83d")); + Arrays.asList("092e42a712afb660ec79ff11c55933e2")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "dee6590e3b7079890bc3a9cb372c297e"); + testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f"); } @Test diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index 96e055e92..016926e12 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -83,8 +83,8 @@ public class AFCalcResultUnitTest extends BaseTest { List tests = new ArrayList(); final List pValues = new LinkedList(); - for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999) ) - for ( final double espilon : Arrays.asList(-1e-5, 0.0, 1e-5) ) + for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999, 1 - 1e-4, 1 - 1e-5, 1 - 1e-6) ) + for ( final double espilon : Arrays.asList(-1e-7, 0.0, 1e-7) ) pValues.add(p + espilon); for ( final double pNonRef : pValues ) { @@ -106,7 +106,7 @@ public class AFCalcResultUnitTest extends BaseTest { alleles, MathUtils.normalizeFromLog10(new double[]{1 - pNonRef, pNonRef}, true, false), log10Even, - Collections.singletonMap(C, Math.log10(pNonRef))); + Collections.singletonMap(C, Math.log10(1 - pNonRef))); } @Test(enabled = true, dataProvider = "TestIsPolymorphic") diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 2d346e548..7ee909fe0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -681,7 +681,7 @@ public class AFCalcUnitTest extends BaseTest { // must be getCalledChrCount because we cannot ensure that the VC made has our desired ACs Assert.assertEquals(result.getAlleleCountAtMLE(alt), vc.getCalledChrCount(alt)); - Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFGt0ForAllele(alt)); + Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFEq0ForAllele(alt)); } } } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java index 391c99990..663471106 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java @@ -148,7 +148,7 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { for ( int i = 0; i < log10LAlleles.size(); i++ ) { final double log10LAllele1 = log10LAlleles.get(i); final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true); - final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, 0.0)); + final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, -10000.0)); originalPriors.add(result1); pNonRefN.add(log10pNonRef*(i+1)); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index e9c1ec605..a80137c27 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -33,7 +33,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "e2b3bf420c45c677956a2e4a56d75ea2"); + "fe84caa79f59ecbd98fcbcd5b30ab164"); } private void HCTestComplexVariants(String bam, String args, String md5) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index b7000e0ee..1187039bb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -445,13 +445,17 @@ public class GenomeAnalysisEngine { protected DownsamplingMethod getDownsamplingMethod() { GATKArgumentCollection argCollection = this.getArguments(); - boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling; + + // Legacy downsampler can only be selected via the command line, not via walker annotations + boolean useLegacyDownsampler = argCollection.useLegacyDownsampler; DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); - DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling); - DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling); + DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useLegacyDownsampler); + DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useLegacyDownsampler); - return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod); + DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod); + method.checkCompatibilityWithWalker(walker); + return method; } protected void setDownsamplingMethod(DownsamplingMethod method) { @@ -580,9 +584,9 @@ public class GenomeAnalysisEngine { throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); } - // Use the experimental ReadShardBalancer if experimental downsampling is enabled - ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useExperimentalDownsampling ? - new ExperimentalReadShardBalancer() : + // Use the legacy ReadShardBalancer if legacy downsampling is enabled + ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useLegacyDownsampler ? + new LegacyReadShardBalancer() : new ReadShardBalancer(); if(intervals == null) diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index 28b5f918d..1a9162e51 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -305,11 +305,23 @@ public class WalkerManager extends PluginManager { * Gets the type of downsampling method requested by the walker. If an alternative * downsampling method is specified on the command-line, the command-line version will * be used instead. - * @param walkerClass The class of the walker to interrogate. - * @param useExperimentalDownsampling If true, use the experimental downsampling implementation + * @param walker The walker to interrogate. + * @param useLegacyDownsampler If true, use the legacy downsampling implementation * @return The downsampling method, as specified by the walker. Null if none exists. */ - public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useExperimentalDownsampling) { + public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useLegacyDownsampler) { + return getDownsamplingMethod(walker.getClass(), useLegacyDownsampler); + } + + /** + * Gets the type of downsampling method requested by the walker. If an alternative + * downsampling method is specified on the command-line, the command-line version will + * be used instead. + * @param walkerClass The class of the walker to interrogate. + * @param useLegacyDownsampler If true, use the legacy downsampling implementation + * @return The downsampling method, as specified by the walker. Null if none exists. + */ + public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useLegacyDownsampler) { DownsamplingMethod downsamplingMethod = null; if( walkerClass.isAnnotationPresent(Downsample.class) ) { @@ -317,7 +329,7 @@ public class WalkerManager extends PluginManager { DownsampleType type = downsampleParameters.by(); Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; - downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling); + downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useLegacyDownsampler); } return downsamplingMethod; @@ -331,18 +343,6 @@ public class WalkerManager extends PluginManager { return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime(); } - /** - * Gets the type of downsampling method requested by the walker. If an alternative - * downsampling method is specified on the command-line, the command-line version will - * be used instead. - * @param walker The walker to interrogate. - * @param useExperimentalDownsampling If true, use the experimental downsampling implementation - * @return The downsampling method, as specified by the walker. Null if none exists. - */ - public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) { - return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling); - } - /** * Create a name for this type of walker. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index d0f3e91e0..d9c7c9008 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -162,12 +162,11 @@ public class GATKArgumentCollection { @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) public Double downsampleFraction = null; - @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false) + @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus. For non-locus-based traversals (eg., ReadWalkers), this sets the maximum number of reads at each alignment start position.", required = false) public Integer downsampleCoverage = null; - @Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false) - @Hidden - public boolean enableExperimentalDownsampling = false; + @Argument(fullName = "use_legacy_downsampler", shortName = "use_legacy_downsampler", doc = "Use the legacy downsampling implementation instead of the newer, less-tested implementation", required = false) + public boolean useLegacyDownsampler = false; /** * Gets the downsampling method explicitly specified by the user. If the user didn't specify @@ -178,7 +177,7 @@ public class GATKArgumentCollection { if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) return null; - return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling); + return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, useLegacyDownsampler); } /** @@ -192,7 +191,7 @@ public class GATKArgumentCollection { downsamplingType = method.type; downsampleCoverage = method.toCoverage; downsampleFraction = method.toFraction; - enableExperimentalDownsampling = method.useExperimentalDownsampling; + useLegacyDownsampler = method.useLegacyDownsampler; } // -------------------------------------------------------------------------------------------------------------- diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index cd3403f2f..c12bce208 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -136,11 +136,12 @@ public abstract class LocusView extends LocusIterator implements View { // Cache the current and apply filtering. AlignmentContext current = nextLocus; - // The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling: - if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling && - sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) { + // The old ALL_READS downsampling implementation -- use only if legacy downsampling was requested: + if ( sourceInfo.getDownsamplingMethod().useLegacyDownsampler && + sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && + sourceInfo.getDownsamplingMethod().toCoverage != null ) { - current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage ); + current.downsampleToCoverage(sourceInfo.getDownsamplingMethod().toCoverage); } // Indicate that the next operation will need to advance. diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 8ee7e0439..cb33c5ab8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -134,12 +134,11 @@ public class BAMScheduler implements Iterator { // Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling // TODO: clean this up once the experimental downsampling engine fork collapses - if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useExperimentalDownsampling ) { - currentPosition = dataSource.getInitialReaderPositions(); + if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useLegacyDownsampler ) { + currentPosition = dataSource.getCurrentPosition(); } else { - currentPosition = dataSource.getCurrentPosition(); - + currentPosition = dataSource.getInitialReaderPositions(); } for(SAMReaderID reader: dataSource.getReaderIDs()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java deleted file mode 100644 index 0440c7eae..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/** - * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards. - * - * When processing FilePointers, our strategy is to aggregate all FilePointers for each contig - * together into one monolithic FilePointer, create one persistent set of read iterators over - * that monolithic FilePointer, and repeatedly use that persistent set of read iterators to - * fill read shards with reads. - * - * This strategy has several important advantages: - * - * 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole - * contig will have regions that overlap with other FilePointers on the same contig, due - * to the limited granularity of BAM index data. By creating only one FilePointer per contig, - * we avoid having to track how much of each file region we've visited (as we did in the - * former implementation), we avoid expensive non-sequential access patterns in the files, - * and we avoid having to repeatedly re-create our iterator chain for every small region - * of interest. - * - * 2. We avoid boundary issues with the engine-level downsampling. Since we create a single - * persistent set of read iterators (which include the downsampling iterator(s)) per contig, - * the downsampling process is never interrupted by FilePointer or Shard boundaries, and never - * loses crucial state information while downsampling within a contig. - * - * TODO: There is also at least one important disadvantage: - * - * 1. We load more BAM index data into memory at once, and this work is done upfront before processing - * the next contig, creating a delay before traversal of each contig. This delay may be - * compensated for by the gains listed in #1 above, and we may be no worse off overall in - * terms of total runtime, but we need to verify this empirically. - * - * @author David Roazen - */ -public class ExperimentalReadShardBalancer extends ShardBalancer { - - private static Logger logger = Logger.getLogger(ExperimentalReadShardBalancer.class); - - /** - * Convert iterators of file pointers into balanced iterators of shards. - * @return An iterator over balanced shards. - */ - public Iterator iterator() { - return new Iterator() { - /** - * The cached shard to be returned next. Prefetched in the peekable iterator style. - */ - private Shard nextShard = null; - - /** - * The file pointer currently being processed. - */ - private FilePointer currentContigFilePointer = null; - - /** - * Iterator over the reads from the current contig's file pointer. The same iterator will be - * used to fill all shards associated with a given file pointer - */ - private PeekableIterator currentContigReadsIterator = null; - - /** - * How many FilePointers have we pulled from the filePointers iterator? - */ - private int totalFilePointersConsumed = 0; - - /** - * Have we encountered a monolithic FilePointer? - */ - private boolean encounteredMonolithicFilePointer = false; - - - { - createNextContigFilePointer(); - advance(); - } - - public boolean hasNext() { - return nextShard != null; - } - - public Shard next() { - if ( ! hasNext() ) - throw new NoSuchElementException("No next read shard available"); - Shard currentShard = nextShard; - advance(); - return currentShard; - } - - private void advance() { - nextShard = null; - - // May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away - while ( nextShard == null && currentContigFilePointer != null ) { - - // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one): - if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) { - - // Close the old, exhausted chain of iterators to release resources - currentContigReadsIterator.close(); - - // Advance to the FilePointer for the next contig - createNextContigFilePointer(); - - // We'll need to create a fresh iterator for this file pointer when we create the first - // shard for it below. - currentContigReadsIterator = null; - } - - // At this point our currentContigReadsIterator may be null or non-null depending on whether or not - // this is our first shard for this file pointer. - if ( currentContigFilePointer != null ) { - Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped); - - // Create a new reads iterator only when we've just advanced to the file pointer for the next - // contig. It's essential that the iterators persist across all shards that share the same contig - // to allow the downsampling to work properly. - if ( currentContigReadsIterator == null ) { - currentContigReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard)); - } - - if ( currentContigReadsIterator.hasNext() ) { - shard.fill(currentContigReadsIterator); - nextShard = shard; - } - } - } - } - - /** - * Aggregate all FilePointers for the next contig together into one monolithic FilePointer - * to avoid boundary issues with visiting the same file regions more than once (since more - * granular FilePointers will have regions that overlap with other nearby FilePointers due - * to the nature of BAM indices). - * - * By creating one persistent set of iterators per contig we also avoid boundary artifacts - * in the engine-level downsampling. - * - * TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for - * TODO: read traversals, as there's little point in the BAMSchedule emitting extremely - * TODO: granular FilePointers if we're just going to union them. The BAMSchedule should - * TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for - * TODO: locus traversals). - */ - private void createNextContigFilePointer() { - currentContigFilePointer = null; - List nextContigFilePointers = new ArrayList(); - - logger.info("Loading BAM index data for next contig"); - - while ( filePointers.hasNext() ) { - - // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that - // it is the ONLY FilePointer we ever encounter - if ( encounteredMonolithicFilePointer ) { - throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer"); - } - if ( filePointers.peek().isMonolithic() ) { - if ( totalFilePointersConsumed > 0 ) { - throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer"); - } - encounteredMonolithicFilePointer = true; - logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek())); - } - - // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the - // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge - if ( nextContigFilePointers.isEmpty() || - (! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped && - nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) || - (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { - - nextContigFilePointers.add(filePointers.next()); - totalFilePointersConsumed++; - } - else { - break; // next FilePointer is on a different contig or has different mapped/unmapped status, - // save it for next time - } - } - - if ( ! nextContigFilePointers.isEmpty() ) { - currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser); - } - - if ( currentContigFilePointer != null ) { - logger.info("Done loading BAM index data for next contig"); - logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer)); - } - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); - } - }; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java new file mode 100644 index 000000000..f5b4fba8e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.SAMFileSpan; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; + +/** + * Divide up large file pointers containing reads into more manageable subcomponents. + * + * TODO: delete this class once the experimental downsampling engine fork collapses + */ +public class LegacyReadShardBalancer extends ShardBalancer { + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. + */ + public Iterator iterator() { + return new Iterator() { + /** + * The cached shard to be returned next. Prefetched in the peekable iterator style. + */ + private Shard nextShard = null; + + /** + * The file pointer currently being processed. + */ + private FilePointer currentFilePointer; + + /** + * Ending position of the last shard in the file. + */ + private Map position = readsDataSource.getCurrentPosition(); + + { + if(filePointers.hasNext()) + currentFilePointer = filePointers.next(); + advance(); + } + + public boolean hasNext() { + return nextShard != null; + } + + public Shard next() { + if(!hasNext()) + throw new NoSuchElementException("No next read shard available"); + Shard currentShard = nextShard; + advance(); + return currentShard; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + + private void advance() { + Map shardPosition; + nextShard = null; + + Map selectedReaders = new HashMap(); + while(selectedReaders.size() == 0 && currentFilePointer != null) { + shardPosition = currentFilePointer.fileSpans; + + for(SAMReaderID id: shardPosition.keySet()) { + SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); + selectedReaders.put(id,fileSpan); + } + + if(!isEmpty(selectedReaders)) { + Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); + readsDataSource.fillShard(shard); + + if(!shard.isBufferEmpty()) { + nextShard = shard; + break; + } + } + + selectedReaders.clear(); + currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; + } + + position = readsDataSource.getCurrentPosition(); + } + + /** + * Detects whether the list of file spans contain any read data. + * @param selectedSpans Mapping of readers to file spans. + * @return True if file spans are completely empty; false otherwise. + */ + private boolean isEmpty(Map selectedSpans) { + for(SAMFileSpan fileSpan: selectedSpans.values()) { + if(!fileSpan.isEmpty()) + return false; + } + return true; + } + }; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java index 18fafb95d..8cee535b0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -24,20 +24,49 @@ package org.broadinstitute.sting.gatk.datasources.reads; -import net.sf.samtools.GATKBAMFileSpan; -import net.sf.samtools.SAMFileSpan; +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.NoSuchElementException; +import java.util.*; /** - * Divide up large file pointers containing reads into more manageable subcomponents. + * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards. * - * TODO: delete this class once the experimental downsampling engine fork collapses + * When processing FilePointers, our strategy is to aggregate all FilePointers for each contig + * together into one monolithic FilePointer, create one persistent set of read iterators over + * that monolithic FilePointer, and repeatedly use that persistent set of read iterators to + * fill read shards with reads. + * + * This strategy has several important advantages: + * + * 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole + * contig will have regions that overlap with other FilePointers on the same contig, due + * to the limited granularity of BAM index data. By creating only one FilePointer per contig, + * we avoid having to track how much of each file region we've visited (as we did in the + * former implementation), we avoid expensive non-sequential access patterns in the files, + * and we avoid having to repeatedly re-create our iterator chain for every small region + * of interest. + * + * 2. We avoid boundary issues with the engine-level downsampling. Since we create a single + * persistent set of read iterators (which include the downsampling iterator(s)) per contig, + * the downsampling process is never interrupted by FilePointer or Shard boundaries, and never + * loses crucial state information while downsampling within a contig. + * + * TODO: There is also at least one important disadvantage: + * + * 1. We load more BAM index data into memory at once, and this work is done upfront before processing + * the next contig, creating a delay before traversal of each contig. This delay may be + * compensated for by the gains listed in #1 above, and we may be no worse off overall in + * terms of total runtime, but we need to verify this empirically. + * + * @author David Roazen */ public class ReadShardBalancer extends ShardBalancer { + + private static Logger logger = Logger.getLogger(ReadShardBalancer.class); + /** * Convert iterators of file pointers into balanced iterators of shards. * @return An iterator over balanced shards. @@ -52,16 +81,27 @@ public class ReadShardBalancer extends ShardBalancer { /** * The file pointer currently being processed. */ - private FilePointer currentFilePointer; + private FilePointer currentContigFilePointer = null; /** - * Ending position of the last shard in the file. + * Iterator over the reads from the current contig's file pointer. The same iterator will be + * used to fill all shards associated with a given file pointer */ - private Map position = readsDataSource.getCurrentPosition(); + private PeekableIterator currentContigReadsIterator = null; + + /** + * How many FilePointers have we pulled from the filePointers iterator? + */ + private int totalFilePointersConsumed = 0; + + /** + * Have we encountered a monolithic FilePointer? + */ + private boolean encounteredMonolithicFilePointer = false; + { - if(filePointers.hasNext()) - currentFilePointer = filePointers.next(); + createNextContigFilePointer(); advance(); } @@ -70,58 +110,117 @@ public class ReadShardBalancer extends ShardBalancer { } public Shard next() { - if(!hasNext()) + if ( ! hasNext() ) throw new NoSuchElementException("No next read shard available"); Shard currentShard = nextShard; advance(); return currentShard; } - public void remove() { - throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); - } - private void advance() { - Map shardPosition; nextShard = null; - Map selectedReaders = new HashMap(); - while(selectedReaders.size() == 0 && currentFilePointer != null) { - shardPosition = currentFilePointer.fileSpans; + // May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away + while ( nextShard == null && currentContigFilePointer != null ) { - for(SAMReaderID id: shardPosition.keySet()) { - SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); - selectedReaders.put(id,fileSpan); + // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one): + if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) { + + // Close the old, exhausted chain of iterators to release resources + currentContigReadsIterator.close(); + + // Advance to the FilePointer for the next contig + createNextContigFilePointer(); + + // We'll need to create a fresh iterator for this file pointer when we create the first + // shard for it below. + currentContigReadsIterator = null; } - if(!isEmpty(selectedReaders)) { - Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); - readsDataSource.fillShard(shard); + // At this point our currentContigReadsIterator may be null or non-null depending on whether or not + // this is our first shard for this file pointer. + if ( currentContigFilePointer != null ) { + Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped); - if(!shard.isBufferEmpty()) { + // Create a new reads iterator only when we've just advanced to the file pointer for the next + // contig. It's essential that the iterators persist across all shards that share the same contig + // to allow the downsampling to work properly. + if ( currentContigReadsIterator == null ) { + currentContigReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard)); + } + + if ( currentContigReadsIterator.hasNext() ) { + shard.fill(currentContigReadsIterator); nextShard = shard; - break; } } - - selectedReaders.clear(); - currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; } - - position = readsDataSource.getCurrentPosition(); } /** - * Detects whether the list of file spans contain any read data. - * @param selectedSpans Mapping of readers to file spans. - * @return True if file spans are completely empty; false otherwise. + * Aggregate all FilePointers for the next contig together into one monolithic FilePointer + * to avoid boundary issues with visiting the same file regions more than once (since more + * granular FilePointers will have regions that overlap with other nearby FilePointers due + * to the nature of BAM indices). + * + * By creating one persistent set of iterators per contig we also avoid boundary artifacts + * in the engine-level downsampling. + * + * TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for + * TODO: read traversals, as there's little point in the BAMSchedule emitting extremely + * TODO: granular FilePointers if we're just going to union them. The BAMSchedule should + * TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for + * TODO: locus traversals). */ - private boolean isEmpty(Map selectedSpans) { - for(SAMFileSpan fileSpan: selectedSpans.values()) { - if(!fileSpan.isEmpty()) - return false; + private void createNextContigFilePointer() { + currentContigFilePointer = null; + List nextContigFilePointers = new ArrayList(); + + logger.info("Loading BAM index data for next contig"); + + while ( filePointers.hasNext() ) { + + // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that + // it is the ONLY FilePointer we ever encounter + if ( encounteredMonolithicFilePointer ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer"); + } + if ( filePointers.peek().isMonolithic() ) { + if ( totalFilePointersConsumed > 0 ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer"); + } + encounteredMonolithicFilePointer = true; + logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek())); + } + + // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the + // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge + if ( nextContigFilePointers.isEmpty() || + (! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped && + nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) || + (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { + + nextContigFilePointers.add(filePointers.next()); + totalFilePointersConsumed++; + } + else { + break; // next FilePointer is on a different contig or has different mapped/unmapped status, + // save it for next time + } } - return true; + + if ( ! nextContigFilePointers.isEmpty() ) { + currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser); + } + + if ( currentContigFilePointer != null ) { + logger.info("Done loading BAM index data for next contig"); + logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer)); + } + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); } }; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 88de3ac9b..e99814278 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -466,7 +466,7 @@ public class SAMDataSource { /** * Legacy method to fill the given buffering shard with reads. * - * Shard.fill() is used instead of this method when experimental downsampling is enabled + * Shard.fill() is used instead of this method unless legacy downsampling is enabled * * TODO: delete this method once the experimental downsampling engine fork collapses * @@ -638,7 +638,8 @@ public class SAMDataSource { readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), readProperties.getSupplementalFilters(), readProperties.getReadTransformers(), - readProperties.defaultBaseQualities()); + readProperties.defaultBaseQualities(), + shard instanceof LocusShard); } private class BAMCodecIterator implements CloseableIterator { @@ -695,6 +696,7 @@ public class SAMDataSource { * @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this. * @param supplementalFilters additional filters to apply to the reads. * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. + * @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null. */ protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics, @@ -705,7 +707,8 @@ public class SAMDataSource { Boolean noValidationOfReadOrder, Collection supplementalFilters, List readTransformers, - byte defaultBaseQualities) { + byte defaultBaseQualities, + boolean isLocusBasedTraversal ) { // ************************************************************************************************ // // * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * // @@ -714,12 +717,26 @@ public class SAMDataSource { wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); - if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) { - wrappedIterator = applyDownsamplingIterator(wrappedIterator); + // If we're using the new downsampling implementation, apply downsampling iterators at this + // point in the read stream for most (but not all) cases + if ( ! readProperties.getDownsamplingMethod().useLegacyDownsampler ) { + + // For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers + // will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding + // splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling + // of individual reads. + boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal && + readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readProperties.getDownsamplingMethod().toCoverage != null; + + if ( ! assumeDownstreamLIBSDownsampling ) { + wrappedIterator = applyDownsamplingIterator(wrappedIterator); + } } - // Use the old fractional downsampler only if we're not using experimental downsampling: - if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null ) + // Use the old fractional downsampler only if we're using legacy downsampling: + // TODO: remove this statement (and associated classes) once the downsampling engine fork collapses + if ( readProperties.getDownsamplingMethod().useLegacyDownsampler && downsamplingFraction != null ) wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction); // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, @@ -741,19 +758,37 @@ public class SAMDataSource { } protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) { - if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { - ReadsDownsamplerFactory downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ? - new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage) : - new FractionalDownsamplerFactory(readProperties.getDownsamplingMethod().toFraction); - - return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory); + if ( readProperties.getDownsamplingMethod() == null || + readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) { + return wrappedIterator; } - else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { - ReadsDownsampler downsampler = readProperties.getDownsamplingMethod().toCoverage != null ? - new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage) : - new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction); - return new DownsamplingReadsIterator(wrappedIterator, downsampler); + if ( readProperties.getDownsamplingMethod().toFraction != null ) { + + // If we're downsampling to a fraction of reads, there's no point in paying the cost of + // splitting/re-assembling the read stream by sample to run the FractionalDownsampler on + // reads from each sample separately, since the result would be the same as running the + // FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator + // rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling + // was requested. + + return new DownsamplingReadsIterator(wrappedIterator, + new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction)); + } + else if ( readProperties.getDownsamplingMethod().toCoverage != null ) { + + // If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling + // the read stream to run the downsampler on the reads for each individual sample separately if + // BY_SAMPLE downsampling was requested. + + if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { + return new PerSampleDownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage)); + } + else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { + return new DownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage)); + } } return wrappedIterator; diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java index ae1d98ce0..b3f636fd6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java @@ -50,9 +50,9 @@ public class DownsamplingMethod { public final Double toFraction; /** - * Use the new experimental downsampling? + * Use the legacy downsampling implementation instead of the newer implementation? */ - public final boolean useExperimentalDownsampling; + public final boolean useLegacyDownsampler; /** * Expresses no downsampling applied at all. @@ -69,11 +69,11 @@ public class DownsamplingMethod { */ public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000; - public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) { + public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useLegacyDownsampler ) { this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE; this.toCoverage = toCoverage; this.toFraction = toFraction; - this.useExperimentalDownsampling = useExperimentalDownsampling; + this.useLegacyDownsampler = useLegacyDownsampler; if ( type == DownsampleType.NONE ) { toCoverage = null; @@ -101,19 +101,19 @@ public class DownsamplingMethod { if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); } + } - // Some restrictions only exist for the old downsampling implementation: - if ( ! useExperimentalDownsampling ) { - // By sample downsampling does not work with a fraction of reads in the old downsampling implementation - if( type == DownsampleType.BY_SAMPLE && toFraction != null ) - throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method"); + public void checkCompatibilityWithWalker( Walker walker ) { + boolean isLocusTraversal = walker instanceof LocusWalker || walker instanceof ActiveRegionWalker; + + if ( ! isLocusTraversal && useLegacyDownsampler && toCoverage != null ) { + throw new UserException.CommandLineException("Downsampling to coverage for read-based traversals (eg., ReadWalkers) is not supported in the legacy downsampling implementation. " + + "The newer downsampling implementation does not have this limitation."); } - // Some restrictions only exist for the new downsampling implementation: - if ( useExperimentalDownsampling ) { - if ( type == DownsampleType.ALL_READS && toCoverage != null ) { - throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation"); - } + if ( isLocusTraversal && ! useLegacyDownsampler && type == DownsampleType.ALL_READS && toCoverage != null ) { + throw new UserException.CommandLineException("Downsampling to coverage with the ALL_READS method for locus-based traversals (eg., LocusWalkers) is not yet supported in the new downsampling implementation (though it is supported for ReadWalkers). " + + "You can run with --use_legacy_downsampler for a broken and poorly-maintained implementation of ALL_READS to-coverage downsampling, but this is not recommended."); } } @@ -124,30 +124,34 @@ public class DownsamplingMethod { builder.append("No downsampling"); } else { - builder.append(String.format("Method: %s ", type)); + builder.append(String.format("Method: %s, ", type)); if ( toCoverage != null ) { - builder.append(String.format("Target Coverage: %d ", toCoverage)); + builder.append(String.format("Target Coverage: %d, ", toCoverage)); } else { - builder.append(String.format("Target Fraction: %.2f ", toFraction)); + builder.append(String.format("Target Fraction: %.2f, ", toFraction)); } - if ( useExperimentalDownsampling ) { - builder.append("Using Experimental Downsampling"); + if ( useLegacyDownsampler ) { + builder.append("Using the legacy downsampling implementation"); + } + else { + builder.append("Using the new downsampling implementation"); } } return builder.toString(); } - public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) { + public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useLegacyDownsampler ) { if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) { return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE, - null, useExperimentalDownsampling); + null, useLegacyDownsampler); } else { - return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling); + // Downsampling is off by default for non-locus-based traversals + return new DownsamplingMethod(DownsampleType.NONE, null, null, useLegacyDownsampler); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java new file mode 100644 index 000000000..008ffde3b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Pass-Through Downsampler: Implementation of the ReadsDownsampler interface that does no + * downsampling whatsoever, and instead simply "passes-through" all the reads it's given. + * Useful for situations where you want to disable downsampling, but still need to use + * the downsampler interface. + * + * @author David Roazen + */ +public class PassThroughDownsampler implements ReadsDownsampler { + + private ArrayList selectedReads; + + public PassThroughDownsampler() { + clear(); + } + + public void submit( T newRead ) { + // All reads pass-through, no reads get downsampled + selectedReads.add(newRead); + } + + public void submit( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasFinalizedItems() { + return selectedReads.size() > 0; + } + + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed + List downsampledItems = selectedReads; + clear(); + return downsampledItems; + } + + public boolean hasPendingItems() { + return false; + } + + public T peekFinalized() { + return selectedReads.isEmpty() ? null : selectedReads.get(0); + } + + public T peekPending() { + return null; + } + + public int getNumberOfDiscardedItems() { + return 0; + } + + public void signalEndOfInput() { + // NO-OP + } + + public void clear() { + selectedReads = new ArrayList(); + } + + public void reset() { + // NO-OP + } + + public boolean requiresCoordinateSortOrder() { + return false; + } + + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index 6c0dc9769..735a4dce5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -4,9 +4,9 @@ import net.sf.picard.util.PeekableIterator; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -83,17 +83,18 @@ public class WindowMaker implements Iterable, I this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - // Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested: - this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ? - new PeekableIterator(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames)) + // Use the legacy version of LocusIteratorByState if legacy downsampling was requested: + this.sourceIterator = sourceInfo.getDownsamplingMethod().useLegacyDownsampler ? + new PeekableIterator(new LegacyLocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames)) : - new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); + new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames)); + this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; } public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals ) { - this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + this(shard, genomeLocParser, iterator, intervals, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); } public Iterator iterator() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java similarity index 61% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java index 557cbd009..5c833de4c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java @@ -31,14 +31,14 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.Downsampler; -import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.LegacyReservoirDownsampler; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -50,11 +50,11 @@ import java.util.*; /** * Iterator that traverses a SAM File, accumulating information on a per-locus basis */ -public class LocusIteratorByStateExperimental extends LocusIterator { +public class LegacyLocusIteratorByState extends LocusIterator { /** * our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class); // ----------------------------------------------------------------------------------------------------------------- // @@ -69,7 +69,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator { private final ArrayList samples; private final ReadStateManager readStates; - protected static class SAMRecordState { + static private class SAMRecordState { SAMRecord read; int readOffset = -1; // how far are we offset from the start of the read bases? int genomeOffset = -1; // how far are we offset from the alignment start on the genome? @@ -213,7 +213,6 @@ public class LocusIteratorByStateExperimental extends LocusIterator { //final boolean DEBUG2 = false && DEBUG; private ReadProperties readInfo; private AlignmentContext nextAlignmentContext; - private boolean performLevelingDownsampling; // ----------------------------------------------------------------------------------------------------------------- // @@ -221,15 +220,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator { // // ----------------------------------------------------------------------------------------------------------------- - public LocusIteratorByStateExperimental(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { + public LegacyLocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; this.samples = new ArrayList(samples); - this.readStates = new ReadStateManager(samIterator); - - this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null && - readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && - readInfo.getDownsamplingMethod().toCoverage != null; + this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod()); // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when // there's no read data. So we need to throw this error only when samIterator.hasNext() is true @@ -290,13 +285,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); - - // TODO: How can you determine here whether the current pileup has been downsampled? boolean hasBeenSampled = false; - for (final String sample : samples) { final Iterator iterator = readStates.iterator(sample); final List pile = new ArrayList(readStates.size(sample)); + hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); int size = 0; // number of elements in this sample's pileup int nDeletions = 0; // number of deletions in this sample's pileup @@ -405,20 +398,34 @@ public class LocusIteratorByStateExperimental extends LocusIterator { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - protected class ReadStateManager { + private class ReadStateManager { private final PeekableIterator iterator; + private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; private final Map readStatesBySample = new HashMap(); + private final int targetCoverage; private int totalReadStates = 0; - public ReadStateManager(Iterator source) { + public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) { this.iterator = new PeekableIterator(source); - - for (final String sample : samples) { - readStatesBySample.put(sample, new PerSampleReadStateManager()); + this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE; + switch (this.downsamplingMethod.type) { + case BY_SAMPLE: + if (downsamplingMethod.toCoverage == null) + throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample"); + this.targetCoverage = downsamplingMethod.toCoverage; + break; + default: + this.targetCoverage = Integer.MAX_VALUE; } - samplePartitioner = new SamplePartitioner(); + Map readSelectors = new HashMap(); + for (final String sample : samples) { + readStatesBySample.put(sample, new PerSampleReadStateManager()); + readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector()); + } + + samplePartitioner = new SamplePartitioner(readSelectors); } /** @@ -442,6 +449,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator { public void remove() { wrappedIterator.remove(); + totalReadStates--; } }; } @@ -469,6 +477,17 @@ public class LocusIteratorByStateExperimental extends LocusIterator { return readStatesBySample.get(sample).size(); } + /** + * The extent of downsampling; basically, the furthest base out which has 'fallen + * victim' to the downsampler. + * + * @param sample Sample, downsampled independently. + * @return Integer stop of the furthest undownsampled region. + */ + public int getDownsamplingExtent(final String sample) { + return readStatesBySample.get(sample).getDownsamplingExtent(); + } + public SAMRecordState getFirst() { for (final String sample : samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); @@ -501,13 +520,61 @@ public class LocusIteratorByStateExperimental extends LocusIterator { samplePartitioner.submitRead(iterator.next()); } } + samplePartitioner.complete(); for (final String sample : samples) { - Collection newReads = samplePartitioner.getReadsForSample(sample); - PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); - addReadsToSample(statesBySample, newReads); - } + ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); + Collection newReads = new ArrayList(aggregator.getSelectedReads()); + + PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); + int numReads = statesBySample.size(); + int downsamplingExtent = aggregator.getDownsamplingExtent(); + + if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) { + long readLimit = aggregator.getNumReadsSeen(); + addReadsToSample(statesBySample, newReads, readLimit); + statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); + } else { + int[] counts = statesBySample.getCountsPerAlignmentStart(); + int[] updatedCounts = new int[counts.length]; + System.arraycopy(counts, 0, updatedCounts, 0, counts.length); + + boolean readPruned = true; + while (numReads + newReads.size() > targetCoverage && readPruned) { + readPruned = false; + for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) { + if (updatedCounts[alignmentStart] > 1) { + updatedCounts[alignmentStart]--; + numReads--; + readPruned = true; + } + } + } + + if (numReads == targetCoverage) { + updatedCounts[0]--; + numReads--; + } + + BitSet toPurge = new BitSet(readStates.size()); + int readOffset = 0; + + for (int i = 0; i < updatedCounts.length; i++) { + int n = counts[i]; + int k = updatedCounts[i]; + + for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k)) + toPurge.set(readOffset + purgedElement); + + readOffset += counts[i]; + } + downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge)); + + addReadsToSample(statesBySample, newReads, targetCoverage - numReads); + statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); + } + } samplePartitioner.reset(); } @@ -516,134 +583,380 @@ public class LocusIteratorByStateExperimental extends LocusIterator { * * @param readStates The list of read states to add this collection of reads. * @param reads Reads to add. Selected reads will be pulled from this source. + * @param maxReads Maximum number of reads to add. */ - private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) { if (reads.isEmpty()) return; Collection newReadStates = new LinkedList(); - + int readCount = 0; for (SAMRecord read : reads) { - SAMRecordState state = new SAMRecordState(read); - state.stepForwardOnGenome(); - newReadStates.add(state); + if (readCount < maxReads) { + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); + readCount++; + } } - readStates.addStatesAtNextAlignmentStart(newReadStates); } - protected class PerSampleReadStateManager implements Iterable { - private List> readStatesByAlignmentStart = new LinkedList>(); - private int thisSampleReadStates = 0; - private Downsampler> levelingDownsampler = - performLevelingDownsampling ? - new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : - null; + private class PerSampleReadStateManager implements Iterable { + private final Queue readStates = new LinkedList(); + private final Deque readStateCounter = new LinkedList(); + private int downsamplingExtent = 0; public void addStatesAtNextAlignmentStart(Collection states) { - if ( states.isEmpty() ) { - return; - } - - readStatesByAlignmentStart.add(new LinkedList(states)); - thisSampleReadStates += states.size(); + readStates.addAll(states); + readStateCounter.add(new Counter(states.size())); totalReadStates += states.size(); - - if ( levelingDownsampler != null ) { - levelingDownsampler.submit(readStatesByAlignmentStart); - levelingDownsampler.signalEndOfInput(); - - thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - - // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); - levelingDownsampler.reset(); - } } public boolean isEmpty() { - return readStatesByAlignmentStart.isEmpty(); + return readStates.isEmpty(); } public SAMRecordState peek() { - return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); + return readStates.peek(); } public int size() { - return thisSampleReadStates; + return readStates.size(); + } + + public void specifyNewDownsamplingExtent(int downsamplingExtent) { + this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent); + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public int[] getCountsPerAlignmentStart() { + int[] counts = new int[readStateCounter.size()]; + int index = 0; + for (Counter counter : readStateCounter) + counts[index++] = counter.getCount(); + return counts; } public Iterator iterator() { return new Iterator() { - private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates = null; - private Iterator currentPositionReadStatesIterator = null; + private Iterator wrappedIterator = readStates.iterator(); public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); + return wrappedIterator.hasNext(); } public SAMRecordState next() { - if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); + return wrappedIterator.next(); } public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - totalReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } + wrappedIterator.remove(); + Counter counter = readStateCounter.peek(); + counter.decrement(); + if (counter.getCount() == 0) + readStateCounter.remove(); } }; } + + /** + * Purge the given elements from the bitset. If an element in the bitset is true, purge + * the corresponding read state. + * + * @param elements bits from the set to purge. + * @return the extent of the final downsampled read. + */ + public int purge(final BitSet elements) { + int downsamplingExtent = 0; + + if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; + + Iterator readStateIterator = readStates.iterator(); + + Iterator counterIterator = readStateCounter.iterator(); + Counter currentCounter = counterIterator.next(); + + int readIndex = 0; + long alignmentStartCounter = currentCounter.getCount(); + + int toPurge = elements.nextSetBit(0); + int removedCount = 0; + + while (readStateIterator.hasNext() && toPurge >= 0) { + SAMRecordState state = readStateIterator.next(); + downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd()); + + if (readIndex == toPurge) { + readStateIterator.remove(); + currentCounter.decrement(); + if (currentCounter.getCount() == 0) + counterIterator.remove(); + removedCount++; + toPurge = elements.nextSetBit(toPurge + 1); + } + + readIndex++; + alignmentStartCounter--; + if (alignmentStartCounter == 0 && counterIterator.hasNext()) { + currentCounter = counterIterator.next(); + alignmentStartCounter = currentCounter.getCount(); + } + } + + totalReadStates -= removedCount; + + return downsamplingExtent; + } } } /** - * Note: stores reads by sample ID string, not by sample object + * Note: assuming that, whenever we downsample, we downsample to an integer capacity. */ - private class SamplePartitioner { - private Map> readsBySample; - private long readsSeen = 0; + static private class Counter { + private int count; - public SamplePartitioner() { - readsBySample = new HashMap>(); - - for ( String sample : samples ) { - readsBySample.put(sample, new ArrayList()); - } + public Counter(int count) { + this.count = count; } - public void submitRead(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).add(read); - readsSeen++; + public int getCount() { + return count; } - public long getNumReadsSeen() { - return readsSeen; - } - - public Collection getReadsForSample(String sampleName) { - if ( ! readsBySample.containsKey(sampleName) ) - throw new NoSuchElementException("Sample name not found"); - return readsBySample.get(sampleName); - } - - public void reset() { - for ( Collection perSampleReads : readsBySample.values() ) - perSampleReads.clear(); - readsSeen = 0; + public void decrement() { + count--; } } -} \ No newline at end of file +} + +/** + * Selects reads passed to it based on a criteria decided through inheritance. + * TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this. + */ +interface ReadSelector { + /** + * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration. + * + * @param read the read to evaluate. + */ + public void submitRead(SAMRecord read); + + /** + * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid. + * + * @param read the read previously rejected. + */ + public void notifyReadRejected(SAMRecord read); + + /** + * Signal the selector that read additions are complete. + */ + public void complete(); + + /** + * Retrieve the number of reads seen by this selector so far. + * + * @return number of reads seen. + */ + public long getNumReadsSeen(); + + /** + * Return the number of reads accepted by this selector so far. + * + * @return number of reads selected. + */ + public long getNumReadsSelected(); + + /** + * Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the + * last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at + * position 3 whose cigar string is 76M, the value of this parameter will be 78. + * + * @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0. + */ + public int getDownsamplingExtent(); + + /** + * Get the reads selected by this selector. + * + * @return collection of reads selected by this selector. + */ + public Collection getSelectedReads(); + + /** + * Reset this collection to its pre-gathered state. + */ + public void reset(); +} + +/** + * Select every read passed in. + */ +class AllReadsSelector implements ReadSelector { + private Collection reads = new LinkedList(); + private long readsSeen = 0; + private int downsamplingExtent = 0; + + public void submitRead(SAMRecord read) { + reads.add(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + readsSeen++; + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); + } + + public void complete() { + // NO-OP. + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return readsSeen; + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public Collection getSelectedReads() { + return reads; + } + + public void reset() { + reads.clear(); + readsSeen = 0; + downsamplingExtent = 0; + } +} + + +/** + * Select N reads randomly from the input stream. + */ +class NRandomReadSelector implements ReadSelector { + private final LegacyReservoirDownsampler reservoir; + private final ReadSelector chainedSelector; + private long readsSeen = 0; + private int downsamplingExtent = 0; + + public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) { + this.reservoir = new LegacyReservoirDownsampler((int) readLimit); + this.chainedSelector = chainedSelector; + } + + public void submitRead(SAMRecord read) { + SAMRecord displaced = reservoir.add(read); + if (displaced != null && chainedSelector != null) { + chainedSelector.notifyReadRejected(read); + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); + } + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + readsSeen++; + } + + public void complete() { + for (SAMRecord read : reservoir.getDownsampledContents()) + chainedSelector.submitRead(read); + if (chainedSelector != null) + chainedSelector.complete(); + } + + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return reservoir.size(); + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public Collection getSelectedReads() { + return reservoir.getDownsampledContents(); + } + + public void reset() { + reservoir.clear(); + downsamplingExtent = 0; + if (chainedSelector != null) + chainedSelector.reset(); + } +} + +/** + * Note: stores reads by sample ID string, not by sample object + */ +class SamplePartitioner implements ReadSelector { + private final Map readsBySample; + private long readsSeen = 0; + + public SamplePartitioner(Map readSelectors) { + readsBySample = readSelectors; + } + + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).submitRead(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).notifyReadRejected(read); + readsSeen++; + } + + public void complete() { + // NO-OP. + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return readsSeen; + } + + public int getDownsamplingExtent() { + int downsamplingExtent = 0; + for (ReadSelector storage : readsBySample.values()) + downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent()); + return downsamplingExtent; + } + + public Collection getSelectedReads() { + throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner."); + } + + public ReadSelector getSelectedReads(String sampleName) { + if (!readsBySample.containsKey(sampleName)) + throw new NoSuchElementException("Sample name not found"); + return readsBySample.get(sampleName); + } + + public void reset() { + for (ReadSelector storage : readsBySample.values()) + storage.reset(); + readsSeen = 0; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 46e84798a..4f8d7d3f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -31,14 +31,11 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.downsampling.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.ReservoirDownsampler; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -54,7 +51,7 @@ public class LocusIteratorByState extends LocusIterator { /** * our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class); // ----------------------------------------------------------------------------------------------------------------- // @@ -69,7 +66,7 @@ public class LocusIteratorByState extends LocusIterator { private final ArrayList samples; private final ReadStateManager readStates; - static private class SAMRecordState { + protected static class SAMRecordState { SAMRecord read; int readOffset = -1; // how far are we offset from the start of the read bases? int genomeOffset = -1; // how far are we offset from the alignment start on the genome? @@ -213,6 +210,7 @@ public class LocusIteratorByState extends LocusIterator { //final boolean DEBUG2 = false && DEBUG; private ReadProperties readInfo; private AlignmentContext nextAlignmentContext; + private boolean performDownsampling; // ----------------------------------------------------------------------------------------------------------------- // @@ -224,7 +222,18 @@ public class LocusIteratorByState extends LocusIterator { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; this.samples = new ArrayList(samples); - this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod()); + + // LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're + // downsampling to coverage by sample. SAMDataSource will have refrained from applying + // any downsamplers to the read stream in this case, in the expectation that LIBS will + // manage the downsampling. The reason for this is twofold: performance (don't have to + // split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling + // of reads (eg., using half of a read, and throwing the rest away). + this.performDownsampling = readInfo.getDownsamplingMethod() != null && + readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readInfo.getDownsamplingMethod().toCoverage != null; + + this.readStates = new ReadStateManager(samIterator); // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when // there's no read data. So we need to throw this error only when samIterator.hasNext() is true @@ -285,11 +294,13 @@ public class LocusIteratorByState extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); + + // TODO: How can you determine here whether the current pileup has been downsampled? boolean hasBeenSampled = false; + for (final String sample : samples) { final Iterator iterator = readStates.iterator(sample); final List pile = new ArrayList(readStates.size(sample)); - hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); int size = 0; // number of elements in this sample's pileup int nDeletions = 0; // number of deletions in this sample's pileup @@ -398,34 +409,20 @@ public class LocusIteratorByState extends LocusIterator { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - private class ReadStateManager { + protected class ReadStateManager { private final PeekableIterator iterator; - private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; private final Map readStatesBySample = new HashMap(); - private final int targetCoverage; private int totalReadStates = 0; - public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) { + public ReadStateManager(Iterator source) { this.iterator = new PeekableIterator(source); - this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE; - switch (this.downsamplingMethod.type) { - case BY_SAMPLE: - if (downsamplingMethod.toCoverage == null) - throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample"); - this.targetCoverage = downsamplingMethod.toCoverage; - break; - default: - this.targetCoverage = Integer.MAX_VALUE; - } - Map readSelectors = new HashMap(); for (final String sample : samples) { readStatesBySample.put(sample, new PerSampleReadStateManager()); - readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector()); } - samplePartitioner = new SamplePartitioner(readSelectors); + samplePartitioner = new SamplePartitioner(performDownsampling); } /** @@ -449,7 +446,6 @@ public class LocusIteratorByState extends LocusIterator { public void remove() { wrappedIterator.remove(); - totalReadStates--; } }; } @@ -477,17 +473,6 @@ public class LocusIteratorByState extends LocusIterator { return readStatesBySample.get(sample).size(); } - /** - * The extent of downsampling; basically, the furthest base out which has 'fallen - * victim' to the downsampler. - * - * @param sample Sample, downsampled independently. - * @return Integer stop of the furthest undownsampled region. - */ - public int getDownsamplingExtent(final String sample) { - return readStatesBySample.get(sample).getDownsamplingExtent(); - } - public SAMRecordState getFirst() { for (final String sample : samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); @@ -520,61 +505,15 @@ public class LocusIteratorByState extends LocusIterator { samplePartitioner.submitRead(iterator.next()); } } - samplePartitioner.complete(); + + samplePartitioner.doneSubmittingReads(); for (final String sample : samples) { - ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); - - Collection newReads = new ArrayList(aggregator.getSelectedReads()); - + Collection newReads = samplePartitioner.getReadsForSample(sample); PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); - int numReads = statesBySample.size(); - int downsamplingExtent = aggregator.getDownsamplingExtent(); - - if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) { - long readLimit = aggregator.getNumReadsSeen(); - addReadsToSample(statesBySample, newReads, readLimit); - statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); - } else { - int[] counts = statesBySample.getCountsPerAlignmentStart(); - int[] updatedCounts = new int[counts.length]; - System.arraycopy(counts, 0, updatedCounts, 0, counts.length); - - boolean readPruned = true; - while (numReads + newReads.size() > targetCoverage && readPruned) { - readPruned = false; - for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) { - if (updatedCounts[alignmentStart] > 1) { - updatedCounts[alignmentStart]--; - numReads--; - readPruned = true; - } - } - } - - if (numReads == targetCoverage) { - updatedCounts[0]--; - numReads--; - } - - BitSet toPurge = new BitSet(readStates.size()); - int readOffset = 0; - - for (int i = 0; i < updatedCounts.length; i++) { - int n = counts[i]; - int k = updatedCounts[i]; - - for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k)) - toPurge.set(readOffset + purgedElement); - - readOffset += counts[i]; - } - downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge)); - - addReadsToSample(statesBySample, newReads, targetCoverage - numReads); - statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); - } + addReadsToSample(statesBySample, newReads); } + samplePartitioner.reset(); } @@ -583,380 +522,140 @@ public class LocusIteratorByState extends LocusIterator { * * @param readStates The list of read states to add this collection of reads. * @param reads Reads to add. Selected reads will be pulled from this source. - * @param maxReads Maximum number of reads to add. */ - private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) { + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { if (reads.isEmpty()) return; Collection newReadStates = new LinkedList(); - int readCount = 0; + for (SAMRecord read : reads) { - if (readCount < maxReads) { - SAMRecordState state = new SAMRecordState(read); - state.stepForwardOnGenome(); - newReadStates.add(state); - readCount++; - } + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); } + readStates.addStatesAtNextAlignmentStart(newReadStates); } - private class PerSampleReadStateManager implements Iterable { - private final Queue readStates = new LinkedList(); - private final Deque readStateCounter = new LinkedList(); - private int downsamplingExtent = 0; + protected class PerSampleReadStateManager implements Iterable { + private List> readStatesByAlignmentStart = new LinkedList>(); + private int thisSampleReadStates = 0; + private Downsampler> levelingDownsampler = + performDownsampling ? + new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : + null; public void addStatesAtNextAlignmentStart(Collection states) { - readStates.addAll(states); - readStateCounter.add(new Counter(states.size())); + if ( states.isEmpty() ) { + return; + } + + readStatesByAlignmentStart.add(new LinkedList(states)); + thisSampleReadStates += states.size(); totalReadStates += states.size(); + + if ( levelingDownsampler != null ) { + levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.signalEndOfInput(); + + thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + levelingDownsampler.reset(); + } } public boolean isEmpty() { - return readStates.isEmpty(); + return readStatesByAlignmentStart.isEmpty(); } public SAMRecordState peek() { - return readStates.peek(); + return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); } public int size() { - return readStates.size(); - } - - public void specifyNewDownsamplingExtent(int downsamplingExtent) { - this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent); - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public int[] getCountsPerAlignmentStart() { - int[] counts = new int[readStateCounter.size()]; - int index = 0; - for (Counter counter : readStateCounter) - counts[index++] = counter.getCount(); - return counts; + return thisSampleReadStates; } public Iterator iterator() { return new Iterator() { - private Iterator wrappedIterator = readStates.iterator(); + private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates = null; + private Iterator currentPositionReadStatesIterator = null; public boolean hasNext() { - return wrappedIterator.hasNext(); + return alignmentStartIterator.hasNext() || + (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); } public SAMRecordState next() { - return wrappedIterator.next(); + if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { + currentPositionReadStates = alignmentStartIterator.next(); + currentPositionReadStatesIterator = currentPositionReadStates.iterator(); + } + + return currentPositionReadStatesIterator.next(); } public void remove() { - wrappedIterator.remove(); - Counter counter = readStateCounter.peek(); - counter.decrement(); - if (counter.getCount() == 0) - readStateCounter.remove(); + currentPositionReadStatesIterator.remove(); + thisSampleReadStates--; + totalReadStates--; + + if ( currentPositionReadStates.isEmpty() ) { + alignmentStartIterator.remove(); + } } }; } + } + } - /** - * Purge the given elements from the bitset. If an element in the bitset is true, purge - * the corresponding read state. - * - * @param elements bits from the set to purge. - * @return the extent of the final downsampled read. - */ - public int purge(final BitSet elements) { - int downsamplingExtent = 0; + /** + * Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler. + * + * Note: stores reads by sample ID string, not by sample object + */ + private class SamplePartitioner { + private Map> readsBySample; - if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; + public SamplePartitioner( boolean downsampleReads ) { + readsBySample = new HashMap>(); - Iterator readStateIterator = readStates.iterator(); + for ( String sample : samples ) { + readsBySample.put(sample, + downsampleReads ? new ReservoirDownsampler(readInfo.getDownsamplingMethod().toCoverage) : + new PassThroughDownsampler()); + } + } - Iterator counterIterator = readStateCounter.iterator(); - Counter currentCounter = counterIterator.next(); + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).submit(read); + } - int readIndex = 0; - long alignmentStartCounter = currentCounter.getCount(); + public void doneSubmittingReads() { + for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { + perSampleReads.getValue().signalEndOfInput(); + } + } - int toPurge = elements.nextSetBit(0); - int removedCount = 0; + public Collection getReadsForSample(String sampleName) { + if ( ! readsBySample.containsKey(sampleName) ) + throw new NoSuchElementException("Sample name not found"); - while (readStateIterator.hasNext() && toPurge >= 0) { - SAMRecordState state = readStateIterator.next(); - downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd()); + return readsBySample.get(sampleName).consumeFinalizedItems(); + } - if (readIndex == toPurge) { - readStateIterator.remove(); - currentCounter.decrement(); - if (currentCounter.getCount() == 0) - counterIterator.remove(); - removedCount++; - toPurge = elements.nextSetBit(toPurge + 1); - } - - readIndex++; - alignmentStartCounter--; - if (alignmentStartCounter == 0 && counterIterator.hasNext()) { - currentCounter = counterIterator.next(); - alignmentStartCounter = currentCounter.getCount(); - } - } - - totalReadStates -= removedCount; - - return downsamplingExtent; + public void reset() { + for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { + perSampleReads.getValue().clear(); + perSampleReads.getValue().reset(); } } } - - /** - * Note: assuming that, whenever we downsample, we downsample to an integer capacity. - */ - static private class Counter { - private int count; - - public Counter(int count) { - this.count = count; - } - - public int getCount() { - return count; - } - - public void decrement() { - count--; - } - } -} - -/** - * Selects reads passed to it based on a criteria decided through inheritance. - * TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this. - */ -interface ReadSelector { - /** - * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration. - * - * @param read the read to evaluate. - */ - public void submitRead(SAMRecord read); - - /** - * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid. - * - * @param read the read previously rejected. - */ - public void notifyReadRejected(SAMRecord read); - - /** - * Signal the selector that read additions are complete. - */ - public void complete(); - - /** - * Retrieve the number of reads seen by this selector so far. - * - * @return number of reads seen. - */ - public long getNumReadsSeen(); - - /** - * Return the number of reads accepted by this selector so far. - * - * @return number of reads selected. - */ - public long getNumReadsSelected(); - - /** - * Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the - * last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at - * position 3 whose cigar string is 76M, the value of this parameter will be 78. - * - * @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0. - */ - public int getDownsamplingExtent(); - - /** - * Get the reads selected by this selector. - * - * @return collection of reads selected by this selector. - */ - public Collection getSelectedReads(); - - /** - * Reset this collection to its pre-gathered state. - */ - public void reset(); -} - -/** - * Select every read passed in. - */ -class AllReadsSelector implements ReadSelector { - private Collection reads = new LinkedList(); - private long readsSeen = 0; - private int downsamplingExtent = 0; - - public void submitRead(SAMRecord read) { - reads.add(read); - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - readsSeen++; - downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); - } - - public void complete() { - // NO-OP. - } - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return readsSeen; - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public Collection getSelectedReads() { - return reads; - } - - public void reset() { - reads.clear(); - readsSeen = 0; - downsamplingExtent = 0; - } -} - - -/** - * Select N reads randomly from the input stream. - */ -class NRandomReadSelector implements ReadSelector { - private final ReservoirDownsampler reservoir; - private final ReadSelector chainedSelector; - private long readsSeen = 0; - private int downsamplingExtent = 0; - - public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) { - this.reservoir = new ReservoirDownsampler((int) readLimit); - this.chainedSelector = chainedSelector; - } - - public void submitRead(SAMRecord read) { - SAMRecord displaced = reservoir.add(read); - if (displaced != null && chainedSelector != null) { - chainedSelector.notifyReadRejected(read); - downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); - } - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - readsSeen++; - } - - public void complete() { - for (SAMRecord read : reservoir.getDownsampledContents()) - chainedSelector.submitRead(read); - if (chainedSelector != null) - chainedSelector.complete(); - } - - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return reservoir.size(); - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public Collection getSelectedReads() { - return reservoir.getDownsampledContents(); - } - - public void reset() { - reservoir.clear(); - downsamplingExtent = 0; - if (chainedSelector != null) - chainedSelector.reset(); - } -} - -/** - * Note: stores reads by sample ID string, not by sample object - */ -class SamplePartitioner implements ReadSelector { - private final Map readsBySample; - private long readsSeen = 0; - - public SamplePartitioner(Map readSelectors) { - readsBySample = readSelectors; - } - - public void submitRead(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).submitRead(read); - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).notifyReadRejected(read); - readsSeen++; - } - - public void complete() { - // NO-OP. - } - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return readsSeen; - } - - public int getDownsamplingExtent() { - int downsamplingExtent = 0; - for (ReadSelector storage : readsBySample.values()) - downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent()); - return downsamplingExtent; - } - - public Collection getSelectedReads() { - throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner."); - } - - public ReadSelector getSelectedReads(String sampleName) { - if (!readsBySample.containsKey(sampleName)) - throw new NoSuchElementException("Sample name not found"); - return readsBySample.get(sampleName); - } - - public void reset() { - for (ReadSelector storage : readsBySample.values()) - storage.reset(); - readsSeen = 0; - } - -} +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index 7ae2bb453..605a6680f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -343,7 +343,7 @@ public class GATKReport { GATKReportTable table = tables.firstEntry().getValue(); if ( table.getNumColumns() != values.length ) - throw new ReviewedStingException("The number of arguments in writeRow() " + values.length + " must match the number of columns in the table" + table.getNumColumns()); + throw new ReviewedStingException("The number of arguments in writeRow (" + values.length + ") must match the number of columns in the table (" + table.getNumColumns() + ")" ); final int rowIndex = table.getNumRows(); for ( int i = 0; i < values.length; i++ ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index dbb0e8cdd..d6a5cb16d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -52,7 +51,7 @@ public class AFCalcResult { private final double[] log10PriorsOfAC; private final double[] log10PosteriorsOfAC; - private final Map log10pNonRefByAllele; + private final Map log10pRefByAllele; /** * The AC values for all ALT alleles at the MLE @@ -74,16 +73,16 @@ public class AFCalcResult { final List allelesUsedInGenotyping, final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC, - final Map log10pNonRefByAllele) { + final Map log10pRefByAllele) { if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.size() < 1 ) throw new IllegalArgumentException("allelesUsedInGenotyping must be non-null list of at least 1 value " + allelesUsedInGenotyping); if ( alleleCountsOfMLE == null ) throw new IllegalArgumentException("alleleCountsOfMLE cannot be null"); if ( alleleCountsOfMLE.length != allelesUsedInGenotyping.size() - 1) throw new IllegalArgumentException("alleleCountsOfMLE.length " + alleleCountsOfMLE.length + " != allelesUsedInGenotyping.size() " + allelesUsedInGenotyping.size()); if ( nEvaluations < 0 ) throw new IllegalArgumentException("nEvaluations must be >= 0 but saw " + nEvaluations); if ( log10LikelihoodsOfAC.length != 2 ) throw new IllegalArgumentException("log10LikelihoodsOfAC must have length equal 2"); if ( log10PriorsOfAC.length != 2 ) throw new IllegalArgumentException("log10PriorsOfAC must have length equal 2"); - if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null"); - if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); - if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( log10pRefByAllele == null ) throw new IllegalArgumentException("log10pRefByAllele cannot be null"); + if ( log10pRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pRefByAllele has the wrong number of elements: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( ! allelesUsedInGenotyping.containsAll(log10pRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pRefByAllele doesn't contain all of the alleles used in genotyping: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); @@ -94,7 +93,7 @@ public class AFCalcResult { this.log10LikelihoodsOfAC = Arrays.copyOf(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES); this.log10PriorsOfAC = Arrays.copyOf(log10PriorsOfAC, LOG_10_ARRAY_SIZES); this.log10PosteriorsOfAC = computePosteriors(log10LikelihoodsOfAC, log10PriorsOfAC); - this.log10pNonRefByAllele = new HashMap(log10pNonRefByAllele); + this.log10pRefByAllele = new HashMap(log10pRefByAllele); } /** @@ -104,7 +103,7 @@ public class AFCalcResult { * @return */ public AFCalcResult withNewPriors(final double[] log10PriorsOfAC) { - return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); } /** @@ -219,7 +218,7 @@ public class AFCalcResult { public String toString() { final List byAllele = new LinkedList(); for ( final Allele a : getAllelesUsedInGenotyping() ) - if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFGt0ForAllele(a))); + if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFEq0ForAllele(a))); return String.format("AFCalc%n\t\tlog10PosteriorOfAFGT0=%.2f%n\t\t%s", getLog10LikelihoodOfAFGT0(), Utils.join("\n\t\t", byAllele)); } @@ -237,7 +236,7 @@ public class AFCalcResult { */ @Requires("MathUtils.goodLog10Probability(log10minPNonRef)") public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) { - return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; + return getLog10PosteriorOfAFEq0ForAllele(allele) < log10minPNonRef; } /** @@ -245,7 +244,7 @@ public class AFCalcResult { */ public boolean isPolymorphicPhredScaledQual(final Allele allele, final double minPNonRefPhredScaledQual) { if ( minPNonRefPhredScaledQual < 0 ) throw new IllegalArgumentException("phredScaledQual " + minPNonRefPhredScaledQual + " < 0 "); - final double log10Threshold = Math.log10(QualityUtils.qualToProb(minPNonRefPhredScaledQual)); + final double log10Threshold = minPNonRefPhredScaledQual / -10; return isPolymorphic(allele, log10Threshold); } @@ -263,7 +262,16 @@ public class AFCalcResult { } /** - * Returns the log10 probability that allele is segregating + * Returns the log10 probability that allele is not segregating + * + * Note that this function is p not segregating so that we can store + * internally the log10 value of AF == 0, which grows very quickly + * negative and yet has sufficient resolution for high confidence tests. + * For example, if log10pRef == -100, not an unreasonably high number, + * if we tried to store log10pNonRef we'd be looking at 1 - 10^-100, which + * quickly underflows to 1. So the logic here is backward from what + * you really want (the p of segregating) but we do that for numerical + * reasons * * Unlike the sites-level annotation, this calculation is specific to allele, and can be * used to separately determine how much evidence there is that allele is independently @@ -272,11 +280,11 @@ public class AFCalcResult { * evidence for one allele but not so much for any other allele * * @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping - * @return the log10 probability that allele is segregating at this site + * @return the log10 probability that allele is not segregating at this site */ @Ensures("MathUtils.goodLog10Probability(result)") - public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) { - final Double log10pNonRef = log10pNonRefByAllele.get(allele); + public double getLog10PosteriorOfAFEq0ForAllele(final Allele allele) { + final Double log10pNonRef = log10pRefByAllele.get(allele); if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele); return log10pNonRef; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java index f13fe4429..b138ddf70 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java @@ -77,7 +77,7 @@ public class ExactCallLogger implements Cloneable { for ( final Allele allele : result.getAllelesUsedInGenotyping() ) { if ( allele.isNonReference() ) { printCallElement(vc, "MLE", allele, result.getAlleleCountAtMLE(allele)); - printCallElement(vc, "pNonRefByAllele", allele, result.getLog10PosteriorOfAFGt0ForAllele(allele)); + printCallElement(vc, "pRefByAllele", allele, result.getLog10PosteriorOfAFEq0ForAllele(allele)); } } @@ -123,7 +123,7 @@ public class ExactCallLogger implements Cloneable { final double[] posteriors = new double[2]; final double[] priors = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true); final List mle = new ArrayList(); - final Map log10pNonRefByAllele = new HashMap(); + final Map log10pRefByAllele = new HashMap(); long runtimeNano = -1; GenomeLoc currentLoc = null; @@ -148,7 +148,7 @@ public class ExactCallLogger implements Cloneable { builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); builder.genotypes(genotypes); final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[]{})); - final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); + final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pRefByAllele); calls.add(new ExactCall(builder.make(), runtimeNano, result)); } break; @@ -165,9 +165,9 @@ public class ExactCallLogger implements Cloneable { posteriors[1] = Double.valueOf(value); } else if (variable.equals("MLE")) { mle.add(Integer.valueOf(value)); - } else if (variable.equals("pNonRefByAllele")) { + } else if (variable.equals("pRefByAllele")) { final Allele a = Allele.create(key); - log10pNonRefByAllele.put(a, Double.valueOf(value)); + log10pRefByAllele.put(a, Double.valueOf(value)); } else if (variable.equals("runtime.nano")) { runtimeNano = Long.valueOf(value); } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index d0b801a20..937ef2ffc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -125,8 +125,8 @@ import java.util.*; */ final List supporting; - private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pNonRefByAllele, List supporting) { - super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pRefByAllele, List supporting) { + super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); this.supporting = supporting; } } @@ -323,7 +323,7 @@ import java.util.*; final int nAltAlleles = sortedResultsWithThetaNPriors.size(); final int[] alleleCountsOfMLE = new int[nAltAlleles]; final double[] log10PriorsOfAC = new double[2]; - final Map log10pNonRefByAllele = new HashMap(nAltAlleles); + final Map log10pRefByAllele = new HashMap(nAltAlleles); // the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs double log10PosteriorOfACEq0Sum = 0.0; @@ -348,7 +348,7 @@ import java.util.*; log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior - log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); + log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0()); // trivial -- update the number of evaluations nEvaluations += sortedResultWithThetaNPriors.nEvaluations; @@ -384,6 +384,6 @@ import java.util.*; MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // priors incorporate multiple alt alleles, must be normalized MathUtils.normalizeFromLog10(log10PriorsOfAC, true), - log10pNonRefByAllele, sortedResultsWithThetaNPriors); + log10pRefByAllele, sortedResultsWithThetaNPriors); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index fc26111e0..67cc79646 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -30,13 +30,13 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)}; final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors); - final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO; - final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef); + final double log10PRef = log10Posteriors[1] > log10Posteriors[0] ? MathUtils.LOG10_P_OF_ZERO : 0.0; + final Map log10pRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PRef); return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(), MathUtils.normalizeFromLog10(log10Likelihoods, true), MathUtils.normalizeFromLog10(log10Priors, true), - log10pNonRefByAllele); + log10pRefByAllele); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index b82ec1d29..ad6361a3f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -165,14 +165,14 @@ final class StateTracker { final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true); final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); + final Map log10pRefByAllele = new HashMap(allelesUsedInGenotyping.size()); for ( int i = 0; i < subACOfMLE.length; i++ ) { final Allele allele = allelesUsedInGenotyping.get(i+1); - final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was - log10pNonRefByAllele.put(allele, log10PNonRef); + final double log10PRef = alleleCountsOfMAP[i] > 0 ? -10000 : 0; // TODO -- a total hack but in effect what the old behavior was + log10pRefByAllele.put(allele, log10PRef); } - return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); + return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pRefByAllele); } // -------------------------------------------------------------------------------- diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 7b797432d..848aaf8a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -287,6 +287,9 @@ public class PairHMMIndelErrorModel { if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) { startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes; } + else if (startLocationInRefForHaplotypes > ref.getWindow().getStop()) { + startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely; + } if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context @@ -338,6 +341,8 @@ public class PairHMMIndelErrorModel { if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) startLocationInRefForHaplotypes = haplotype.getStartPosition(); + else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) + startLocationInRefForHaplotypes = haplotype.getStopPosition(); final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); @@ -347,8 +352,6 @@ public class PairHMMIndelErrorModel { System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); - - final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java deleted file mode 100755 index d38c11594..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.*; - -/** - * Emits intervals present in either the original or reduced bam but not the other. - * - *

Input

- *

- * The original and reduced BAM files. - *

- * - *

Output

- *

- * A list of intervals present in one bam but not the other. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -I:original original.bam \
- *   -I:reduced reduced.bam \
- *   -R ref.fasta \
- *   -T AssessReducedCoverage \
- *   -o output.intervals
- * 
- * - * @author ebanks - */ -@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) -@Hidden -public class AssessReducedCoverage extends LocusWalker implements TreeReducible { - - private static final String original = "original"; - private static final String reduced = "reduced"; - - @Output - protected PrintStream out; - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - @Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false) - public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false; - - public void initialize() {} - - public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - if ( tracker == null ) - return null; - - Set tags = getAllTags(context.getBasePileup()); - return (tags.contains(original) && !tags.contains(reduced)) || - (OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? ref.getLocus() : null; - } - - private Set getAllTags(final ReadBackedPileup pileup) { - - final Set tags = new HashSet(10); - - for ( final PileupElement p : pileup ) { - if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() ) - tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags()); - } - - return tags; - } - - public void onTraversalDone(GenomeLoc sum) { - if ( sum != null ) - out.println(sum); - } - - public GenomeLoc reduceInit() { - return null; - } - - public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { - if ( lhs == null ) - return rhs; - - if ( rhs == null ) - return lhs; - - // if contiguous, just merge them - if ( lhs.contiguousP(rhs) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); - - // otherwise, print the lhs and start over with the rhs - out.println(lhs); - return rhs; - } - - public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { - if ( value == null ) - return sum; - - if ( sum == null ) - return value; - - // if contiguous, just merge them - if ( sum.contiguousP(value) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); - - // otherwise, print the sum and start over with the value - out.println(sum); - return value; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java deleted file mode 100644 index 78bcf1228..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java +++ /dev/null @@ -1,173 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.List; - -/** - * Emits intervals in which the differences between the original and reduced bam quals are bigger epsilon (unless the quals of - * the reduced bam are above sufficient threshold) - * - *

Input

- *

- * The original and reduced BAM files. - *

- * - *

Output

- *

- * A list of intervals in which the differences between the original and reduced bam quals are bigger epsilon. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -I:original original.bam \
- *   -I:reduced reduced.bam \
- *   -R ref.fasta \
- *   -T AssessReducedQuals \
- *   -o output.intervals
- * 
- * - * @author ami - */ - -public class AssessReducedQuals extends LocusWalker implements TreeReducible { - - private static final String reduced = "reduced"; - private static final String original = "original"; - private static final int originalQualsIndex = 0; - private static final int reducedQualsIndex = 1; - - @Argument(fullName = "sufficientQualSum", shortName = "sufficientQualSum", doc = "When a reduced bam qual sum is above this threshold, it passes even without comparing to the non-reduced bam ", required = false) - public int sufficientQualSum = 600; - - @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > epsilon*Quals_original_bam we output this interval", required = false) - public int qual_epsilon = 0; - - @Argument(fullName = "debugLevel", shortName = "debug", doc = "debug mode on") // TODO -- best to make this optional - public int debugLevel = 0; // TODO -- best to make this an enum or boolean - - @Output - protected PrintStream out; - - public void initialize() { - if (debugLevel != 0) - out.println(" Debug mode" + - "Debug:\tsufficientQualSum: "+sufficientQualSum+ "\n " + - "Debug:\tqual_epsilon: "+qual_epsilon); - } - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - @Override - public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return null; - - boolean reportLocus; - final int[] quals = getPileupQuals(context.getBasePileup()); - if (debugLevel != 0) - out.println("Debug:\tLocus Quals\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]); - final int epsilon = MathUtils.fastRound(quals[originalQualsIndex]*qual_epsilon); - final int calcOriginalQuals = Math.min(quals[originalQualsIndex],sufficientQualSum); - final int calcReducedQuals = Math.min(quals[reducedQualsIndex],sufficientQualSum); - final int OriginalReducedQualDiff = calcOriginalQuals - calcReducedQuals; - reportLocus = OriginalReducedQualDiff > epsilon || OriginalReducedQualDiff < -1*epsilon; - if(debugLevel != 0 && reportLocus) - out.println("Debug:\tEmited Locus\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]+"\tepsilon\t"+epsilon+"\tdiff\t"+OriginalReducedQualDiff); - - return reportLocus ? ref.getLocus() : null; - } - - private final int[] getPileupQuals(final ReadBackedPileup readPileup) { - - final int[] quals = new int[2]; - String[] printPileup = {"Debug 2:\toriginal pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n", - "Debug 2:\t reduced pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n"}; - - for( PileupElement p : readPileup ){ - final List tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags(); - if ( isGoodRead(p,tags) ){ - final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount(); - final int tagIndex = getTagIndex(tags); - quals[tagIndex] += tempQual; - if(debugLevel == 2) - printPileup[tagIndex] += "\tDebug 2: ("+p+")\tMQ="+p.getMappingQual()+":QU="+p.getQual()+":RC="+p.getRepresentativeCount()+":OS="+p.getOffset()+"\n"; - } - } - if(debugLevel == 2){ - out.println(printPileup[originalQualsIndex]); - out.println(printPileup[reducedQualsIndex]); - } - return quals; - } - - // TODO -- arguments/variables should be final, not method declaration - private final boolean isGoodRead(PileupElement p, List tags){ - // TODO -- this isn't quite right. You don't need the tags here. Instead, you want to check whether the read itself (which - // TODO -- you can get from the PileupElement) is a reduced read (not all reads from the reduced bam are reduced) and only - // TODO -- for them do you want to ignore that min mapping quality cutoff (but you *do* still want the min base cutoff). - return !p.isDeletion() && (tags.contains(reduced) || (tags.contains(original) && (int)p.getQual() >= 20 && p.getMappingQual() >= 20)); - } - - private final int getTagIndex(List tags){ - return tags.contains(reduced) ? 1 : 0; - } - - @Override - public void onTraversalDone(GenomeLoc sum) { - if ( sum != null ) - out.println(sum); - } - - @Override - public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { - if ( lhs == null ) - return rhs; - - if ( rhs == null ) - return lhs; - - // if contiguous, just merge them - if ( lhs.contiguousP(rhs) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); - - // otherwise, print the lhs and start over with the rhs - out.println(lhs); - return rhs; - } - - @Override - public GenomeLoc reduceInit() { - return null; - } - - @Override - public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { - if ( value == null ) - return sum; - - if ( sum == null ) - return value; - - // if contiguous, just merge them - if ( sum.contiguousP(value) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); - - // otherwise, print the sum and start over with the value - out.println(sum); - return value; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index b1d8dc91d..1d4913769 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -134,7 +134,7 @@ public class CombineVariants extends RodWalker implements Tree protected VariantContextWriter vcfWriter = null; @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false) - public VariantContextUtils.GenotypeMergeType genotypeMergeOption = VariantContextUtils.GenotypeMergeType.PRIORITIZE; + public VariantContextUtils.GenotypeMergeType genotypeMergeOption = null; @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; @@ -200,13 +200,13 @@ public class CombineVariants extends RodWalker implements Tree } else logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); - if ( PRIORITY_STRING == null ) { + validateAnnotateUnionArguments(); + if ( PRIORITY_STRING == null && genotypeMergeOption == null) { genotypeMergeOption = VariantContextUtils.GenotypeMergeType.UNSORTED; //PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); Deleted by Ami (7/10/12) - logger.info("Priority string not provided, using arbitrary genotyping order: " + PRIORITY_STRING); + logger.info("Priority string not provided, using arbitrary genotyping order: "+priority); } - validateAnnotateUnionArguments(); samples = sitesOnlyVCF ? Collections.emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption); if ( SET_KEY.toLowerCase().equals("null") ) @@ -228,16 +228,15 @@ public class CombineVariants extends RodWalker implements Tree if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); - if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE ) + if ( PRIORITY_STRING != null || genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE ){ priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); - else - priority = new ArrayList(rodNames); + if ( rodNames.size() != priority.size() ) + throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); - if ( rodNames.size() != priority.size() ) - throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); + if ( ! rodNames.containsAll(priority) ) + throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING); + } - if ( ! rodNames.containsAll(priority) ) - throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { diff --git a/public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java similarity index 94% rename from public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java rename to public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java index a758df431..ba863310d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java @@ -8,6 +8,8 @@ import java.util.Collection; import java.util.Iterator; /** + * THIS IMPLEMENTATION IS BROKEN AND WILL BE REMOVED ONCE THE DOWNSAMPLING ENGINE FORK COLLAPSES + * * Randomly downsample from a stream of elements. This algorithm is a direct, * naive implementation of reservoir downsampling as described in "Random Downsampling * with a Reservoir" (Vitter 1985). At time of writing, this paper is located here: @@ -16,7 +18,7 @@ import java.util.Iterator; * @author mhanna * @version 0.1 */ -public class ReservoirDownsampler { +public class LegacyReservoirDownsampler { /** * The reservoir of elements tracked by this downsampler. */ @@ -31,7 +33,7 @@ public class ReservoirDownsampler { * Create a new downsampler with the given source iterator and given comparator. * @param maxElements What is the maximum number of reads that can be returned in any call of this */ - public ReservoirDownsampler(final int maxElements) { + public LegacyReservoirDownsampler(final int maxElements) { if(maxElements < 0) throw new ReviewedStingException("Unable to work with an negative size collection of elements"); this.reservoir = new ArrayList(maxElements); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 61c1c51b4..8186f69cf 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -32,11 +32,10 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; +import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.walkers.qc.CountLoci; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -85,7 +84,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); - LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + LegacyLocusIteratorByState locusIteratorByState = new LegacyLocusIteratorByState(readIterator,readProperties,genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); while(locusIteratorByState.hasNext()) { locusIteratorByState.next().getLocation(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java similarity index 97% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java index 0807f36dc..73664141f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java @@ -45,10 +45,10 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -public class ExperimentalReadShardBalancerUnitTest extends BaseTest { +public class ReadShardBalancerUnitTest extends BaseTest { /** - * Tests to ensure that ExperimentalReadShardBalancer works as expected and does not place shard boundaries + * Tests to ensure that ReadShardBalancer works as expected and does not place shard boundaries * at inappropriate places, such as within an alignment start position */ private static class ExperimentalReadShardBalancerTest extends TestDataProvider { @@ -74,7 +74,7 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest { this.stackSize = stackSize; this.numUnmappedReads = numUnmappedReads; - this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true); + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, false); this.expectedReadCount = Math.min(stackSize, downsamplingTargetCoverage) * numStacksPerContig * numContigs + numUnmappedReads; setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", @@ -96,7 +96,7 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest { new ArrayList(), false); - Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ExperimentalReadShardBalancer()); + Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); SAMRecord readAtEndOfLastShard = null; int totalReadsSeen = 0; diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java similarity index 50% rename from public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java index a49a602c6..44d182661 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java @@ -7,10 +7,8 @@ import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -21,14 +19,17 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; /** - * testing of the experimental version of LocusIteratorByState + * testing of the LEGACY version of LocusIteratorByState */ -public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { +public class LegacyLocusIteratorByStateUnitTest extends BaseTest { private static SAMFileHeader header; - private LocusIteratorByStateExperimental li; + private LegacyLocusIteratorByState li; private GenomeLocParser genomeLocParser; @BeforeClass @@ -37,8 +38,8 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } - private LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { - return new LocusIteratorByStateExperimental(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups()); + private LegacyLocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { + return new LegacyLocusIteratorByState(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); } @Test @@ -328,184 +329,14 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { // End comprehensive LIBS/PileupElement tests // //////////////////////////////////////////////// - - /////////////////////////////////////// - // Read State Manager Tests // - /////////////////////////////////////// - - private class PerSampleReadStateManagerTest extends TestDataProvider { - private List readCountsPerAlignmentStart; - private List reads; - private List> recordStatesByAlignmentStart; - private int removalInterval; - - public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { - super(PerSampleReadStateManagerTest.class); - - this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; - this.removalInterval = removalInterval; - - reads = new ArrayList(); - recordStatesByAlignmentStart = new ArrayList>(); - - setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", - getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); - } - - public void run() { - LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList(), createTestReadProperties()); - LocusIteratorByStateExperimental.ReadStateManager readStateManager = - libs.new ReadStateManager(new ArrayList().iterator()); - LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = - readStateManager.new PerSampleReadStateManager(); - - makeReads(); - - for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { - perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); - } - - // read state manager should have the right number of reads - Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); - - Iterator originalReadsIterator = reads.iterator(); - Iterator recordStateIterator = perSampleReadStateManager.iterator(); - int recordStateCount = 0; - int numReadStatesRemoved = 0; - - // Do a first-pass validation of the record state iteration by making sure we get back everything we - // put in, in the same order, doing any requested removals of read states along the way - while ( recordStateIterator.hasNext() ) { - LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); - recordStateCount++; - SAMRecord readFromPerSampleReadStateManager = readState.getRead(); - - Assert.assertTrue(originalReadsIterator.hasNext()); - SAMRecord originalRead = originalReadsIterator.next(); - - // The read we get back should be literally the same read in memory as we put in - Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); - - // If requested, remove a read state every removalInterval states - if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { - recordStateIterator.remove(); - numReadStatesRemoved++; - } - } - - Assert.assertFalse(originalReadsIterator.hasNext()); - - // If we removed any read states, do a second pass through the read states to make sure the right - // states were removed - if ( numReadStatesRemoved > 0 ) { - Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); - - originalReadsIterator = reads.iterator(); - recordStateIterator = perSampleReadStateManager.iterator(); - int readCount = 0; - int readStateCount = 0; - - // Match record states with the reads that should remain after removal - while ( recordStateIterator.hasNext() ) { - LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); - readStateCount++; - SAMRecord readFromPerSampleReadStateManager = readState.getRead(); - - Assert.assertTrue(originalReadsIterator.hasNext()); - - SAMRecord originalRead = originalReadsIterator.next(); - readCount++; - - if ( readCount % removalInterval == 0 ) { - originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded - readCount++; - } - - // The read we get back should be literally the same read in memory as we put in (after accounting for removals) - Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); - } - - Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); - } - - // Allow memory used by this test to be reclaimed - readCountsPerAlignmentStart = null; - reads = null; - recordStatesByAlignmentStart = null; - } - - private void makeReads() { - int alignmentStart = 1; - - for ( int readsThisStack : readCountsPerAlignmentStart ) { - ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); - ArrayList stackRecordStates = new ArrayList(); - - for ( SAMRecord read : stackReads ) { - stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read)); - } - - reads.addAll(stackReads); - recordStatesByAlignmentStart.add(stackRecordStates); - } - } - } - - @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") - public Object[][] createPerSampleReadStateManagerTests() { - for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), - Arrays.asList(2), - Arrays.asList(10), - Arrays.asList(1, 1), - Arrays.asList(2, 2), - Arrays.asList(10, 10), - Arrays.asList(1, 10), - Arrays.asList(10, 1), - Arrays.asList(1, 1, 1), - Arrays.asList(2, 2, 2), - Arrays.asList(10, 10, 10), - Arrays.asList(1, 1, 1, 1, 1, 1), - Arrays.asList(10, 10, 10, 10, 10, 10), - Arrays.asList(1, 2, 10, 1, 2, 10) - ) ) { - - for ( int removalInterval : Arrays.asList(0, 2, 3) ) { - new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); - } - } - - return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); - } - - @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") - public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { - logger.warn("Running test: " + test); - - test.run(); - } - - /////////////////////////////////////// - // End Read State Manager Tests // - /////////////////////////////////////// - - - - /////////////////////////////////////// - // Helper methods / classes // - /////////////////////////////////////// - private static ReadProperties createTestReadProperties() { - return createTestReadProperties(null); - } - - private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { return new ReadProperties( Collections.emptyList(), new SAMFileHeader(), SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.STRICT, - downsamplingMethod, + null, new ValidationExclusion(), Collections.emptyList(), Collections.emptyList(), @@ -513,136 +344,137 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { (byte) -1 ); } +} - private static class FakeCloseableIterator implements CloseableIterator { - Iterator iterator; +class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; - public FakeCloseableIterator(Iterator it) { - iterator = it; - } - - @Override - public void close() {} - - @Override - public boolean hasNext() { - return iterator.hasNext(); - } - - @Override - public T next() { - return iterator.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Don't remove!"); - } + public FakeCloseableIterator(Iterator it) { + iterator = it; } - private static final class LIBS_position { + @Override + public void close() {} - SAMRecord read; + @Override + public boolean hasNext() { + return iterator.hasNext(); + } - final int numOperators; - int currentOperatorIndex = 0; - int currentPositionOnOperator = 0; - int currentReadOffset = 0; + @Override + public T next() { + return iterator.next(); + } - boolean isBeforeDeletionStart = false; - boolean isBeforeDeletedBase = false; - boolean isAfterDeletionEnd = false; - boolean isAfterDeletedBase = false; - boolean isBeforeInsertion = false; - boolean isAfterInsertion = false; - boolean isNextToSoftClip = false; - - boolean sawMop = false; - - public LIBS_position(final SAMRecord read) { - this.read = read; - numOperators = read.getCigar().numCigarElements(); - } - - public int getCurrentReadOffset() { - return Math.max(0, currentReadOffset - 1); - } - - /** - * Steps forward on the genome. Returns false when done reading the read, true otherwise. - */ - public boolean stepForwardOnGenome() { - if ( currentOperatorIndex == numOperators ) - return false; - - CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); - if ( currentPositionOnOperator >= curElement.getLength() ) { - if ( ++currentOperatorIndex == numOperators ) - return false; - - curElement = read.getCigar().getCigarElement(currentOperatorIndex); - currentPositionOnOperator = 0; - } - - switch ( curElement.getOperator() ) { - case I: // insertion w.r.t. the reference - if ( !sawMop ) - break; - case S: // soft clip - currentReadOffset += curElement.getLength(); - case H: // hard clip - case P: // padding - currentOperatorIndex++; - return stepForwardOnGenome(); - - case D: // deletion w.r.t. the reference - case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - currentPositionOnOperator++; - break; - - case M: - case EQ: - case X: - sawMop = true; - currentReadOffset++; - currentPositionOnOperator++; - break; - default: - throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); - } - - final boolean isFirstOp = currentOperatorIndex == 0; - final boolean isLastOp = currentOperatorIndex == numOperators - 1; - final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; - final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); - - isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); - isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); - isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); - isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); - isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) - || (!sawMop && curElement.getOperator() == CigarOperator.I); - isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); - isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) - || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); - - return true; - } - - private static boolean isBeforeOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isLastOp, - final boolean isLastBaseOfOp) { - return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; - } - - private static boolean isAfterOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isFirstOp, - final boolean isFirstBaseOfOp) { - return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; - } + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } +} + + +final class LIBS_position { + + SAMRecord read; + + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; + + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + + boolean sawMop = false; + + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } + + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } + + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. + */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) + return false; + + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; + + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; + break; + + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } + + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || (!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; + } + + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } + + private static boolean isAfterOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 83913fa76..4e4126ad4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -7,8 +7,10 @@ import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -19,13 +21,10 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.Arrays; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; +import java.util.*; /** - * testing of the LocusIteratorByState + * testing of the new (non-legacy) version of LocusIteratorByState */ public class LocusIteratorByStateUnitTest extends BaseTest { private static SAMFileHeader header; @@ -329,14 +328,184 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // End comprehensive LIBS/PileupElement tests // //////////////////////////////////////////////// + + /////////////////////////////////////// + // Read State Manager Tests // + /////////////////////////////////////// + + private class PerSampleReadStateManagerTest extends TestDataProvider { + private List readCountsPerAlignmentStart; + private List reads; + private List> recordStatesByAlignmentStart; + private int removalInterval; + + public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { + super(PerSampleReadStateManagerTest.class); + + this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; + this.removalInterval = removalInterval; + + reads = new ArrayList(); + recordStatesByAlignmentStart = new ArrayList>(); + + setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", + getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); + } + + public void run() { + LocusIteratorByState libs = makeLTBS(new ArrayList(), createTestReadProperties()); + LocusIteratorByState.ReadStateManager readStateManager = + libs.new ReadStateManager(new ArrayList().iterator()); + LocusIteratorByState.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = + readStateManager.new PerSampleReadStateManager(); + + makeReads(); + + for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { + perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + } + + // read state manager should have the right number of reads + Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); + + Iterator originalReadsIterator = reads.iterator(); + Iterator recordStateIterator = perSampleReadStateManager.iterator(); + int recordStateCount = 0; + int numReadStatesRemoved = 0; + + // Do a first-pass validation of the record state iteration by making sure we get back everything we + // put in, in the same order, doing any requested removals of read states along the way + while ( recordStateIterator.hasNext() ) { + LocusIteratorByState.SAMRecordState readState = recordStateIterator.next(); + recordStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + SAMRecord originalRead = originalReadsIterator.next(); + + // The read we get back should be literally the same read in memory as we put in + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + + // If requested, remove a read state every removalInterval states + if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { + recordStateIterator.remove(); + numReadStatesRemoved++; + } + } + + Assert.assertFalse(originalReadsIterator.hasNext()); + + // If we removed any read states, do a second pass through the read states to make sure the right + // states were removed + if ( numReadStatesRemoved > 0 ) { + Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); + + originalReadsIterator = reads.iterator(); + recordStateIterator = perSampleReadStateManager.iterator(); + int readCount = 0; + int readStateCount = 0; + + // Match record states with the reads that should remain after removal + while ( recordStateIterator.hasNext() ) { + LocusIteratorByState.SAMRecordState readState = recordStateIterator.next(); + readStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + + SAMRecord originalRead = originalReadsIterator.next(); + readCount++; + + if ( readCount % removalInterval == 0 ) { + originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded + readCount++; + } + + // The read we get back should be literally the same read in memory as we put in (after accounting for removals) + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + } + + Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); + } + + // Allow memory used by this test to be reclaimed + readCountsPerAlignmentStart = null; + reads = null; + recordStatesByAlignmentStart = null; + } + + private void makeReads() { + int alignmentStart = 1; + + for ( int readsThisStack : readCountsPerAlignmentStart ) { + ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); + ArrayList stackRecordStates = new ArrayList(); + + for ( SAMRecord read : stackReads ) { + stackRecordStates.add(new LocusIteratorByState.SAMRecordState(read)); + } + + reads.addAll(stackReads); + recordStatesByAlignmentStart.add(stackRecordStates); + } + } + } + + @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") + public Object[][] createPerSampleReadStateManagerTests() { + for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), + Arrays.asList(2), + Arrays.asList(10), + Arrays.asList(1, 1), + Arrays.asList(2, 2), + Arrays.asList(10, 10), + Arrays.asList(1, 10), + Arrays.asList(10, 1), + Arrays.asList(1, 1, 1), + Arrays.asList(2, 2, 2), + Arrays.asList(10, 10, 10), + Arrays.asList(1, 1, 1, 1, 1, 1), + Arrays.asList(10, 10, 10, 10, 10, 10), + Arrays.asList(1, 2, 10, 1, 2, 10) + ) ) { + + for ( int removalInterval : Arrays.asList(0, 2, 3) ) { + new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); + } + } + + return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); + } + + @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") + public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } + + /////////////////////////////////////// + // End Read State Manager Tests // + /////////////////////////////////////// + + + + /////////////////////////////////////// + // Helper methods / classes // + /////////////////////////////////////// + private static ReadProperties createTestReadProperties() { + return createTestReadProperties(null); + } + + private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { return new ReadProperties( Collections.emptyList(), new SAMFileHeader(), SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.STRICT, - null, + downsamplingMethod, new ValidationExclusion(), Collections.emptyList(), Collections.emptyList(), @@ -344,137 +513,136 @@ public class LocusIteratorByStateUnitTest extends BaseTest { (byte) -1 ); } -} -class FakeCloseableIterator implements CloseableIterator { - Iterator iterator; + private static class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; - public FakeCloseableIterator(Iterator it) { - iterator = it; + public FakeCloseableIterator(Iterator it) { + iterator = it; + } + + @Override + public void close() {} + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } } - @Override - public void close() {} + private static final class LIBS_position { - @Override - public boolean hasNext() { - return iterator.hasNext(); - } + SAMRecord read; - @Override - public T next() { - return iterator.next(); - } + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; - @Override - public void remove() { - throw new UnsupportedOperationException("Don't remove!"); - } -} + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + boolean sawMop = false; -final class LIBS_position { + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } - SAMRecord read; + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } - final int numOperators; - int currentOperatorIndex = 0; - int currentPositionOnOperator = 0; - int currentReadOffset = 0; - - boolean isBeforeDeletionStart = false; - boolean isBeforeDeletedBase = false; - boolean isAfterDeletionEnd = false; - boolean isAfterDeletedBase = false; - boolean isBeforeInsertion = false; - boolean isAfterInsertion = false; - boolean isNextToSoftClip = false; - - boolean sawMop = false; - - public LIBS_position(final SAMRecord read) { - this.read = read; - numOperators = read.getCigar().numCigarElements(); - } - - public int getCurrentReadOffset() { - return Math.max(0, currentReadOffset - 1); - } - - /** - * Steps forward on the genome. Returns false when done reading the read, true otherwise. - */ - public boolean stepForwardOnGenome() { - if ( currentOperatorIndex == numOperators ) - return false; - - CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); - if ( currentPositionOnOperator >= curElement.getLength() ) { - if ( ++currentOperatorIndex == numOperators ) + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. + */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) return false; - curElement = read.getCigar().getCigarElement(currentOperatorIndex); - currentPositionOnOperator = 0; - } + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; - switch ( curElement.getOperator() ) { - case I: // insertion w.r.t. the reference - if ( !sawMop ) + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; break; - case S: // soft clip - currentReadOffset += curElement.getLength(); - case H: // hard clip - case P: // padding - currentOperatorIndex++; - return stepForwardOnGenome(); - case D: // deletion w.r.t. the reference - case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - currentPositionOnOperator++; - break; + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } - case M: - case EQ: - case X: - sawMop = true; - currentReadOffset++; - currentPositionOnOperator++; - break; - default: - throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || (!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; } - final boolean isFirstOp = currentOperatorIndex == 0; - final boolean isLastOp = currentOperatorIndex == numOperators - 1; - final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; - final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } - isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); - isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); - isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); - isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); - isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) - || (!sawMop && curElement.getOperator() == CigarOperator.I); - isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); - isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) - || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); - - return true; - } - - private static boolean isBeforeOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isLastOp, - final boolean isLastBaseOfOp) { - return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; - } - - private static boolean isAfterOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isFirstOp, - final boolean isFirstBaseOfOp) { - return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + private static boolean isAfterOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index a65b0cb45..69907d485 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -109,12 +109,16 @@ public class TraverseActiveRegionsTest extends BaseTest { dictionary = reference.getSequenceDictionary(); genomeLocParser = new GenomeLocParser(dictionary); + // TODO: test shard boundaries + intervals = new ArrayList(); intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + intervals.add(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); @@ -126,6 +130,7 @@ public class TraverseActiveRegionsTest extends BaseTest { reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + reads.add(buildSAMRecord("end_of_chr1", "1", 249250600, 249250700)); reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); // required by LocusIteratorByState, and I prefer to list them in test case order above @@ -139,8 +144,6 @@ public class TraverseActiveRegionsTest extends BaseTest { List activeIntervals = getIsActiveIntervals(walker, intervals); // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call verifyEqualIntervals(intervals, activeIntervals); - - // TODO: more tests and edge cases } private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { @@ -171,8 +174,6 @@ public class TraverseActiveRegionsTest extends BaseTest { Collection activeRegions = getActiveRegions(walker, intervals).values(); verifyActiveRegionCoverage(intervals, activeRegions); - - // TODO: more tests and edge cases } private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { @@ -231,6 +232,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -245,6 +247,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -256,6 +259,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -267,6 +271,19 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -278,9 +295,8 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); - - // TODO: more tests and edge cases } @Test @@ -300,6 +316,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -314,6 +331,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -325,6 +343,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -336,6 +355,19 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -347,9 +379,8 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); - - // TODO: more tests and edge cases } @Test @@ -370,6 +401,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -384,6 +416,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -395,6 +428,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -406,6 +440,19 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -417,9 +464,13 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); + } - // TODO: more tests and edge cases + @Test + public void testUnmappedReads() { + // TODO } private void verifyReadNotPlaced(ActiveRegion region, String readName) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index bf1fc9e65..d9af2ea7e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; +import org.broadinstitute.sting.gatk.datasources.reads.LegacyReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; @@ -114,7 +114,7 @@ public class TraverseReadsUnitTest extends BaseTest { @Test public void testUnmappedReadCount() { SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser); - Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new LegacyReadShardBalancer()); countReadWalker.initialize(); Object accumulator = countReadWalker.reduceInit(); diff --git a/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java index 5b052454a..47096c73a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java @@ -23,14 +23,14 @@ public class LegacyReservoirDownsamplerUnitTest { @Test public void testEmptyIterator() { - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); Assert.assertTrue(downsampler.isEmpty(),"Downsampler is not empty but should be."); } @Test public void testOneElementWithPoolSizeOne() { List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -42,7 +42,7 @@ public class LegacyReservoirDownsamplerUnitTest { @Test public void testOneElementWithPoolSizeGreaterThanOne() { List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -58,7 +58,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -78,7 +78,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -99,7 +99,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(0); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(0); downsampler.addAll(reads); Assert.assertTrue(downsampler.isEmpty(),"Downsampler isn't empty but should be"); @@ -115,7 +115,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -128,7 +128,7 @@ public class LegacyReservoirDownsamplerUnitTest { public void testFillingAcrossLoci() { List reads = new ArrayList(); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be");