From a1d6461121c75ac4c58af86c32df70fbf9447eb0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 28 Nov 2012 11:35:41 -0500 Subject: [PATCH 1/5] Critical bugfix to AFCalcResult affecting UG/HC quality score emission thresholds As reported by Menachem Fromer: a critical bug in AFCalcResult: Specifically, the implementation: public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) { return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; } seems incorrect and should probably be: getLog10PosteriorOfAFEq0ForAllele(allele) <= log10minPNonRef The issue here is that the 30 represents a Phred-scaled probability of *error* and it's currently being compared to a log probability of *non-error*. Instead, we need to require that our probability of error be less than the error threshold. This bug has only a minor impact on the calls -- hardly any sites change -- which is good. But the inverted logic effects multi-allelic sites significantly. Basically you only hit this logic with multiple alleles, and in that case it'\s including extra alt alleles incorrectly, and throwing out good ones. Change was to create a new function that properly handles thresholds that are PhredScaled quality scores: /** * Same as #isPolymorphic but takes a phred-scaled quality score as input */ public boolean isPolymorphicPhredScaledQual(final Allele allele, final double minPNonRefPhredScaledQual) { if ( minPNonRefPhredScaledQual < 0 ) throw new IllegalArgumentException("phredScaledQual " + minPNonRefPhredScaledQual + " < 0 "); final double log10Threshold = Math.log10(QualityUtils.qualToProb(minPNonRefPhredScaledQual)); return isPolymorphic(allele, log10Threshold); } --- .../UnifiedGenotyperIntegrationTest.java | 8 +-- .../afcalc/AFCalcResultUnitTest.java | 56 +++++++++++++++++-- .../genotyper/UnifiedGenotyperEngine.java | 8 +-- .../genotyper/afcalc/AFCalcResult.java | 11 ++++ 4 files changed, 70 insertions(+), 13 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 9212d0e53..88f2ab3ea 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("94dc17d76d841f1d3a36160767ffa034")); + Arrays.asList("a373979d01c3a3fb20159235d27eb92c")); executeTest("test Multiple SNP alleles", spec); } @@ -197,7 +197,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "8a263fd0a94463ce1de9990f2b8ec841"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "5c75cecb523cac988beecd59186289ff"); } private void testOutputParameters(final String args, final String md5) { @@ -345,13 +345,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("f7d0d0aee603df25c1f0525bb8df189e")); + Arrays.asList("a5a81bf1b10be860a6a5272fb928e8eb")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("fc91d457a16b4ca994959c2b5f3f0352")); + Arrays.asList("ad52814cd6c45df424fc992699feead6")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index cbe2eb268..96e055e92 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -2,16 +2,14 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +import java.util.*; public class AFCalcResultUnitTest extends BaseTest { private static class MyTest { @@ -79,4 +77,54 @@ public class AFCalcResultUnitTest extends BaseTest { final double[] actualPosteriors = new double[]{result.getLog10PosteriorOfAFEq0(), result.getLog10PosteriorOfAFGT0()}; Assert.assertEquals(MathUtils.sumLog10(actualPosteriors), 1.0, 1e-3, "Posteriors don't sum to 1 with 1e-3 precision"); } + + @DataProvider(name = "TestIsPolymorphic") + public Object[][] makeTestIsPolymorphic() { + List tests = new ArrayList(); + + final List pValues = new LinkedList(); + for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999) ) + for ( final double espilon : Arrays.asList(-1e-5, 0.0, 1e-5) ) + pValues.add(p + espilon); + + for ( final double pNonRef : pValues ) { + for ( final double pThreshold : pValues ) { + final boolean shouldBePoly = pNonRef >= pThreshold; + if ( pNonRef != pThreshold) + // let's not deal with numerical instability + tests.add(new Object[]{ pNonRef, pThreshold, shouldBePoly }); + } + } + + return tests.toArray(new Object[][]{}); + } + + private AFCalcResult makePolymorphicTestData(final double pNonRef) { + return new AFCalcResult( + new int[]{0}, + 1, + alleles, + MathUtils.normalizeFromLog10(new double[]{1 - pNonRef, pNonRef}, true, false), + log10Even, + Collections.singletonMap(C, Math.log10(pNonRef))); + } + + @Test(enabled = true, dataProvider = "TestIsPolymorphic") + private void testIsPolymorphic(final double pNonRef, final double pThreshold, final boolean shouldBePoly) { + final AFCalcResult result = makePolymorphicTestData(pNonRef); + final boolean actualIsPoly = result.isPolymorphic(C, Math.log10(pThreshold)); + Assert.assertEquals(actualIsPoly, shouldBePoly, + "isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned " + + actualIsPoly + " but the expected result is " + shouldBePoly); + } + + @Test(enabled = true, dataProvider = "TestIsPolymorphic") + private void testIsPolymorphicQual(final double pNonRef, final double pThreshold, final boolean shouldBePoly) { + final AFCalcResult result = makePolymorphicTestData(pNonRef); + final double qual = QualityUtils.phredScaleCorrectRate(pThreshold); + final boolean actualIsPoly = result.isPolymorphicPhredScaledQual(C, qual); + Assert.assertEquals(actualIsPoly, shouldBePoly, + "isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned " + + actualIsPoly + " but the expected result is " + shouldBePoly); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 97254c478..f22187363 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -378,11 +378,9 @@ public class UnifiedGenotyperEngine { if ( alternateAllele.isReference() ) continue; - // we are non-ref if the probability of being non-ref > the emit confidence. - // the emit confidence is phred-scaled, say 30 => 10^-3. - // the posterior AF > 0 is log10: -5 => 10^-5 - // we are non-ref if 10^-5 < 10^-3 => -5 < -3 - final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0); + // Compute if the site is considered polymorphic with sufficient confidence relative to our + // phred-scaled emission QUAL + final boolean isNonRef = AFresult.isPolymorphicPhredScaledQual(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); // if the most likely AC is not 0, then this is a good alternate allele to use if ( isNonRef ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index a65772444..dbb0e8cdd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -234,10 +235,20 @@ public class AFCalcResult { * * @return true if there's enough confidence (relative to log10minPNonRef) to reject AF == 0 */ + @Requires("MathUtils.goodLog10Probability(log10minPNonRef)") public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) { return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; } + /** + * Same as #isPolymorphic but takes a phred-scaled quality score as input + */ + public boolean isPolymorphicPhredScaledQual(final Allele allele, final double minPNonRefPhredScaledQual) { + if ( minPNonRefPhredScaledQual < 0 ) throw new IllegalArgumentException("phredScaledQual " + minPNonRefPhredScaledQual + " < 0 "); + final double log10Threshold = Math.log10(QualityUtils.qualToProb(minPNonRefPhredScaledQual)); + return isPolymorphic(allele, log10Threshold); + } + /** * Are any of the alleles polymorphic w.r.t. #isPolymorphic? * From 6030605242386c3f04313b797b7d9f6719c67475 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 28 Nov 2012 13:26:31 -0500 Subject: [PATCH 2/5] Added quick check for creation of bad BAQ values associated with badly encoded base qualities; hopefully this can help us debug the non-reproducible issue seen by many users. --- .../broadinstitute/sting/utils/QualityUtils.java | 2 +- .../src/org/broadinstitute/sting/utils/baq/BAQ.java | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 1242e5b00..848beccb8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,7 +9,7 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { - public final static byte MAX_RECALIBRATED_Q_SCORE = 93; + public final static byte MAX_RECALIBRATED_Q_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index cf4d699ee..9ad1bf773 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -406,10 +406,15 @@ public class BAQ { // so BQi = Qi - BAQi + 64 byte[] bqTag = new byte[baq.length]; for ( int i = 0; i < bqTag.length; i++) { - int bq = (int)read.getBaseQualities()[i] + 64; - int baq_i = (int)baq[i]; - int tag = bq - baq_i; - if ( tag < 0 ) throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read); + final int bq = (int)read.getBaseQualities()[i] + 64; + final int baq_i = (int)baq[i]; + final int tag = bq - baq_i; + // problem with the calculation of the correction factor; this is our problem + if ( tag < 0 ) + throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read); + // the original quality is too high, almost certainly due to using the wrong encoding in the BAM file + if ( tag > Byte.MAX_VALUE ) + throw new UserException.MalformedBAM(read, "we encountered an extremely high quality score (" + (bq - 64) + ") with BAQ correction factor of " + baq_i + "; the BAM file appears to be using the wrong encoding for quality scores"); bqTag[i] = (byte)tag; } return new String(bqTag); From f0395b457ac4a2c7e40a573f6a46d8e0065b33d4 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 28 Nov 2012 13:56:32 -0500 Subject: [PATCH 3/5] Adding the work-in-progress, experimental RepeatLengthCovariate to the BQSR so Chris can continue the development. --- .../covariates/RepeatLengthCovariate.java | 64 +++++++++++++++++++ .../variantcontext/VariantContextUtils.java | 2 +- 2 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java new file mode 100644 index 000000000..d4e4ab65e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java @@ -0,0 +1,64 @@ +package org.broadinstitute.sting.utils.recalibration.covariates; + +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.TandemRepeat; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 11/3/12 + */ + +public class RepeatLengthCovariate implements ExperimentalCovariate { + final int MAX_REPEAT_LENGTH = 20; + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) {} + + @Override + public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { + byte[] readBytes = read.getReadBases(); + for (int i = 0; i < readBytes.length; i++) { + int maxRL = 0; + for (int str = 1; str <= 8; str++) { + if (i + str <= readBytes.length) { + maxRL = Math.max(maxRL, VariantContextUtils.findNumberofRepetitions( + Arrays.copyOfRange(readBytes,i,i + str), + Arrays.copyOfRange(readBytes,i,readBytes.length) + )); + } + } + if(maxRL > MAX_REPEAT_LENGTH) { maxRL = MAX_REPEAT_LENGTH; } + values.addCovariate(maxRL, maxRL, maxRL, i); + } + } + + // Used to get the covariate's value from input csv file during on-the-fly recalibration + @Override + public final Object getValue(final String str) { + return Byte.parseByte(str); + } + + @Override + public String formatKey(final int key) { + return String.format("%d", key); + } + + @Override + public int keyFromValue(final Object value) { + return (value instanceof String) ? Integer.parseInt((String) value) : (Integer) value; + } + + @Override + public int maximumKeyValue() { + return MAX_REPEAT_LENGTH + 1; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 1f1867f75..b3e3cf8df 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -1267,7 +1267,7 @@ public class VariantContextUtils { * @param testString String to test * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's */ - protected static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString) { + public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString) { int numRepeats = 0; for (int start = 0; start < testString.length; start += repeatUnit.length) { int end = start + repeatUnit.length; From 198923b597e3635ec5f71aaa192ac4bcaca36ddd Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 26 Nov 2012 15:01:13 -0500 Subject: [PATCH 4/5] Add ActiveRegionReadState handling --- .../haplotypecaller/HaplotypeCaller.java | 11 +- .../traversals/TraverseActiveRegions.java | 16 +- .../gatk/walkers/ActiveRegionWalker.java | 20 +- .../activeregion/ActiveRegionReadState.java | 16 ++ .../traversals/TraverseActiveRegionsTest.java | 207 +++++++++++++++--- 5 files changed, 230 insertions(+), 40 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 24b3309f1..d194e2620 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -45,6 +45,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -295,9 +296,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Override public boolean includeReadsWithDeletionAtLoci() { return true; } - // enable non primary reads in the active region + // enable non primary and extended reads in the active region @Override - public boolean wantsNonPrimaryReads() { return true; } + public EnumSet desiredReadStates() { + return EnumSet.of( + ActiveRegionReadState.PRIMARY, + ActiveRegionReadState.NONPRIMARY, + ActiveRegionReadState.EXTENDED + ); + } @Override @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 3f20db0af..06fc01232 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -258,13 +258,23 @@ public class TraverseActiveRegions extends TraversalEngine extends Walker desiredReadStates() { + return EnumSet.of(ActiveRegionReadState.PRIMARY); + } + + public final boolean wantsNonPrimaryReads() { + return desiredReadStates().contains(ActiveRegionReadState.NONPRIMARY); + } + + public boolean wantsExtendedReads() { + return desiredReadStates().contains(ActiveRegionReadState.EXTENDED); + } + + public boolean wantsUnmappedReads() { + return desiredReadStates().contains(ActiveRegionReadState.UNMAPPED); } // Determine probability of active status over the AlignmentContext diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java new file mode 100644 index 000000000..00e491eb0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java @@ -0,0 +1,16 @@ +package org.broadinstitute.sting.utils.activeregion; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 11/26/12 + * Time: 2:35 PM + * + * Describes how a read relates to an assigned ActiveRegion + */ +public enum ActiveRegionReadState { + PRIMARY, // This is the read's primary region + NONPRIMARY, // This region overlaps the read, but it is not primary + EXTENDED, // This region would overlap the read if it were extended + UNMAPPED // This read is not mapped +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index 66504da11..b70085eff 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.traversals; import com.google.java.contract.PreconditionError; import net.sf.samtools.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -46,6 +47,8 @@ public class TraverseActiveRegionsTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; + private EnumSet states = super.desiredReadStates(); + protected List isActiveCalls = new ArrayList(); protected Map mappedActiveRegions = new HashMap(); @@ -57,6 +60,16 @@ public class TraverseActiveRegionsTest extends BaseTest { this.prob = constProb; } + public DummyActiveRegionWalker(EnumSet wantStates) { + this.prob = 1.0; + this.states = wantStates; + } + + @Override + public EnumSet desiredReadStates() { + return states; + } + @Override public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { isActiveCalls.add(ref.getLocus()); @@ -202,12 +215,158 @@ public class TraverseActiveRegionsTest extends BaseTest { } @Test - public void testReadMapping() { + public void testPrimaryReadMapping() { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); // Contract: Each read has the Primary state in a single region (or none) // This is the region of maximum overlap for the read (earlier if tied) + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // extended_only: Extended in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + + getRead(region, "simple"); + getRead(region, "overlap_equal"); + getRead(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + getRead(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail getRead(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + getRead(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "simple20"); + + // TODO: more tests and edge cases + } + + @Test + public void testNonPrimaryReadMapping() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker( + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // Contract: Each read has the Non-Primary state in all other regions it overlaps + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // extended_only: Extended in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + + getRead(region, "simple"); + getRead(region, "overlap_equal"); + getRead(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail getRead(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + getRead(region, "boundary_equal"); + getRead(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail getRead(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + getRead(region, "boundary_equal"); + getRead(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "simple20"); + + // TODO: more tests and edge cases + } + + @Test + public void testExtendedReadMapping() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker( + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED)); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + // Contract: Each read has the Non-Primary state in all other regions it overlaps // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended @@ -226,13 +385,13 @@ public class TraverseActiveRegionsTest extends BaseTest { region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadPrimary(region, "simple"); - verifyReadPrimary(region, "overlap_equal"); - verifyReadPrimary(region, "overlap_unequal"); + getRead(region, "simple"); + getRead(region, "overlap_equal"); + getRead(region, "overlap_unequal"); verifyReadNotPlaced(region, "boundary_equal"); verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_only"); - // TODO: fail verifyReadNonPrimary(region, "extended_and_np"); + // TODO: fail getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -241,10 +400,10 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "simple"); verifyReadNotPlaced(region, "overlap_equal"); verifyReadNotPlaced(region, "overlap_unequal"); - // TODO: fail verifyReadNonPrimary(region, "boundary_equal"); - verifyReadPrimary(region, "boundary_unequal"); + getRead(region, "boundary_equal"); + getRead(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_only"); - // TODO: fail verifyReadPrimary(region, "extended_and_np"); + // TODO: fail getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -253,10 +412,10 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "simple"); verifyReadNotPlaced(region, "overlap_equal"); verifyReadNotPlaced(region, "overlap_unequal"); - verifyReadPrimary(region, "boundary_equal"); - // TODO: fail verifyReadNonPrimary(region, "boundary_unequal"); - // TODO: fail verifyReadExtended(region, "extended_only"); - // TODO: fail verifyReadExtended(region, "extended_and_np"); + getRead(region, "boundary_equal"); + getRead(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -267,33 +426,19 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "overlap_unequal"); verifyReadNotPlaced(region, "boundary_equal"); verifyReadNotPlaced(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); - verifyReadNotPlaced(region, "extended_and_np"); + // TODO: fail getRead(region, "extended_only"); + // TODO: fail getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); - verifyReadPrimary(region, "simple20"); + getRead(region, "simple20"); // TODO: more tests and edge cases } - private void verifyReadPrimary(ActiveRegion region, String readName) { - SAMRecord read = getRead(region, readName); - Assert.assertFalse(read.getNotPrimaryAlignmentFlag(), "Read " + read + " not primary in active region " + region); - } - - private void verifyReadNonPrimary(ActiveRegion region, String readName) { - SAMRecord read = getRead(region, readName); - Assert.assertTrue(read.getNotPrimaryAlignmentFlag(), "Read " + read + " primary in active region " + region); - } - - private void verifyReadExtended(ActiveRegion region, String readName) { - Assert.fail("The Extended read state has not been implemented"); - } - private void verifyReadNotPlaced(ActiveRegion region, String readName) { for (SAMRecord read : region.getReads()) { if (read.getReadName().equals(readName)) Assert.fail("Read " + readName + " found in active region " + region); - } + } } private SAMRecord getRead(ActiveRegion region, String readName) { @@ -302,7 +447,7 @@ public class TraverseActiveRegionsTest extends BaseTest { return read; } - Assert.fail("Read " + readName + " not found in active region " + region); + Assert.fail("Read " + readName + " not assigned to active region " + region); return null; } From c76c808268596f12d81ca247c826653f4ffa2f56 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 28 Nov 2012 11:09:12 -0500 Subject: [PATCH 5/5] Reads are required to be sorted - Remove the extended_only case because it's outside intervals --- .../traversals/TraverseActiveRegionsTest.java | 42 +++++++------------ 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index b70085eff..a65b0cb45 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -25,6 +25,7 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -100,7 +101,7 @@ public class TraverseActiveRegionsTest extends BaseTest { private GenomeLocParser genomeLocParser; private List intervals; - private List reads; + private List reads; @BeforeClass private void init() throws FileNotFoundException { @@ -117,16 +118,18 @@ public class TraverseActiveRegionsTest extends BaseTest { intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); - reads = new ArrayList(); + reads = new ArrayList(); reads.add(buildSAMRecord("simple", "1", 100, 200)); reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); - reads.add(buildSAMRecord("extended_only", "1", 3000, 3100)); reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); + + // required by LocusIteratorByState, and I prefer to list them in test case order above + ReadUtils.sortReadsByCoordinate(reads); } @Test @@ -226,7 +229,6 @@ public class TraverseActiveRegionsTest extends BaseTest { // overlap_unequal: Primary in 1:1-999 // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // extended_only: Extended in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none // simple20: Primary in 20:10000-10100 @@ -241,7 +243,6 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "overlap_unequal"); verifyReadNotPlaced(region, "boundary_equal"); verifyReadNotPlaced(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -253,8 +254,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "overlap_unequal"); verifyReadNotPlaced(region, "boundary_equal"); getRead(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); - // TODO: fail getRead(region, "extended_and_np"); + getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -265,7 +265,6 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "overlap_unequal"); getRead(region, "boundary_equal"); verifyReadNotPlaced(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -277,7 +276,6 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "overlap_unequal"); verifyReadNotPlaced(region, "boundary_equal"); verifyReadNotPlaced(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); getRead(region, "simple20"); @@ -300,7 +298,6 @@ public class TraverseActiveRegionsTest extends BaseTest { // overlap_unequal: Primary in 1:1-999 // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // extended_only: Extended in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none // simple20: Primary in 20:10000-10100 @@ -315,8 +312,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "overlap_unequal"); verifyReadNotPlaced(region, "boundary_equal"); verifyReadNotPlaced(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); - // TODO: fail getRead(region, "extended_and_np"); + getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -327,8 +323,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "overlap_unequal"); getRead(region, "boundary_equal"); getRead(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); - // TODO: fail getRead(region, "extended_and_np"); + getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -339,7 +334,6 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "overlap_unequal"); getRead(region, "boundary_equal"); getRead(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -351,7 +345,6 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "overlap_unequal"); verifyReadNotPlaced(region, "boundary_equal"); verifyReadNotPlaced(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); getRead(region, "simple20"); @@ -375,7 +368,6 @@ public class TraverseActiveRegionsTest extends BaseTest { // overlap_unequal: Primary in 1:1-999 // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // extended_only: Extended in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none // simple20: Primary in 20:10000-10100 @@ -390,8 +382,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "overlap_unequal"); verifyReadNotPlaced(region, "boundary_equal"); verifyReadNotPlaced(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); - // TODO: fail getRead(region, "extended_and_np"); + getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -402,8 +393,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "overlap_unequal"); getRead(region, "boundary_equal"); getRead(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); - // TODO: fail getRead(region, "extended_and_np"); + getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -414,8 +404,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "overlap_unequal"); getRead(region, "boundary_equal"); getRead(region, "boundary_unequal"); - verifyReadNotPlaced(region, "extended_only"); - verifyReadNotPlaced(region, "extended_and_np"); + getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); verifyReadNotPlaced(region, "simple20"); @@ -426,8 +415,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "overlap_unequal"); verifyReadNotPlaced(region, "boundary_equal"); verifyReadNotPlaced(region, "boundary_unequal"); - // TODO: fail getRead(region, "extended_only"); - // TODO: fail getRead(region, "extended_and_np"); + verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); getRead(region, "simple20"); @@ -438,7 +426,7 @@ public class TraverseActiveRegionsTest extends BaseTest { for (SAMRecord read : region.getReads()) { if (read.getReadName().equals(readName)) Assert.fail("Read " + readName + " found in active region " + region); - } + } } private SAMRecord getRead(ActiveRegion region, String readName) { @@ -520,7 +508,7 @@ public class TraverseActiveRegionsTest extends BaseTest { engine.setGenomeLocParser(genomeLocParser); t.initialize(engine); - StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(reads); + StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(new ArrayList(reads)); Shard shard = new MockLocusShard(genomeLocParser, intervals); List providers = new ArrayList();