From 17eb7b49fe0fc4e90ee774b6d3f7afa4ae6179bc Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 12 Aug 2013 13:08:55 -0400 Subject: [PATCH] Adding ability to use Ryan's PCR error modeling to the Haplotype Caller. There is now a command-line option to set the model to use in the HC. Incorporated Ryan's current (unmerged) branch in for most of these changes. Because small changes to the math can have drastic effects, I decided not to let users tweak the calculations themselves. Instead they can select either NONE, CONSERVATIVE (the default), or AGGRESSIVE. Note that any base insertion/deletion qualities from BQSR are still used. Also, note that the repeat unit x repeat length approach gave very poor results against the KB, so it is not included as an option here. --- .../haplotypecaller/HaplotypeCaller.java | 10 +- .../LikelihoodCalculationEngine.java | 103 +++++++++++++----- .../covariates/RepeatCovariate.java | 7 +- ...lexAndSymbolicVariantsIntegrationTest.java | 6 +- .../HaplotypeCallerGVCFIntegrationTest.java | 4 +- .../HaplotypeCallerIntegrationTest.java | 42 +++++-- ...aplotypeCallerParallelIntegrationTest.java | 2 +- .../LikelihoodCalculationEngineUnitTest.java | 48 ++++++++ 8 files changed, 179 insertions(+), 43 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 3d7ea438e..0b95ed07e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -472,6 +472,14 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false) protected int PADDING_AROUND_SNPS_FOR_CALLING = 20; + /** + * Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base + * insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator). + */ + @Advanced + @Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false) + public LikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE; + // ----------------------------------------------------------------------------------------------- // done with Haplotype caller parameters // ----------------------------------------------------------------------------------------------- @@ -624,7 +632,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } // create our likelihood calculation engine - likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga ); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 8bbe23afc..e6e76ba90 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -63,6 +63,8 @@ import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM; import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM; import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; @@ -76,6 +78,8 @@ import java.util.*; public class LikelihoodCalculationEngine { private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class); + private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual + private final byte constantGCP; private final double log10globalReadMismappingRate; private final boolean DEBUG; @@ -104,6 +108,17 @@ public class LikelihoodCalculationEngine { private final static String LIKELIHOODS_FILENAME = "likelihoods.txt"; private final PrintStream likelihoodsStream; + public enum PCR_ERROR_MODEL { + /** no specialized PCR error model will be applied; if base insertion/deletion qualities are present they will be used */ + NONE, + /** a more aggressive model will be applied that sacrifices true positives in order to remove more false positives */ + AGGRESSIVE, + /** a less aggressive model will be applied that tries to maintain a high true positive rate at the expense of allowing more false positives */ + CONSERVATIVE + } + + private final PCR_ERROR_MODEL pcrErrorModel; + /** * The expected rate of random sequencing errors for a read originating from its true haplotype. * @@ -127,12 +142,15 @@ public class LikelihoodCalculationEngine { * assigned a likelihood of -13. * @param noFpga disable FPGA acceleration */ - public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga ) { + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) { this.hmmType = hmmType; this.constantGCP = constantGCP; this.DEBUG = debug; this.log10globalReadMismappingRate = log10globalReadMismappingRate; this.noFpga = noFpga; + this.pcrErrorModel = pcrErrorModel; + + initializePCRErrorModel(); if ( WRITE_LIKELIHOODS_TO_FILE ) { try { @@ -145,20 +163,10 @@ public class LikelihoodCalculationEngine { } } - public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate ) { - this(constantGCP, debug, hmmType, log10globalReadMismappingRate, false); - } - - public LikelihoodCalculationEngine() { - this((byte)10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, false); - } - public void close() { if ( likelihoodsStream != null ) likelihoodsStream.close(); } - - /** * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate * @@ -196,11 +204,7 @@ public class LikelihoodCalculationEngine { // evaluate the likelihood of the reads given those haplotypes final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); - final List removedReads = map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); -// logger.info("Removed " + removedReads.size() + " reads because of bad likelihoods from sample " + sampleEntry.getKey()); -// for ( final GATKSAMRecord read : removedReads ) -// logger.info("\tRemoved " + read.getReadName()); - + map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); stratifiedReadMap.put(sampleEntry.getKey(), map); } @@ -222,22 +226,27 @@ public class LikelihoodCalculationEngine { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); for( final GATKSAMRecord read : reads ) { + + final byte[] readBases = read.getReadBases(); final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? + // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read final byte[] readQuals = read.getBaseQualities().clone(); - final byte[] readInsQuals = read.getBaseInsertionQualities(); - final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities().clone(); + final byte[] readDelQuals = read.getBaseDeletionQualities().clone(); + + applyPCRErrorModel(readBases, readInsQuals, readDelQuals); + for( int kkk = 0; kkk < readQuals.length; kkk++ ) { readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG - //readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated - //readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated - // TODO -- why is Q18 hard-coded here??? - readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); + readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); + readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); + readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); } if ( batchPairHMM != null ) { - batchPairHMM.batchAdd(haplotypes, read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP); + batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); batchedReads.add(read); continue; } @@ -251,12 +260,12 @@ public class LikelihoodCalculationEngine { final Haplotype haplotype = haplotypes.get(jjj); final boolean isFirstHaplotype = jjj == 0; final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), - read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype); + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype); if ( WRITE_LIKELIHOODS_TO_FILE ) { likelihoodsStream.printf("%s %s %s %s %s %s %f%n", haplotype.getBaseString(), - new String(read.getReadBases()), + new String(readBases), SAMUtils.phredToFastq(readQuals), SAMUtils.phredToFastq(readInsQuals), SAMUtils.phredToFastq(readDelQuals), @@ -524,4 +533,48 @@ public class LikelihoodCalculationEngine { } throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); } + + // -------------------------------------------------------------------------------- + // + // Experimental attempts at PCR error rate modeling + // + // -------------------------------------------------------------------------------- + + protected static final int MAX_STR_UNIT_LENGTH = 8; + protected static final int MAX_REPEAT_LENGTH = 20; + protected static final int MIN_ADJUSTED_QSCORE = 10; + protected static final double INITIAL_QSCORE = 40.0; + + private byte[] pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1]; + private final RepeatCovariate repeatCovariate = new RepeatLengthCovariate(); + + private void initializePCRErrorModel() { + if ( pcrErrorModel == PCR_ERROR_MODEL.NONE ) + return; + + repeatCovariate.initialize(MAX_STR_UNIT_LENGTH, MAX_REPEAT_LENGTH); + + pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH + 1]; + + final double rateFactor = pcrErrorModel == PCR_ERROR_MODEL.AGGRESSIVE ? 2.0 : 3.0; + + for( int iii = 0; iii <= MAX_REPEAT_LENGTH; iii++ ) + pcrIndelErrorModelCache[iii] = getErrorModelAdjustedQual(iii, rateFactor); + } + + protected static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) { + return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound( INITIAL_QSCORE - Math.exp(((double) repeatLength) / (rateFactor * Math.PI)) + 1.0 )); + } + + protected void applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) { + if ( pcrErrorModel == PCR_ERROR_MODEL.NONE ) + return; + + for ( int iii = 1; iii < readBases.length; iii++ ) { + final int repeatLength = repeatCovariate.findTandemRepeatUnits(readBases, iii-1).getSecond(); + readInsQuals[iii-1] = (byte) Math.min(0xff & readInsQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]); + readDelQuals[iii-1] = (byte) Math.min(0xff & readDelQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]); + } + } + } diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java index 9672bc5f3..943e87461 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java @@ -74,6 +74,11 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH; } + public void initialize(final int MAX_STR_UNIT_LENGTH, final int MAX_REPEAT_LENGTH) { + this.MAX_STR_UNIT_LENGTH = MAX_STR_UNIT_LENGTH; + this.MAX_REPEAT_LENGTH = MAX_REPEAT_LENGTH; + } + @Override public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { // store the original bases and then write Ns over low quality ones @@ -103,7 +108,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { } - private Pair findTandemRepeatUnits(byte[] readBases, int offset) { + public Pair findTandemRepeatUnits(byte[] readBases, int offset) { int maxBW = 0; byte[] bestBWRepeatUnit = new byte[]{readBases[offset]}; for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 38ed73c40..cdfacaaf3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -57,7 +57,7 @@ import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCal public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { private void HCTestComplexVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); } @@ -68,7 +68,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa } private void HCTestSymbolicVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); } @@ -80,7 +80,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa } private void HCTestComplexGGA(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 4ef32fef1..e73c04d2c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -79,7 +79,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { */ @Test(dataProvider = "MyDataProvider") public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) { - final String commandLine = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s %s -ERC %s --no_cmdline_in_header", + final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header", b37KGReference, bam, intervals, mode); final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); @@ -88,7 +88,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { @Test public void testERCRegionWithNoCalledHaplotypes() { - final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF", + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF", b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001"); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); spec.disableShadowBCF(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index e3f7bd706..a2e5142e5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -76,7 +76,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; private void HCTest(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCaller: args=" + args, spec); } @@ -108,7 +108,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } private void HCTestIndelQualityScores(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); } @@ -123,7 +123,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) { if( containsDuplicateRecord(vcf, parser) ) { @@ -161,14 +161,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { // any of the calls in that region because it is so messy. @Test public void HCTestProblematicReadsModifiedInActiveRegions() { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320")); executeTest("HCTestStructuralIndels: ", spec); } @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestDanglingTailMergingForDeletions() throws IOException { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800"; final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0); @@ -210,7 +210,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -218,7 +218,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, + "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, Arrays.asList("6a9222905c740b9208bf3c67478514eb")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } @@ -232,7 +232,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, + "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, Arrays.asList("f3e636d64042e766cc6515987e85a968")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -240,9 +240,31 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestDBSNPAnnotationWEx() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, Arrays.asList("1352cbe1404aefc94eb8e044539a9882")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // test PCR indel model + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestAggressivePcrIndelModelWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, + Arrays.asList("ab49f80783e5db5f9ab6b13ba2ad00cb")); + executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); + } + + @Test + public void HCTestConservativePcrIndelModelWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, + Arrays.asList("16f7ffa063511c70bad795639a1c2638")); + executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index 8937e8868..3402b73f0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -70,7 +70,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { @Test(dataProvider = "NCTDataProvider") public void testHCNCT(final int nct, final String md5) { WalkerTestSpec spec = new WalkerTestSpec( - "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + "-T HaplotypeCaller --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " + " -L 20:10,000,000-10,100,000 -G none -A -contamination 0.0 -nct " + nct, 1, Arrays.asList(md5)); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java index 48c9d3c1a..38b00ab07 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java @@ -54,9 +54,18 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + /** * Unit tests for LikelihoodCalculationEngine */ @@ -93,6 +102,45 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest { Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2)); } + @DataProvider(name = "PcrErrorModelTestProvider") + public Object[][] createPcrErrorModelTestData() { + List tests = new ArrayList(); + + for ( final String repeat : Arrays.asList("A", "AC", "ACG", "ACGT") ) { + for ( final int repeatLength : Arrays.asList(1, 2, 3, 5, 10, 15) ) { + tests.add(new Object[]{repeat, repeatLength}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PcrErrorModelTestProvider", enabled = true) + public void createPcrErrorModelTest(final String repeat, final int repeatLength) { + + final LikelihoodCalculationEngine engine = new LikelihoodCalculationEngine((byte)0, false, PairHMM.HMM_IMPLEMENTATION.ORIGINAL, 0.0, true, LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE); + + final String readString = Utils.dupString(repeat, repeatLength); + final byte[] insQuals = new byte[readString.length()]; + final byte[] delQuals = new byte[readString.length()]; + Arrays.fill(insQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE); + Arrays.fill(delQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE); + + engine.applyPCRErrorModel(readString.getBytes(), insQuals, delQuals); + + final RepeatCovariate repeatCovariate = new RepeatLengthCovariate(); + repeatCovariate.initialize(LikelihoodCalculationEngine.MAX_STR_UNIT_LENGTH, LikelihoodCalculationEngine.MAX_REPEAT_LENGTH); + + for ( int i = 1; i < insQuals.length; i++ ) { + + final int repeatLengthFromCovariate = repeatCovariate.findTandemRepeatUnits(readString.getBytes(), i-1).getSecond(); + final byte adjustedScore = LikelihoodCalculationEngine.getErrorModelAdjustedQual(repeatLengthFromCovariate, 3.0); + + Assert.assertEquals(insQuals[i-1], adjustedScore); + Assert.assertEquals(delQuals[i-1], adjustedScore); + } + } + // BUGBUG: LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods has changed! Need to make new unit tests! /* private class BasicLikelihoodTestProvider extends TestDataProvider {