diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 3d7ea438e..0b95ed07e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -472,6 +472,14 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false) protected int PADDING_AROUND_SNPS_FOR_CALLING = 20; + /** + * Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base + * insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator). + */ + @Advanced + @Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false) + public LikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE; + // ----------------------------------------------------------------------------------------------- // done with Haplotype caller parameters // ----------------------------------------------------------------------------------------------- @@ -624,7 +632,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } // create our likelihood calculation engine - likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga ); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 8bbe23afc..e6e76ba90 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -63,6 +63,8 @@ import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM; import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM; import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; @@ -76,6 +78,8 @@ import java.util.*; public class LikelihoodCalculationEngine { private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class); + private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual + private final byte constantGCP; private final double log10globalReadMismappingRate; private final boolean DEBUG; @@ -104,6 +108,17 @@ public class LikelihoodCalculationEngine { private final static String LIKELIHOODS_FILENAME = "likelihoods.txt"; private final PrintStream likelihoodsStream; + public enum PCR_ERROR_MODEL { + /** no specialized PCR error model will be applied; if base insertion/deletion qualities are present they will be used */ + NONE, + /** a more aggressive model will be applied that sacrifices true positives in order to remove more false positives */ + AGGRESSIVE, + /** a less aggressive model will be applied that tries to maintain a high true positive rate at the expense of allowing more false positives */ + CONSERVATIVE + } + + private final PCR_ERROR_MODEL pcrErrorModel; + /** * The expected rate of random sequencing errors for a read originating from its true haplotype. * @@ -127,12 +142,15 @@ public class LikelihoodCalculationEngine { * assigned a likelihood of -13. * @param noFpga disable FPGA acceleration */ - public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga ) { + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) { this.hmmType = hmmType; this.constantGCP = constantGCP; this.DEBUG = debug; this.log10globalReadMismappingRate = log10globalReadMismappingRate; this.noFpga = noFpga; + this.pcrErrorModel = pcrErrorModel; + + initializePCRErrorModel(); if ( WRITE_LIKELIHOODS_TO_FILE ) { try { @@ -145,20 +163,10 @@ public class LikelihoodCalculationEngine { } } - public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate ) { - this(constantGCP, debug, hmmType, log10globalReadMismappingRate, false); - } - - public LikelihoodCalculationEngine() { - this((byte)10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, false); - } - public void close() { if ( likelihoodsStream != null ) likelihoodsStream.close(); } - - /** * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate * @@ -196,11 +204,7 @@ public class LikelihoodCalculationEngine { // evaluate the likelihood of the reads given those haplotypes final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); - final List removedReads = map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); -// logger.info("Removed " + removedReads.size() + " reads because of bad likelihoods from sample " + sampleEntry.getKey()); -// for ( final GATKSAMRecord read : removedReads ) -// logger.info("\tRemoved " + read.getReadName()); - + map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); stratifiedReadMap.put(sampleEntry.getKey(), map); } @@ -222,22 +226,27 @@ public class LikelihoodCalculationEngine { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); for( final GATKSAMRecord read : reads ) { + + final byte[] readBases = read.getReadBases(); final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? + // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read final byte[] readQuals = read.getBaseQualities().clone(); - final byte[] readInsQuals = read.getBaseInsertionQualities(); - final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities().clone(); + final byte[] readDelQuals = read.getBaseDeletionQualities().clone(); + + applyPCRErrorModel(readBases, readInsQuals, readDelQuals); + for( int kkk = 0; kkk < readQuals.length; kkk++ ) { readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG - //readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated - //readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated - // TODO -- why is Q18 hard-coded here??? - readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); + readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); + readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); + readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); } if ( batchPairHMM != null ) { - batchPairHMM.batchAdd(haplotypes, read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP); + batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); batchedReads.add(read); continue; } @@ -251,12 +260,12 @@ public class LikelihoodCalculationEngine { final Haplotype haplotype = haplotypes.get(jjj); final boolean isFirstHaplotype = jjj == 0; final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), - read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype); + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype); if ( WRITE_LIKELIHOODS_TO_FILE ) { likelihoodsStream.printf("%s %s %s %s %s %s %f%n", haplotype.getBaseString(), - new String(read.getReadBases()), + new String(readBases), SAMUtils.phredToFastq(readQuals), SAMUtils.phredToFastq(readInsQuals), SAMUtils.phredToFastq(readDelQuals), @@ -524,4 +533,48 @@ public class LikelihoodCalculationEngine { } throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); } + + // -------------------------------------------------------------------------------- + // + // Experimental attempts at PCR error rate modeling + // + // -------------------------------------------------------------------------------- + + protected static final int MAX_STR_UNIT_LENGTH = 8; + protected static final int MAX_REPEAT_LENGTH = 20; + protected static final int MIN_ADJUSTED_QSCORE = 10; + protected static final double INITIAL_QSCORE = 40.0; + + private byte[] pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1]; + private final RepeatCovariate repeatCovariate = new RepeatLengthCovariate(); + + private void initializePCRErrorModel() { + if ( pcrErrorModel == PCR_ERROR_MODEL.NONE ) + return; + + repeatCovariate.initialize(MAX_STR_UNIT_LENGTH, MAX_REPEAT_LENGTH); + + pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH + 1]; + + final double rateFactor = pcrErrorModel == PCR_ERROR_MODEL.AGGRESSIVE ? 2.0 : 3.0; + + for( int iii = 0; iii <= MAX_REPEAT_LENGTH; iii++ ) + pcrIndelErrorModelCache[iii] = getErrorModelAdjustedQual(iii, rateFactor); + } + + protected static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) { + return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound( INITIAL_QSCORE - Math.exp(((double) repeatLength) / (rateFactor * Math.PI)) + 1.0 )); + } + + protected void applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) { + if ( pcrErrorModel == PCR_ERROR_MODEL.NONE ) + return; + + for ( int iii = 1; iii < readBases.length; iii++ ) { + final int repeatLength = repeatCovariate.findTandemRepeatUnits(readBases, iii-1).getSecond(); + readInsQuals[iii-1] = (byte) Math.min(0xff & readInsQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]); + readDelQuals[iii-1] = (byte) Math.min(0xff & readDelQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]); + } + } + } diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java index 9672bc5f3..943e87461 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java @@ -74,6 +74,11 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH; } + public void initialize(final int MAX_STR_UNIT_LENGTH, final int MAX_REPEAT_LENGTH) { + this.MAX_STR_UNIT_LENGTH = MAX_STR_UNIT_LENGTH; + this.MAX_REPEAT_LENGTH = MAX_REPEAT_LENGTH; + } + @Override public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { // store the original bases and then write Ns over low quality ones @@ -103,7 +108,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { } - private Pair findTandemRepeatUnits(byte[] readBases, int offset) { + public Pair findTandemRepeatUnits(byte[] readBases, int offset) { int maxBW = 0; byte[] bestBWRepeatUnit = new byte[]{readBases[offset]}; for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 38ed73c40..cdfacaaf3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -57,7 +57,7 @@ import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCal public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { private void HCTestComplexVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); } @@ -68,7 +68,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa } private void HCTestSymbolicVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); } @@ -80,7 +80,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa } private void HCTestComplexGGA(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 4ef32fef1..e73c04d2c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -79,7 +79,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { */ @Test(dataProvider = "MyDataProvider") public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) { - final String commandLine = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s %s -ERC %s --no_cmdline_in_header", + final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header", b37KGReference, bam, intervals, mode); final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); @@ -88,7 +88,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { @Test public void testERCRegionWithNoCalledHaplotypes() { - final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF", + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF", b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001"); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); spec.disableShadowBCF(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index e3f7bd706..a2e5142e5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -76,7 +76,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; private void HCTest(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCaller: args=" + args, spec); } @@ -108,7 +108,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } private void HCTestIndelQualityScores(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); } @@ -123,7 +123,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) { if( containsDuplicateRecord(vcf, parser) ) { @@ -161,14 +161,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { // any of the calls in that region because it is so messy. @Test public void HCTestProblematicReadsModifiedInActiveRegions() { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320")); executeTest("HCTestStructuralIndels: ", spec); } @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestDanglingTailMergingForDeletions() throws IOException { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800"; final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0); @@ -210,7 +210,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -218,7 +218,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, + "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, Arrays.asList("6a9222905c740b9208bf3c67478514eb")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } @@ -232,7 +232,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, + "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, Arrays.asList("f3e636d64042e766cc6515987e85a968")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -240,9 +240,31 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestDBSNPAnnotationWEx() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, Arrays.asList("1352cbe1404aefc94eb8e044539a9882")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // test PCR indel model + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestAggressivePcrIndelModelWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, + Arrays.asList("ab49f80783e5db5f9ab6b13ba2ad00cb")); + executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); + } + + @Test + public void HCTestConservativePcrIndelModelWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, + Arrays.asList("16f7ffa063511c70bad795639a1c2638")); + executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index 8937e8868..3402b73f0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -70,7 +70,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { @Test(dataProvider = "NCTDataProvider") public void testHCNCT(final int nct, final String md5) { WalkerTestSpec spec = new WalkerTestSpec( - "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + "-T HaplotypeCaller --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " + " -L 20:10,000,000-10,100,000 -G none -A -contamination 0.0 -nct " + nct, 1, Arrays.asList(md5)); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java index 48c9d3c1a..38b00ab07 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java @@ -54,9 +54,18 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + /** * Unit tests for LikelihoodCalculationEngine */ @@ -93,6 +102,45 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest { Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2)); } + @DataProvider(name = "PcrErrorModelTestProvider") + public Object[][] createPcrErrorModelTestData() { + List tests = new ArrayList(); + + for ( final String repeat : Arrays.asList("A", "AC", "ACG", "ACGT") ) { + for ( final int repeatLength : Arrays.asList(1, 2, 3, 5, 10, 15) ) { + tests.add(new Object[]{repeat, repeatLength}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PcrErrorModelTestProvider", enabled = true) + public void createPcrErrorModelTest(final String repeat, final int repeatLength) { + + final LikelihoodCalculationEngine engine = new LikelihoodCalculationEngine((byte)0, false, PairHMM.HMM_IMPLEMENTATION.ORIGINAL, 0.0, true, LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE); + + final String readString = Utils.dupString(repeat, repeatLength); + final byte[] insQuals = new byte[readString.length()]; + final byte[] delQuals = new byte[readString.length()]; + Arrays.fill(insQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE); + Arrays.fill(delQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE); + + engine.applyPCRErrorModel(readString.getBytes(), insQuals, delQuals); + + final RepeatCovariate repeatCovariate = new RepeatLengthCovariate(); + repeatCovariate.initialize(LikelihoodCalculationEngine.MAX_STR_UNIT_LENGTH, LikelihoodCalculationEngine.MAX_REPEAT_LENGTH); + + for ( int i = 1; i < insQuals.length; i++ ) { + + final int repeatLengthFromCovariate = repeatCovariate.findTandemRepeatUnits(readString.getBytes(), i-1).getSecond(); + final byte adjustedScore = LikelihoodCalculationEngine.getErrorModelAdjustedQual(repeatLengthFromCovariate, 3.0); + + Assert.assertEquals(insQuals[i-1], adjustedScore); + Assert.assertEquals(delQuals[i-1], adjustedScore); + } + } + // BUGBUG: LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods has changed! Need to make new unit tests! /* private class BasicLikelihoodTestProvider extends TestDataProvider {