From 5db520c6fa05c86ca0f9a48bba355cf41e782d98 Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Mon, 9 Dec 2013 08:18:56 -0500 Subject: [PATCH] Fixed issue > 0 log likelihoods using GraphBased likelihood engine reported by Mauricio Added some integration test to check on the fix --- ...edLikelihoodCalculationEngineInstance.java | 24 +++++++------ .../PairHMMLikelihoodCalculationEngine.java | 2 +- .../walkers/haplotypecaller/ReadCost.java | 36 +++++++++++++++++-- .../haplotypecaller/ReadSegmentCost.java | 12 +++++-- .../utils/pairhmm/FastLoglessPairHMM.java | 17 ++++++--- .../sting/utils/pairhmm/FlexibleHMM.java | 5 +++ .../sting/utils/pairhmm/LoglessPairHMM.java | 5 +++ .../HaplotypeCallerIntegrationTest.java | 27 ++++++++++++++ 8 files changed, 108 insertions(+), 20 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java index 9d861a445..66ea7be03 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java @@ -52,9 +52,9 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.CountSet; -import org.broadinstitute.sting.utils.collections.CountSet; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; @@ -117,6 +117,11 @@ public class GraphBasedLikelihoodCalculationEngineInstance { */ private final FlexibleHMM hmm; + /** + * Holds the log10 probability of passing from a indel to a match. + */ + private final double indelToMatchTransitionLog10Probability; + /** * Maximum likelihood difference between the reference haplotype and the best alternative haplotype. * @@ -146,6 +151,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance { throw new IllegalArgumentException("the global reading mismapping rate cannot be positive or zero"); this.hmm = hmm; + this.indelToMatchTransitionLog10Probability = QualityUtils.qualToProbLog10(hmm.getGapExtensionPenalty()); this.log10globalReadMismappingRate = log10globalReadMismappingRate; haplotypes = new ArrayList<>(assemblyResultSet.getHaplotypeList()); @@ -302,9 +308,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance { for (final Haplotype haplotype : supportedHaplotypes) calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(haplotype, alleleVersions, result, maxAlleleLogLk, costsEndingByVertex); - //TODO Does not seem to be needed in practice: - //TODO furhter testing/evaluation required before removing it completely. - //makeLikelihoodAdjustment(alleleVersions, result, maxAlternativeAlleleLogLk.keySet(), maxAlternativeAlleleLogLk); + makeLikelihoodAdjustment(alleleVersions, result, maxAlleleLogLk.keySet(), maxAlleleLogLk); applyGlobalReadMismappingRate(alleleVersions, result, maxAlleleLogLk); return result; } @@ -350,12 +354,12 @@ public class GraphBasedLikelihoodCalculationEngineInstance { final List readCosts = new ArrayList<>(readCostByRead.values()); Collections.sort(readCosts, ReadCost.COMPARATOR); for (final ReadCost rc : readCosts) - result.add(rc.read, alleleVersions.get(haplotype), rc.cost); + result.add(rc.read, alleleVersions.get(haplotype), rc.getCost()); for (final ReadCost rc : readCosts) { final Double currentMax = maxAlleleLogLk.get(rc.read); - if (currentMax == null || currentMax < rc.cost) - maxAlleleLogLk.put(rc.read, rc.cost); + if (currentMax == null || currentMax < rc.getCost()) + maxAlleleLogLk.put(rc.read, rc.getCost()); } } @@ -379,8 +383,8 @@ public class GraphBasedLikelihoodCalculationEngineInstance { continue; ReadCost rc = readCosts.get(pc.read); if (rc == null) - readCosts.put(pc.read, rc = new ReadCost(pc.read)); - rc.cost += pc.cost; + readCosts.put(pc.read, rc = new ReadCost(pc.read,indelToMatchTransitionLog10Probability)); + rc.addCost(pc.getCost()); } } } @@ -641,7 +645,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance { // Calculate the costs. for (final ReadSegmentCost readSegmentCost : readSegmentCosts) { hmm.loadHaplotypeBases(readSegmentCost.bases); - readSegmentCost.cost = hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false); + readSegmentCost.setCost(hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false)); if (prependVertex != null) readSegmentCost.path = new Route<>(prependVertex,readSegmentCost.path); if (appendVertex != null) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java index 6429e7820..b1db23d74 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -71,7 +71,7 @@ import java.util.*; public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculationEngine { private final static Logger logger = Logger.getLogger(PairHMMLikelihoodCalculationEngine.class); - private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual + public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual private final byte constantGCP; private final double log10globalReadMismappingRate; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java index a344eea61..d5f62a6a3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java @@ -58,10 +58,23 @@ public class ReadCost { /** * Holds the cost value. Public for convenience, please use with care. */ - public double cost; + private double cost; - public ReadCost(final GATKSAMRecord r) { + /** + * Create a new read cost object provided the read and the gap extension penalty. + * + * @param r the read. + * @param initialCost the initial cost for the read before any read-segment alignment. + * + * @throws NullPointerException if {@code r} is {@code null}. + * @throws IllegalArgumentException if {@code initialCost} is not a valid likelihood. + */ + public ReadCost(final GATKSAMRecord r, final double initialCost) { + if (r == null) throw new NullPointerException(); + if (Double.isNaN(initialCost) || Double.isInfinite(initialCost) || initialCost > 0) + throw new IllegalArgumentException("initial cost must be a finite 0 or negative value (" + initialCost + ")"); read = r; + cost = initialCost; } @@ -77,4 +90,23 @@ public class ReadCost { } }; + + /** + * Add to the cost. + * @param value value to add. + */ + public void addCost(final double value) { + if (cost + value > 0) + throw new IllegalArgumentException("value brings cost over 0. Current cost " + cost + " value " + value); + cost += value; + } + + /** + * Return cost. + * @return 0 or less. + */ + public double getCost() { + return cost; + } + } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java index 9020e3426..b5544f1a2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java @@ -69,7 +69,7 @@ class ReadSegmentCost { /** * Holds the cost value. It public and non-final for convenience. */ - protected double cost; + private double cost; /** * Caches the path bases (the haplotype segment bases). @@ -87,7 +87,15 @@ class ReadSegmentCost { final Route path, double cost) { this.read = read; this.path = path; - this.cost = cost; + setCost(cost); + } + + public double getCost() { + return cost; + } + + public void setCost(final double value) { + cost = value; } /** diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java index 2872bea37..fb9dda8b2 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.utils.pairhmm; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -148,6 +149,11 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { initialize(readCapacity,haplotypeCapacity); } + @Override + public byte getGapExtensionPenalty() { + return constantGCP; + } + @Override public double subComputeReadLikelihoodGivenHaplotypeLog10(final byte[] haplotypeBases, @@ -384,9 +390,10 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { for (int kkk = 0; kkk < readQuals.length; kkk++) { readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk], mq); // cap base quality by mapping - // TODO -- why is Q18 hard-coded here??? - readQuals[kkk] = (readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE - : readQuals[kkk]); + readQuals[kkk] = (byte) (readQuals[kkk] < PairHMMLikelihoodCalculationEngine.BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE + : Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readQuals[kkk])); + readInsQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readInsQuals[kkk]); + readDelQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readDelQuals[kkk]); } this.readBases = readBases; this.readQuals = readQuals; @@ -691,7 +698,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { private double calculateLocalLikelihoodGeneral(final Problem p) { initializeMatrixValues(p,null); - int fromCol = p.hapStart + 1; + // int fromCol = p.hapStart + 1; // if (previousProblem == null) { // fromCol = p.hapStart + 1; // } else { @@ -704,7 +711,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { // previousProblem = p; updateTable(p.readStart + 1, p.readEnd + 1, - fromCol, p.hapEnd + 1); + p.hapStart + 1, p.hapEnd + 1); if (p.trailing) { return finalLikelihoodCalculation(p.readEnd,p.hapStart,p.hapEnd + 1) diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java index d3d003731..152274947 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java @@ -97,4 +97,9 @@ public interface FlexibleHMM { public void loadRead(byte[] bases, byte[] bq, byte[] iq, byte[] dq, int mq); + /** + * Returns the constant gap extension penalty in Phred scale + * @return never @code null. + */ + byte getGapExtensionPenalty(); } diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java index 1feae2bfe..0725e24b4 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -166,6 +166,11 @@ public class LoglessPairHMM extends N2MemoryPairHMM { transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]); transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]); transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]); + //TODO it seems that it is not always the case that matchToMatch + matchToDeletion + matchToInsertion == 1. + //TODO We have detected cases of 1.00002 which can cause problems downstream. This are typically masked + //TODO by the fact that we always add a indelToMatch penalty to all PairHMM likelihoods (~ -0.1) + //TODO This is in fact not well justified and although it does not have any effect (since is equally added to all + //TODO haplotypes likelihoods) perhaps we should just remove it eventually and fix this != 1.0 issue here. } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index c27208194..dfbbd7084 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -72,6 +72,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; + final static String NA12878_PCRFREE250_ADAPTER_TRIMMED = privateTestDir + "PCRFree.2x250.b37_decoy.NA12878.adapter_trimmed-10000000-11000000.bam"; final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam"; final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; @@ -256,6 +257,32 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } + @Test + public void HCTestDBSNPAnnotationWGSGraphBased() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, + Arrays.asList("a6c4d5d2eece2bd2c51a81e34e80040f")); + executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); + } + + @Test + public void HCTestDBSNPAnnotationWExGraphBased() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + + " -L " + hg19Intervals + " -isr INTERSECTION", 1, + Arrays.asList("69db1045b5445a4f90843f368bd62814")); + executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); + } + + @Test + public void HCTestGraphBasedPCRFreePositiveLogLkFix() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + hg19Reference + " --no_cmdline_in_header -I " + NA12878_PCRFREE250_ADAPTER_TRIMMED + " -o %s -L 20:10,000,000-11,000,000 " + , 1, + Arrays.asList("")); + executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); + } + // -------------------------------------------------------------------------------------------------------------- // // test PCR indel model