Fixed issue of > 0 log likelihoods when using the GraphBased likelihood engine, reported by Mauricio

Added some integration tests to check the fix
This commit is contained in:
Valentin Ruano-Rubio 2013-12-09 08:18:56 -05:00
parent ab33db625f
commit 5db520c6fa
8 changed files with 108 additions and 20 deletions

View File

@ -52,9 +52,9 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.CountSet;
import org.broadinstitute.sting.utils.collections.CountSet;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
@ -117,6 +117,11 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
*/
private final FlexibleHMM hmm;
/**
* Holds the log10 probability of transitioning from an indel state to a match state.
*/
private final double indelToMatchTransitionLog10Probability;
/**
* Maximum likelihood difference between the reference haplotype and the best alternative haplotype.
*
@ -146,6 +151,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
throw new IllegalArgumentException("the global reading mismapping rate cannot be positive or zero");
this.hmm = hmm;
this.indelToMatchTransitionLog10Probability = QualityUtils.qualToProbLog10(hmm.getGapExtensionPenalty());
this.log10globalReadMismappingRate = log10globalReadMismappingRate;
haplotypes = new ArrayList<>(assemblyResultSet.getHaplotypeList());
@ -302,9 +308,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
for (final Haplotype haplotype : supportedHaplotypes)
calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(haplotype, alleleVersions, result, maxAlleleLogLk, costsEndingByVertex);
//TODO Does not seem to be needed in practice:
//TODO further testing/evaluation required before removing it completely.
//makeLikelihoodAdjustment(alleleVersions, result, maxAlternativeAlleleLogLk.keySet(), maxAlternativeAlleleLogLk);
makeLikelihoodAdjustment(alleleVersions, result, maxAlleleLogLk.keySet(), maxAlleleLogLk);
applyGlobalReadMismappingRate(alleleVersions, result, maxAlleleLogLk);
return result;
}
@ -350,12 +354,12 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
final List<ReadCost> readCosts = new ArrayList<>(readCostByRead.values());
Collections.sort(readCosts, ReadCost.COMPARATOR);
for (final ReadCost rc : readCosts)
result.add(rc.read, alleleVersions.get(haplotype), rc.cost);
result.add(rc.read, alleleVersions.get(haplotype), rc.getCost());
for (final ReadCost rc : readCosts) {
final Double currentMax = maxAlleleLogLk.get(rc.read);
if (currentMax == null || currentMax < rc.cost)
maxAlleleLogLk.put(rc.read, rc.cost);
if (currentMax == null || currentMax < rc.getCost())
maxAlleleLogLk.put(rc.read, rc.getCost());
}
}
@ -379,8 +383,8 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
continue;
ReadCost rc = readCosts.get(pc.read);
if (rc == null)
readCosts.put(pc.read, rc = new ReadCost(pc.read));
rc.cost += pc.cost;
readCosts.put(pc.read, rc = new ReadCost(pc.read,indelToMatchTransitionLog10Probability));
rc.addCost(pc.getCost());
}
}
}
@ -641,7 +645,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
// Calculate the costs.
for (final ReadSegmentCost readSegmentCost : readSegmentCosts) {
hmm.loadHaplotypeBases(readSegmentCost.bases);
readSegmentCost.cost = hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false);
readSegmentCost.setCost(hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false));
if (prependVertex != null)
readSegmentCost.path = new Route<>(prependVertex,readSegmentCost.path);
if (appendVertex != null)

View File

@ -71,7 +71,7 @@ import java.util.*;
public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculationEngine {
private final static Logger logger = Logger.getLogger(PairHMMLikelihoodCalculationEngine.class);
private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
private final byte constantGCP;
private final double log10globalReadMismappingRate;

View File

@ -58,10 +58,23 @@ public class ReadCost {
/**
* Holds the cost value. Private; read it through {@link #getCost} and modify it through {@link #addCost}.
*/
public double cost;
private double cost;
public ReadCost(final GATKSAMRecord r) {
/**
* Create a new read cost object provided the read and its initial cost.
*
* @param r the read.
* @param initialCost the initial cost for the read before any read-segment alignment.
*
* @throws NullPointerException if {@code r} is {@code null}.
* @throws IllegalArgumentException if {@code initialCost} is {@code NaN}, infinite or greater than 0
*     (a log10 likelihood must be a finite value that is 0 or negative).
*/
public ReadCost(final GATKSAMRecord r, final double initialCost) {
if (r == null) throw new NullPointerException();
if (Double.isNaN(initialCost) || Double.isInfinite(initialCost) || initialCost > 0)
throw new IllegalArgumentException("initial cost must be a finite 0 or negative value (" + initialCost + ")");
read = r;
cost = initialCost;
}
@ -77,4 +90,23 @@ public class ReadCost {
}
};
/**
* Adds an increment to this read's accumulated cost (log10 likelihood).
*
* @param value the value to add; must not be {@code NaN} and must not push the total cost above 0.
*
* @throws IllegalArgumentException if {@code value} is {@code NaN}, or if adding it would make the
*     cost positive (a log10 likelihood can never be greater than 0).
*/
public void addCost(final double value) {
    // NaN would silently pass the > 0 check below (NaN comparisons are false) and corrupt the
    // invariant that the constructor enforces, so reject it explicitly.
    if (Double.isNaN(value))
        throw new IllegalArgumentException("the cost increment cannot be NaN");
    if (cost + value > 0)
        throw new IllegalArgumentException("value brings cost over 0. Current cost " + cost + " value " + value);
    cost += value;
}
/**
* Returns the current accumulated cost (log10 likelihood) for this read.
*
* @return 0 or less.
*/
public double getCost() {
return cost;
}
}

View File

@ -69,7 +69,7 @@ class ReadSegmentCost {
/**
* Holds the cost value. Private; read and write it through {@link #getCost} and {@link #setCost}.
*/
protected double cost;
private double cost;
/**
* Caches the path bases (the haplotype segment bases).
@ -87,7 +87,15 @@ class ReadSegmentCost {
final Route<MultiDeBruijnVertex, MultiSampleEdge> path, double cost) {
this.read = read;
this.path = path;
this.cost = cost;
setCost(cost);
}
/**
* Returns the current cost (log10 likelihood) for this read segment.
*
* @return the value last set through the constructor or {@link #setCost}.
*/
public double getCost() {
return cost;
}
/**
* Updates the cost for this read segment.
*
* @param value the new cost; note that no validation is performed here.
*/
public void setCost(final double value) {
cost = value;
}
/**

View File

@ -46,6 +46,7 @@
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -148,6 +149,11 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
initialize(readCapacity,haplotypeCapacity);
}
/**
* {@inheritDoc}
*
* <p>Returns the constant gap-continuation penalty (GCP) this HMM instance holds.</p>
*/
@Override
public byte getGapExtensionPenalty() {
return constantGCP;
}
@Override
public double subComputeReadLikelihoodGivenHaplotypeLog10(final byte[] haplotypeBases,
@ -384,9 +390,10 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
for (int kkk = 0; kkk < readQuals.length; kkk++) {
readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk],
mq); // cap base quality by mapping
// TODO -- why is Q18 hard-coded here???
readQuals[kkk] = (readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE
: readQuals[kkk]);
readQuals[kkk] = (byte) (readQuals[kkk] < PairHMMLikelihoodCalculationEngine.BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE
: Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readQuals[kkk]));
readInsQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readInsQuals[kkk]);
readDelQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readDelQuals[kkk]);
}
this.readBases = readBases;
this.readQuals = readQuals;
@ -691,7 +698,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
private double calculateLocalLikelihoodGeneral(final Problem p) {
initializeMatrixValues(p,null);
int fromCol = p.hapStart + 1;
// int fromCol = p.hapStart + 1;
// if (previousProblem == null) {
// fromCol = p.hapStart + 1;
// } else {
@ -704,7 +711,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
// previousProblem = p;
updateTable(p.readStart + 1, p.readEnd + 1,
fromCol, p.hapEnd + 1);
p.hapStart + 1, p.hapEnd + 1);
if (p.trailing) {
return finalLikelihoodCalculation(p.readEnd,p.hapStart,p.hapEnd + 1)

View File

@ -97,4 +97,9 @@ public interface FlexibleHMM {
public void loadRead(byte[] bases, byte[] bq, byte[] iq, byte[] dq, int mq);
/**
* Returns the constant gap extension penalty in Phred scale.
*
* @return a Phred-scaled quality; as a primitive {@code byte} it can never be {@code null}.
*/
byte getGapExtensionPenalty();
}

View File

@ -166,6 +166,11 @@ public class LoglessPairHMM extends N2MemoryPairHMM {
transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]);
transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]);
transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]);
//TODO it seems that it is not always the case that matchToMatch + matchToDeletion + matchToInsertion == 1.
//TODO We have detected cases of 1.00002 which can cause problems downstream. These are typically masked
//TODO by the fact that we always add an indelToMatch penalty to all PairHMM likelihoods (~ -0.1).
//TODO This is in fact not well justified and although it does not have any effect (since it is equally added
//TODO to all haplotype likelihoods) perhaps we should just remove it eventually and fix this != 1.0 issue here.
}
}

View File

@ -72,6 +72,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam";
final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam";
final static String NA12878_PCRFREE250_ADAPTER_TRIMMED = privateTestDir + "PCRFree.2x250.b37_decoy.NA12878.adapter_trimmed-10000000-11000000.bam";
final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam";
final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
@ -256,6 +257,32 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@Test
public void HCTestDBSNPAnnotationWGSGraphBased() {
// Exercises the GraphBased likelihood engine with dbSNP ID annotation over a 100kb WGS interval
// on the PCR-free NA12878 input; the MD5 presumably pins the expected VCF output -- TODO confirm
// against the WalkerTest harness conventions.
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("a6c4d5d2eece2bd2c51a81e34e80040f"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@Test
public void HCTestDBSNPAnnotationWExGraphBased() {
// Exercises the GraphBased likelihood engine with dbSNP ID annotation; the two -L arguments plus
// -isr INTERSECTION restrict calling to the exome intervals that overlap 20:10,000,000-11,000,000.
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("69db1045b5445a4f90843f368bd62814"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@Test
public void HCTestGraphBasedPCRFreePositiveLogLkFix() {
// Regression test for the > 0 log likelihoods produced by the GraphBased likelihood engine
// on PCR-free 2x250 adapter-trimmed data.
// NOTE(review): the expected-MD5 list holds an empty string -- looks like a placeholder that
// just verifies the run completes; confirm whether a real MD5 should be pinned here.
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + hg19Reference + " --no_cmdline_in_header -I " + NA12878_PCRFREE250_ADAPTER_TRIMMED + " -o %s -L 20:10,000,000-11,000,000 "
, 1,
Arrays.asList(""));
// Fixed: the description was copy-pasted from the dbSNP/WEx test and misdescribed this test.
executeTest("HC GraphBased calling on PCR-free adapter-trimmed reads (positive log-lk regression)", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// test PCR indel model