Fixed issue of > 0 log likelihoods when using the GraphBased likelihood engine, reported by Mauricio

Added some integration tests to check the fix
This commit is contained in:
Valentin Ruano-Rubio 2013-12-09 08:18:56 -05:00
parent ab33db625f
commit 5db520c6fa
8 changed files with 108 additions and 20 deletions

View File

@ -52,9 +52,9 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.CountSet;
import org.broadinstitute.sting.utils.collections.CountSet;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
@ -117,6 +117,11 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
*/
private final FlexibleHMM hmm;
/**
* Holds the log10 probability of transitioning from an indel state to a match state.
*/
private final double indelToMatchTransitionLog10Probability;
/**
* Maximum likelihood difference between the reference haplotype and the best alternative haplotype.
*
@ -146,6 +151,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
throw new IllegalArgumentException("the global reading mismapping rate cannot be positive or zero");
this.hmm = hmm;
this.indelToMatchTransitionLog10Probability = QualityUtils.qualToProbLog10(hmm.getGapExtensionPenalty());
this.log10globalReadMismappingRate = log10globalReadMismappingRate;
haplotypes = new ArrayList<>(assemblyResultSet.getHaplotypeList());
@ -302,9 +308,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
for (final Haplotype haplotype : supportedHaplotypes)
calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(haplotype, alleleVersions, result, maxAlleleLogLk, costsEndingByVertex);
//TODO Does not seem to be needed in practice:
//TODO further testing/evaluation required before removing it completely.
//makeLikelihoodAdjustment(alleleVersions, result, maxAlternativeAlleleLogLk.keySet(), maxAlternativeAlleleLogLk);
makeLikelihoodAdjustment(alleleVersions, result, maxAlleleLogLk.keySet(), maxAlleleLogLk);
applyGlobalReadMismappingRate(alleleVersions, result, maxAlleleLogLk);
return result;
}
@ -350,12 +354,12 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
final List<ReadCost> readCosts = new ArrayList<>(readCostByRead.values());
Collections.sort(readCosts, ReadCost.COMPARATOR);
for (final ReadCost rc : readCosts)
result.add(rc.read, alleleVersions.get(haplotype), rc.cost);
result.add(rc.read, alleleVersions.get(haplotype), rc.getCost());
for (final ReadCost rc : readCosts) {
final Double currentMax = maxAlleleLogLk.get(rc.read);
if (currentMax == null || currentMax < rc.cost)
maxAlleleLogLk.put(rc.read, rc.cost);
if (currentMax == null || currentMax < rc.getCost())
maxAlleleLogLk.put(rc.read, rc.getCost());
}
}
@ -379,8 +383,8 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
continue;
ReadCost rc = readCosts.get(pc.read);
if (rc == null)
readCosts.put(pc.read, rc = new ReadCost(pc.read));
rc.cost += pc.cost;
readCosts.put(pc.read, rc = new ReadCost(pc.read,indelToMatchTransitionLog10Probability));
rc.addCost(pc.getCost());
}
}
}
@ -641,7 +645,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
// Calculate the costs.
for (final ReadSegmentCost readSegmentCost : readSegmentCosts) {
hmm.loadHaplotypeBases(readSegmentCost.bases);
readSegmentCost.cost = hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false);
readSegmentCost.setCost(hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false));
if (prependVertex != null)
readSegmentCost.path = new Route<>(prependVertex,readSegmentCost.path);
if (appendVertex != null)

View File

@ -71,7 +71,7 @@ import java.util.*;
public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculationEngine {
private final static Logger logger = Logger.getLogger(PairHMMLikelihoodCalculationEngine.class);
private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
private final byte constantGCP;
private final double log10globalReadMismappingRate;

View File

@ -58,10 +58,23 @@ public class ReadCost {
/**
* Holds the cost value. Private; read it through {@link #getCost} and modify it through {@link #addCost}.
*/
public double cost;
private double cost;
public ReadCost(final GATKSAMRecord r) {
/**
* Create a new read cost object provided the read and its initial cost.
*
* @param r the read.
* @param initialCost the initial cost for the read before any read-segment alignment.
*
* @throws NullPointerException if {@code r} is {@code null}.
* @throws IllegalArgumentException if {@code initialCost} is {@code NaN}, infinite or greater than 0
*     (a log10 likelihood must be a finite value that is 0 or negative).
*/
public ReadCost(final GATKSAMRecord r, final double initialCost) {
if (r == null) throw new NullPointerException();
if (Double.isNaN(initialCost) || Double.isInfinite(initialCost) || initialCost > 0)
throw new IllegalArgumentException("initial cost must be a finite 0 or negative value (" + initialCost + ")");
read = r;
cost = initialCost;
}
@ -77,4 +90,23 @@ public class ReadCost {
}
};
/**
* Adds an increment to this read's accumulated cost (log10 likelihood).
*
* @param value the value to add; must not be {@code NaN} and must not push the total cost above 0.
*
* @throws IllegalArgumentException if {@code value} is {@code NaN}, or if adding it would make the
*     cost positive (a log10 likelihood can never be greater than 0).
*/
public void addCost(final double value) {
    // NaN would silently pass the > 0 check below (NaN comparisons are false) and corrupt the
    // invariant that the constructor enforces, so reject it explicitly.
    if (Double.isNaN(value))
        throw new IllegalArgumentException("the cost increment cannot be NaN");
    if (cost + value > 0)
        throw new IllegalArgumentException("value brings cost over 0. Current cost " + cost + " value " + value);
    cost += value;
}
/**
* Returns the current accumulated cost (log10 likelihood) for this read.
*
* @return 0 or less.
*/
public double getCost() {
return cost;
}
}

View File

@ -69,7 +69,7 @@ class ReadSegmentCost {
/**
* Holds the cost value. Private; read and write it through {@link #getCost} and {@link #setCost}.
*/
protected double cost;
private double cost;
/**
* Caches the path bases (the haplotype segment bases).
@ -87,7 +87,15 @@ class ReadSegmentCost {
final Route<MultiDeBruijnVertex, MultiSampleEdge> path, double cost) {
this.read = read;
this.path = path;
this.cost = cost;
setCost(cost);
}
/**
* Returns the current cost (log10 likelihood) for this read segment.
*
* @return the value last set through the constructor or {@link #setCost}.
*/
public double getCost() {
return cost;
}
/**
* Updates the cost for this read segment.
*
* @param value the new cost; note that no validation is performed here.
*/
public void setCost(final double value) {
cost = value;
}
/**

View File

@ -46,6 +46,7 @@
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -148,6 +149,11 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
initialize(readCapacity,haplotypeCapacity);
}
/**
* {@inheritDoc}
*
* <p>Returns the constant gap-continuation penalty (GCP) this HMM instance holds.</p>
*/
@Override
public byte getGapExtensionPenalty() {
return constantGCP;
}
@Override
public double subComputeReadLikelihoodGivenHaplotypeLog10(final byte[] haplotypeBases,
@ -384,9 +390,10 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
for (int kkk = 0; kkk < readQuals.length; kkk++) {
readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk],
mq); // cap base quality by mapping
// TODO -- why is Q18 hard-coded here???
readQuals[kkk] = (readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE
: readQuals[kkk]);
readQuals[kkk] = (byte) (readQuals[kkk] < PairHMMLikelihoodCalculationEngine.BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE
: Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readQuals[kkk]));
readInsQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readInsQuals[kkk]);
readDelQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readDelQuals[kkk]);
}
this.readBases = readBases;
this.readQuals = readQuals;
@ -691,7 +698,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
private double calculateLocalLikelihoodGeneral(final Problem p) {
initializeMatrixValues(p,null);
int fromCol = p.hapStart + 1;
// int fromCol = p.hapStart + 1;
// if (previousProblem == null) {
// fromCol = p.hapStart + 1;
// } else {
@ -704,7 +711,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
// previousProblem = p;
updateTable(p.readStart + 1, p.readEnd + 1,
fromCol, p.hapEnd + 1);
p.hapStart + 1, p.hapEnd + 1);
if (p.trailing) {
return finalLikelihoodCalculation(p.readEnd,p.hapStart,p.hapEnd + 1)

View File

@ -97,4 +97,9 @@ public interface FlexibleHMM {
public void loadRead(byte[] bases, byte[] bq, byte[] iq, byte[] dq, int mq);
/**
* Returns the constant gap extension penalty in Phred scale.
*
* @return a Phred-scaled quality; as a primitive {@code byte} it can never be {@code null}.
*/
byte getGapExtensionPenalty();
}

View File

@ -166,6 +166,11 @@ public class LoglessPairHMM extends N2MemoryPairHMM {
transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]);
transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]);
transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]);
//TODO it seems that it is not always the case that matchToMatch + matchToDeletion + matchToInsertion == 1.
//TODO We have detected cases of 1.00002 which can cause problems downstream. These are typically masked
//TODO by the fact that we always add an indelToMatch penalty to all PairHMM likelihoods (~ -0.1).
//TODO This is in fact not well justified and although it does not have any effect (since it is equally added
//TODO to all haplotype likelihoods) perhaps we should just remove it eventually and fix this != 1.0 issue here.
}
}

View File

@ -72,6 +72,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam";
final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam";
final static String NA12878_PCRFREE250_ADAPTER_TRIMMED = privateTestDir + "PCRFree.2x250.b37_decoy.NA12878.adapter_trimmed-10000000-11000000.bam";
final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam";
final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
@ -256,6 +257,32 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@Test
public void HCTestDBSNPAnnotationWGSGraphBased() {
// Exercises the GraphBased likelihood engine with dbSNP ID annotation over a 100kb WGS interval
// on the PCR-free NA12878 input; the MD5 presumably pins the expected VCF output -- TODO confirm
// against the WalkerTest harness conventions.
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("a6c4d5d2eece2bd2c51a81e34e80040f"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@Test
public void HCTestDBSNPAnnotationWExGraphBased() {
// Exercises the GraphBased likelihood engine with dbSNP ID annotation; the two -L arguments plus
// -isr INTERSECTION restrict calling to the exome intervals that overlap 20:10,000,000-11,000,000.
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("69db1045b5445a4f90843f368bd62814"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@Test
public void HCTestGraphBasedPCRFreePositiveLogLkFix() {
// Regression test for the > 0 log likelihoods produced by the GraphBased likelihood engine
// on PCR-free 2x250 adapter-trimmed data.
// NOTE(review): the expected-MD5 list holds an empty string -- looks like a placeholder that
// just verifies the run completes; confirm whether a real MD5 should be pinned here.
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + hg19Reference + " --no_cmdline_in_header -I " + NA12878_PCRFREE250_ADAPTER_TRIMMED + " -o %s -L 20:10,000,000-11,000,000 "
, 1,
Arrays.asList(""));
// Fixed: the description was copy-pasted from the dbSNP/WEx test and misdescribed this test.
executeTest("HC GraphBased calling on PCR-free adapter-trimmed reads (positive log-lk regression)", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// test PCR indel model