Fixed issue causing > 0 (positive) log likelihoods when using the GraphBased likelihood engine, reported by Mauricio
Added some integration tests to check on the fix
This commit is contained in:
parent
ab33db625f
commit
5db520c6fa
|
|
@ -52,9 +52,9 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path;
|
|||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.CountSet;
|
||||
import org.broadinstitute.sting.utils.collections.CountSet;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
|
|
@ -117,6 +117,11 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
|
|||
*/
|
||||
private final FlexibleHMM hmm;
|
||||
|
||||
/**
|
||||
* Holds the log10 probability of passing from an indel to a match.
|
||||
*/
|
||||
private final double indelToMatchTransitionLog10Probability;
|
||||
|
||||
/**
|
||||
* Maximum likelihood difference between the reference haplotype and the best alternative haplotype.
|
||||
*
|
||||
|
|
@ -146,6 +151,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
|
|||
throw new IllegalArgumentException("the global reading mismapping rate cannot be positive or zero");
|
||||
|
||||
this.hmm = hmm;
|
||||
this.indelToMatchTransitionLog10Probability = QualityUtils.qualToProbLog10(hmm.getGapExtensionPenalty());
|
||||
this.log10globalReadMismappingRate = log10globalReadMismappingRate;
|
||||
|
||||
haplotypes = new ArrayList<>(assemblyResultSet.getHaplotypeList());
|
||||
|
|
@ -302,9 +308,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
|
|||
for (final Haplotype haplotype : supportedHaplotypes)
|
||||
calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(haplotype, alleleVersions, result, maxAlleleLogLk, costsEndingByVertex);
|
||||
|
||||
//TODO Does not seem to be needed in practice:
|
||||
//TODO further testing/evaluation required before removing it completely.
|
||||
//makeLikelihoodAdjustment(alleleVersions, result, maxAlternativeAlleleLogLk.keySet(), maxAlternativeAlleleLogLk);
|
||||
makeLikelihoodAdjustment(alleleVersions, result, maxAlleleLogLk.keySet(), maxAlleleLogLk);
|
||||
applyGlobalReadMismappingRate(alleleVersions, result, maxAlleleLogLk);
|
||||
return result;
|
||||
}
|
||||
|
|
@ -350,12 +354,12 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
|
|||
final List<ReadCost> readCosts = new ArrayList<>(readCostByRead.values());
|
||||
Collections.sort(readCosts, ReadCost.COMPARATOR);
|
||||
for (final ReadCost rc : readCosts)
|
||||
result.add(rc.read, alleleVersions.get(haplotype), rc.cost);
|
||||
result.add(rc.read, alleleVersions.get(haplotype), rc.getCost());
|
||||
|
||||
for (final ReadCost rc : readCosts) {
|
||||
final Double currentMax = maxAlleleLogLk.get(rc.read);
|
||||
if (currentMax == null || currentMax < rc.cost)
|
||||
maxAlleleLogLk.put(rc.read, rc.cost);
|
||||
if (currentMax == null || currentMax < rc.getCost())
|
||||
maxAlleleLogLk.put(rc.read, rc.getCost());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -379,8 +383,8 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
|
|||
continue;
|
||||
ReadCost rc = readCosts.get(pc.read);
|
||||
if (rc == null)
|
||||
readCosts.put(pc.read, rc = new ReadCost(pc.read));
|
||||
rc.cost += pc.cost;
|
||||
readCosts.put(pc.read, rc = new ReadCost(pc.read,indelToMatchTransitionLog10Probability));
|
||||
rc.addCost(pc.getCost());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -641,7 +645,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
|
|||
// Calculate the costs.
|
||||
for (final ReadSegmentCost readSegmentCost : readSegmentCosts) {
|
||||
hmm.loadHaplotypeBases(readSegmentCost.bases);
|
||||
readSegmentCost.cost = hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false);
|
||||
readSegmentCost.setCost(hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false));
|
||||
if (prependVertex != null)
|
||||
readSegmentCost.path = new Route<>(prependVertex,readSegmentCost.path);
|
||||
if (appendVertex != null)
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ import java.util.*;
|
|||
public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculationEngine {
|
||||
private final static Logger logger = Logger.getLogger(PairHMMLikelihoodCalculationEngine.class);
|
||||
|
||||
private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
|
||||
public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
|
||||
|
||||
private final byte constantGCP;
|
||||
private final double log10globalReadMismappingRate;
|
||||
|
|
|
|||
|
|
@ -58,10 +58,23 @@ public class ReadCost {
|
|||
/**
|
||||
* Holds the cost value. Public for convenience, please use with care.
|
||||
*/
|
||||
public double cost;
|
||||
private double cost;
|
||||
|
||||
public ReadCost(final GATKSAMRecord r) {
|
||||
/**
|
||||
* Create a new read cost object provided the read and the gap extension penalty.
|
||||
*
|
||||
* @param r the read.
|
||||
* @param initialCost the initial cost for the read before any read-segment alignment.
|
||||
*
|
||||
* @throws NullPointerException if {@code r} is {@code null}.
|
||||
* @throws IllegalArgumentException if {@code initialCost} is not a valid likelihood.
|
||||
*/
|
||||
public ReadCost(final GATKSAMRecord r, final double initialCost) {
|
||||
if (r == null) throw new NullPointerException();
|
||||
if (Double.isNaN(initialCost) || Double.isInfinite(initialCost) || initialCost > 0)
|
||||
throw new IllegalArgumentException("initial cost must be a finite 0 or negative value (" + initialCost + ")");
|
||||
read = r;
|
||||
cost = initialCost;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -77,4 +90,23 @@ public class ReadCost {
|
|||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Add to the cost.
|
||||
* @param value value to add.
|
||||
*/
|
||||
public void addCost(final double value) {
|
||||
if (cost + value > 0)
|
||||
throw new IllegalArgumentException("value brings cost over 0. Current cost " + cost + " value " + value);
|
||||
cost += value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return cost.
|
||||
* @return 0 or less.
|
||||
*/
|
||||
public double getCost() {
|
||||
return cost;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,7 +69,7 @@ class ReadSegmentCost {
|
|||
/**
|
||||
* Holds the cost value. It is public and non-final for convenience.
|
||||
*/
|
||||
protected double cost;
|
||||
private double cost;
|
||||
|
||||
/**
|
||||
* Caches the path bases (the haplotype segment bases).
|
||||
|
|
@ -87,7 +87,15 @@ class ReadSegmentCost {
|
|||
final Route<MultiDeBruijnVertex, MultiSampleEdge> path, double cost) {
|
||||
this.read = read;
|
||||
this.path = path;
|
||||
this.cost = cost;
|
||||
setCost(cost);
|
||||
}
|
||||
|
||||
public double getCost() {
|
||||
return cost;
|
||||
}
|
||||
|
||||
public void setCost(final double value) {
|
||||
cost = value;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
|
|
@ -148,6 +149,11 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
|
|||
initialize(readCapacity,haplotypeCapacity);
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte getGapExtensionPenalty() {
|
||||
return constantGCP;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public double subComputeReadLikelihoodGivenHaplotypeLog10(final byte[] haplotypeBases,
|
||||
|
|
@ -384,9 +390,10 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
|
|||
for (int kkk = 0; kkk < readQuals.length; kkk++) {
|
||||
readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk],
|
||||
mq); // cap base quality by mapping
|
||||
// TODO -- why is Q18 hard-coded here???
|
||||
readQuals[kkk] = (readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE
|
||||
: readQuals[kkk]);
|
||||
readQuals[kkk] = (byte) (readQuals[kkk] < PairHMMLikelihoodCalculationEngine.BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE
|
||||
: Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readQuals[kkk]));
|
||||
readInsQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readInsQuals[kkk]);
|
||||
readDelQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readDelQuals[kkk]);
|
||||
}
|
||||
this.readBases = readBases;
|
||||
this.readQuals = readQuals;
|
||||
|
|
@ -691,7 +698,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
|
|||
private double calculateLocalLikelihoodGeneral(final Problem p) {
|
||||
|
||||
initializeMatrixValues(p,null);
|
||||
int fromCol = p.hapStart + 1;
|
||||
// int fromCol = p.hapStart + 1;
|
||||
// if (previousProblem == null) {
|
||||
// fromCol = p.hapStart + 1;
|
||||
// } else {
|
||||
|
|
@ -704,7 +711,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
|
|||
// previousProblem = p;
|
||||
|
||||
updateTable(p.readStart + 1, p.readEnd + 1,
|
||||
fromCol, p.hapEnd + 1);
|
||||
p.hapStart + 1, p.hapEnd + 1);
|
||||
|
||||
if (p.trailing) {
|
||||
return finalLikelihoodCalculation(p.readEnd,p.hapStart,p.hapEnd + 1)
|
||||
|
|
|
|||
|
|
@ -97,4 +97,9 @@ public interface FlexibleHMM {
|
|||
public void loadRead(byte[] bases, byte[] bq, byte[] iq, byte[] dq, int mq);
|
||||
|
||||
|
||||
/**
|
||||
* Returns the constant gap extension penalty in Phred scale
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
byte getGapExtensionPenalty();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -166,6 +166,11 @@ public class LoglessPairHMM extends N2MemoryPairHMM {
|
|||
transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]);
|
||||
transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
//TODO it seems that it is not always the case that matchToMatch + matchToDeletion + matchToInsertion == 1.
|
||||
//TODO We have detected cases of 1.00002 which can cause problems downstream. These are typically masked
|
||||
//TODO by the fact that we always add an indelToMatch penalty to all PairHMM likelihoods (~ -0.1)
|
||||
//TODO This is in fact not well justified and although it does not have any effect (since it is equally added to all
|
||||
//TODO haplotypes likelihoods) perhaps we should just remove it eventually and fix this != 1.0 issue here.
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -72,6 +72,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
|
||||
final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam";
|
||||
final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam";
|
||||
final static String NA12878_PCRFREE250_ADAPTER_TRIMMED = privateTestDir + "PCRFree.2x250.b37_decoy.NA12878.adapter_trimmed-10000000-11000000.bam";
|
||||
final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam";
|
||||
final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
|
||||
|
||||
|
|
@ -256,6 +257,32 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestDBSNPAnnotationWGSGraphBased() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
|
||||
Arrays.asList("a6c4d5d2eece2bd2c51a81e34e80040f"));
|
||||
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestDBSNPAnnotationWExGraphBased() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
|
||||
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
|
||||
Arrays.asList("69db1045b5445a4f90843f368bd62814"));
|
||||
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestGraphBasedPCRFreePositiveLogLkFix() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + hg19Reference + " --no_cmdline_in_header -I " + NA12878_PCRFREE250_ADAPTER_TRIMMED + " -o %s -L 20:10,000,000-11,000,000 "
|
||||
, 1,
|
||||
Arrays.asList(""));
|
||||
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// test PCR indel model
|
||||
|
|
|
|||
Loading…
Reference in New Issue