Merge pull request #580 from broadinstitute/vrr_graphbase_infinite_likelihoods_reprise

Fixed bug using GraphBased due to infinite likelihoods resulting from th...
This commit is contained in:
Valentin Ruano Rubio 2014-04-02 00:45:17 -04:00
commit 45c192bb6d
8 changed files with 87 additions and 46 deletions

1
.gitignore vendored
View File

@ -26,3 +26,4 @@ out/
/atlassian-ide-plugin.xml /atlassian-ide-plugin.xml
maven-metadata-local.xml maven-metadata-local.xml
dependency-reduced-pom.xml dependency-reduced-pom.xml
public/external-example/target/

View File

@ -412,28 +412,28 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
for (final GATKSAMRecord read : mayNeedAdjustment) { for (final GATKSAMRecord read : mayNeedAdjustment) {
final Map<Allele, Double> existingLikelihoods = map.get(read); final Map<Allele, Double> existingLikelihoods = map.get(read);
if (existingLikelihoods != null) { if (existingLikelihoods != null) {
Allele bestAllele = null;
double worstRelativeLikelihood = 0; double worstRelativeLikelihood = 0;
double bestRelativeLikelihood = Double.NEGATIVE_INFINITY; double bestRelativeLikelihood = Double.NEGATIVE_INFINITY;
for (final Map.Entry<Allele, Double> entry : map.get(read).entrySet()) { for (final Map.Entry<Allele, Double> entry : map.get(read).entrySet()) {
final double candidateRelativeLikelihood = entry.getValue(); final double candidateRelativeLikelihood = entry.getValue();
if (candidateRelativeLikelihood > bestRelativeLikelihood) { if (candidateRelativeLikelihood > bestRelativeLikelihood) {
bestAllele = entry.getKey();
bestRelativeLikelihood = candidateRelativeLikelihood; bestRelativeLikelihood = candidateRelativeLikelihood;
} }
if (!Double.isInfinite(candidateRelativeLikelihood) && worstRelativeLikelihood > candidateRelativeLikelihood) if (!Double.isInfinite(candidateRelativeLikelihood) && worstRelativeLikelihood > candidateRelativeLikelihood)
worstRelativeLikelihood = candidateRelativeLikelihood; worstRelativeLikelihood = candidateRelativeLikelihood;
} }
if (Double.isInfinite(bestRelativeLikelihood)) {
bestRelativeLikelihood = 0;
worstRelativeLikelihood = 0;
} else {
worstRelativeLikelihood += UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY; worstRelativeLikelihood += UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY;
if (bestAllele == null) }
throw new IllegalStateException("No best allele for read " + read.getReadName());
final double bestLikelihood = 0.0; // the best becomes zero. final double bestLikelihood = 0.0; // the best becomes zero.
maxAlternative.put(read, bestLikelihood); maxAlternative.put(read, bestLikelihood);
for (final Map.Entry<Haplotype, Allele> entry : alleleVersions.entrySet()) { for (final Map.Entry<Haplotype, Allele> entry : alleleVersions.entrySet()) {
final Allele a = entry.getValue(); final Allele a = entry.getValue();
final Double relativeLikelihoodO = existingLikelihoods.get(a); final Double relativeLikelihoodO = existingLikelihoods.get(a);
final double relativeLikelihood = relativeLikelihoodO == null ? worstRelativeLikelihood : relativeLikelihoodO; final double relativeLikelihood = relativeLikelihoodO == null || Double.isInfinite(relativeLikelihoodO) ? worstRelativeLikelihood : relativeLikelihoodO;
final double likelihood = relativeLikelihood - bestRelativeLikelihood + bestLikelihood; final double likelihood = relativeLikelihood - bestRelativeLikelihood + bestLikelihood;
if (likelihood > 0) if (likelihood > 0)
throw new IllegalStateException("Likelihood larger than 1 with read " + read.getReadName()); throw new IllegalStateException("Likelihood larger than 1 with read " + read.getReadName());
@ -628,7 +628,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
// Complete the read segment cost with the corresponding path bases // Complete the read segment cost with the corresponding path bases
for (final Route<MultiDeBruijnVertex, MultiSampleEdge> p : acrossBlockPaths) { for (final Route<MultiDeBruijnVertex, MultiSampleEdge> p : acrossBlockPaths) {
final ReadSegmentCost readSegmentCost = new ReadSegmentCost(read, p, Double.NaN); final ReadSegmentCost readSegmentCost = new ReadSegmentCost(read, p, 0);
pathBases[nextPath++] = readSegmentCost.bases = eventBlockPathBases(p, includePathEnds); pathBases[nextPath++] = readSegmentCost.bases = eventBlockPathBases(p, includePathEnds);
pathSizes.add(readSegmentCost.bases.length); pathSizes.add(readSegmentCost.bases.length);
readSegmentCosts.add(readSegmentCost); readSegmentCosts.add(readSegmentCost);
@ -652,8 +652,6 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
readSegmentCost.path = new Route<>(readSegmentCost.path,appendVertex); readSegmentCost.path = new Route<>(readSegmentCost.path,appendVertex);
addReadSegmentCost(destination,readSegmentCost); addReadSegmentCost(destination,readSegmentCost);
} }
} }
/** /**

View File

@ -1151,7 +1151,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
for( final GATKSAMRecord read : reads ) { for( final GATKSAMRecord read : reads ) {
returnMap.get(read.getReadGroup().getSample()).add(read); returnMap.get(read.getReadGroup().getSample()).add(read);
} }
return returnMap; return returnMap;
} }

View File

@ -45,6 +45,7 @@
*/ */
package org.broadinstitute.sting.gatk.walkers.haplotypecaller; package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Comparator; import java.util.Comparator;
@ -53,6 +54,10 @@ import java.util.Comparator;
* A pair read-likelihood (cost). * A pair read-likelihood (cost).
*/ */
public class ReadCost { public class ReadCost {
/**
* Reference to the read record this cost makes reference to.
*/
public final GATKSAMRecord read; public final GATKSAMRecord read;
/** /**
@ -71,13 +76,12 @@ public class ReadCost {
*/ */
public ReadCost(final GATKSAMRecord r, final double initialCost) { public ReadCost(final GATKSAMRecord r, final double initialCost) {
if (r == null) throw new NullPointerException(); if (r == null) throw new NullPointerException();
if (Double.isNaN(initialCost) || Double.isInfinite(initialCost) || initialCost > 0) if (!MathUtils.goodLog10Probability(initialCost))
throw new IllegalArgumentException("initial cost must be a finite 0 or negative value (" + initialCost + ")"); throw new IllegalArgumentException("initial cost must be a finite 0 or negative value (" + initialCost + ")");
read = r; read = r;
cost = initialCost; cost = initialCost;
} }
/** /**
* Comparator used to sort ReadCosts * Comparator used to sort ReadCosts
*/ */
@ -90,15 +94,15 @@ public class ReadCost {
} }
}; };
/** /**
* Add to the cost. * Add to the cost.
* @param value value to add. * @param value value to add.
*/ */
public void addCost(final double value) { public void addCost(final double value) {
if (cost + value > 0) final double previousCost = cost;
throw new IllegalArgumentException("value brings cost over 0. Current cost " + cost + " value " + value);
cost += value; cost += value;
if (!MathUtils.goodLog10Probability(cost))
throw new IllegalArgumentException("invalid log10 likelihood value (" + cost + ") after adding (" + value + ") to (" + previousCost + ")");
} }
/** /**
@ -108,5 +112,4 @@ public class ReadCost {
public double getCost() { public double getCost() {
return cost; return cost;
} }
} }

View File

@ -45,10 +45,10 @@
*/ */
package org.broadinstitute.sting.gatk.walkers.haplotypecaller; package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
@ -63,7 +63,17 @@ import java.util.concurrent.atomic.AtomicLong;
*/ */
class ReadSegmentCost { class ReadSegmentCost {
/**
* Reference to the path the cost makes reference to. Public and writable for convenience, notice that this is not
* a public class.
*/
public Route<MultiDeBruijnVertex, MultiSampleEdge> path; public Route<MultiDeBruijnVertex, MultiSampleEdge> path;
/**
* Reference to the read the cost makes reference to. Public and writable for convenience, notice that this is not
* a public class.
*/
public GATKSAMRecord read; public GATKSAMRecord read;
/** /**
@ -78,23 +88,38 @@ class ReadSegmentCost {
/** /**
* Construct a new path cost. * Construct a new path cost.
*
* @param read the corresponding read. * @param read the corresponding read.
* @param path the corresponding path. * @param path the corresponding path.
* @param cost initial cost estimate. Might be updated later. * @param cost initial cost estimate. Might be updated later.
*/ */
@Requires("route != null")
public ReadSegmentCost(final GATKSAMRecord read, public ReadSegmentCost(final GATKSAMRecord read,
final Route<MultiDeBruijnVertex, MultiSampleEdge> path, double cost) { final Route<MultiDeBruijnVertex, MultiSampleEdge> path, final double cost) {
if (read == null)
throw new IllegalArgumentException("the read provided cannot be null");
if (path == null)
throw new IllegalArgumentException("the path provided cannot be null");
this.read = read; this.read = read;
this.path = path; this.path = path;
setCost(cost); setCost(cost);
} }
/**
* Returns the cost of a read vs a haplotype.
*
* @return a valid log10 likelihood
*/
public double getCost() { public double getCost() {
return cost; return cost;
} }
/**
* Changes the cost of the current path vs read combination.
* @param value
*/
public void setCost(final double value) { public void setCost(final double value) {
if (!MathUtils.goodLog10Probability(value))
throw new IllegalArgumentException("infinite cost are not allowed");
cost = value; cost = value;
} }
@ -110,7 +135,7 @@ class ReadSegmentCost {
/** /**
* Returns the this path-cost unique identifier. * Returns the this path-cost unique identifier.
* @return * @return never {@code unique}.
*/ */
public long uniqueId() { public long uniqueId() {
if (uniqueId == null) if (uniqueId == null)

View File

@ -62,7 +62,6 @@ import static org.broadinstitute.sting.utils.pairhmm.PairHMMModel.*;
*/ */
public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
/** /**
* Initial read length capacity. * Initial read length capacity.
*/ */
@ -89,6 +88,10 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
private int maxToCol; private int maxToCol;
private int haplotypeLength; private int haplotypeLength;
private double[][] logTransition;
private boolean indelToIndelIsConstant;
/** /**
* Returns the currently loaded read base qualities. * Returns the currently loaded read base qualities.
* *
@ -140,7 +143,6 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
return readGepQuals; return readGepQuals;
} }
/** /**
* Creates a new pair-hmm calculator instance give the gap continuation penalty. * Creates a new pair-hmm calculator instance give the gap continuation penalty.
* *
@ -374,8 +376,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
*/ */
public void loadRead(final byte[] readBases, final byte[] readQuals, final byte[] readInsQuals, final byte[] readDelQuals, int mq) { public void loadRead(final byte[] readBases, final byte[] readQuals, final byte[] readInsQuals, final byte[] readDelQuals, int mq) {
// TODO This is a copy&paste from PairHMM*Engine read data preparation code. // TODO This is a copy&paste from PairHMM*Engine read data preparation code.
// TODO It is simply to difficult to share the code without changing that class and I don't want // TODO It is simply to difficult to share the code without changing that class
// TODO to do so for now.
if (readBases.length != readQuals.length) throw new IllegalArgumentException("the read quality array length does not match the read base array length"); if (readBases.length != readQuals.length) throw new IllegalArgumentException("the read quality array length does not match the read base array length");
if (readBases.length != readInsQuals.length) throw new IllegalArgumentException("the read insert quality array length does not match the read base array length"); if (readBases.length != readInsQuals.length) throw new IllegalArgumentException("the read insert quality array length does not match the read base array length");
if (readBases.length != readDelQuals.length) throw new IllegalArgumentException("the read deletion quality length does not match the read base array length"); if (readBases.length != readDelQuals.length) throw new IllegalArgumentException("the read deletion quality length does not match the read base array length");
@ -402,12 +403,28 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
this.readInsQuals = readInsQuals; this.readInsQuals = readInsQuals;
this.readDelQuals = readDelQuals; this.readDelQuals = readDelQuals;
this.readGepQuals = overallGCP; this.readGepQuals = overallGCP;
initializeProbabilities(transition,readInsQuals, readDelQuals, overallGCP); initializeProbabilities(transition,readInsQuals, readDelQuals, overallGCP);
PairHMMModel.qualToTransProbsLog10(logTransition,readInsQuals,readDelQuals,overallGCP);
indelToIndelIsConstant = true;
for (final double d : overallGCP)
if (d != overallGCP[0]) {
indelToIndelIsConstant = false;
break;
}
if (haplotypeBases != null) if (haplotypeBases != null)
fillPriorsTable(0); fillPriorsTable(0);
cachedResults.clear(); cachedResults.clear();
} }
@Override
public void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
super.initialize(readMaxLength,haplotypeMaxLength);
logTransition = PairHMMModel.createTransitionMatrix(readMaxLength);
}
@Override @Override
public void loadHaplotypeBases(final byte[] haplotypeBases) { public void loadHaplotypeBases(final byte[] haplotypeBases) {
if (readBases == null) if (readBases == null)
@ -587,30 +604,27 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
* Fast likelihood when the pair-hmm represents a deletion in the read. * Fast likelihood when the pair-hmm represents a deletion in the read.
*/ */
private double calculateLocalLikelihoodDeletion(final int readStart, final int hapStart, final int hapEnd) { private double calculateLocalLikelihoodDeletion(final int readStart, final int hapStart, final int hapEnd) {
double result = INITIAL_CONDITION; if (readStart == 0 || readStart >= readBases.length) // Deletion at the beginning or end not have a cost.
if (readStart > 0) { // no penalty if at the beginning. return 0;
result *= transition[readStart][matchToDeletion]; return logTransition[readStart + 1][matchToDeletion]
result *= + logTransition[readStart + 1][deletionToDeletion] * (hapEnd - hapStart - 1);
StrictMath.pow(transition[readStart][deletionToDeletion],hapEnd - hapStart - 1);
result *= transition[readStart][indelToMatch];
} }
return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
}
/** /**
* Fast likelihood when the pair-hmm represents a insertion in the read. * Fast likelihood when the pair-hmm represents a insertion in the read.
*/ */
private double calculateLocalLikelihoodInsertion(final int readStart, final int readEnd) { private double calculateLocalLikelihoodInsertion(final int readStart, final int readEnd) {
double result = INITIAL_CONDITION; double result = logTransition[readStart + 1][matchToInsertion];
result *= transition[readStart + 1][matchToInsertion];
for (int i = readStart + 1; i < readEnd; i++) { if (indelToIndelIsConstant)
result *= transition[i + 1][insertionToInsertion]; result += logTransition[readStart + 1][insertionToInsertion] * (readEnd - readStart - 1);
} else
if (readEnd < readBases.length) { for (int i = readStart + 1; i < readEnd; i++)
result *= transition[readEnd + 1][indelToMatch]; result += logTransition[i + 1][insertionToInsertion];
}
return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; if (readEnd < readBases.length)
result += logTransition[readEnd + 1][indelToMatch];
return result;
} }
/** /**

View File

@ -99,12 +99,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerGraphBasedSingleSample() { public void testHaplotypeCallerGraphBasedSingleSample() {
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "83fe0694621bc1e0240f6f79eb6d6999"); HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "f87bb723bfb9b8f604db1d53624d24e3");
} }
@Test @Test
public void testHaplotypeCallerGraphBasedMultiSample() { public void testHaplotypeCallerGraphBasedMultiSample() {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "d45b2b26434dd3bd48df5a43b3d2954a"); HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "891295d8795c0ac894a2aec873e2a1d4");
} }
@Test(enabled = false) // can't annotate the rsID's yet @Test(enabled = false) // can't annotate the rsID's yet
@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGSGraphBased() { public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("6f6213bfc62e9cd9db56a7277ebe04e0")); Arrays.asList("d2bb1a904ad419c565b00c3ac98048d1"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
} }

View File

@ -55,6 +55,7 @@ abstract class N2MemoryPairHMM extends PairHMM {
* @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM
* @param readMaxLength the max length of reads we want to use with this PairHMM * @param readMaxLength the max length of reads we want to use with this PairHMM
*/ */
@Override
public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { public void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
super.initialize(readMaxLength, haplotypeMaxLength); super.initialize(readMaxLength, haplotypeMaxLength);