Merge pull request #580 from broadinstitute/vrr_graphbase_infinite_likelihoods_reprise

Fixed bug using GraphBased due to infinite likelihoods resulting from th...
This commit is contained in:
Valentin Ruano Rubio 2014-04-02 00:45:17 -04:00
commit 45c192bb6d
8 changed files with 87 additions and 46 deletions

1
.gitignore vendored
View File

@ -26,3 +26,4 @@ out/
/atlassian-ide-plugin.xml
maven-metadata-local.xml
dependency-reduced-pom.xml
public/external-example/target/

View File

@ -412,28 +412,28 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
for (final GATKSAMRecord read : mayNeedAdjustment) {
final Map<Allele, Double> existingLikelihoods = map.get(read);
if (existingLikelihoods != null) {
Allele bestAllele = null;
double worstRelativeLikelihood = 0;
double bestRelativeLikelihood = Double.NEGATIVE_INFINITY;
for (final Map.Entry<Allele, Double> entry : map.get(read).entrySet()) {
final double candidateRelativeLikelihood = entry.getValue();
if (candidateRelativeLikelihood > bestRelativeLikelihood) {
bestAllele = entry.getKey();
bestRelativeLikelihood = candidateRelativeLikelihood;
}
if (!Double.isInfinite(candidateRelativeLikelihood) && worstRelativeLikelihood > candidateRelativeLikelihood)
worstRelativeLikelihood = candidateRelativeLikelihood;
}
worstRelativeLikelihood += UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY;
if (bestAllele == null)
throw new IllegalStateException("No best allele for read " + read.getReadName());
if (Double.isInfinite(bestRelativeLikelihood)) {
bestRelativeLikelihood = 0;
worstRelativeLikelihood = 0;
} else {
worstRelativeLikelihood += UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY;
}
final double bestLikelihood = 0.0; // the best becomes zero.
maxAlternative.put(read, bestLikelihood);
for (final Map.Entry<Haplotype, Allele> entry : alleleVersions.entrySet()) {
final Allele a = entry.getValue();
final Double relativeLikelihoodO = existingLikelihoods.get(a);
final double relativeLikelihood = relativeLikelihoodO == null ? worstRelativeLikelihood : relativeLikelihoodO;
final double relativeLikelihood = relativeLikelihoodO == null || Double.isInfinite(relativeLikelihoodO) ? worstRelativeLikelihood : relativeLikelihoodO;
final double likelihood = relativeLikelihood - bestRelativeLikelihood + bestLikelihood;
if (likelihood > 0)
throw new IllegalStateException("Likelihood larger than 1 with read " + read.getReadName());
@ -628,7 +628,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
// Complete the read segment cost with the corresponding path bases
for (final Route<MultiDeBruijnVertex, MultiSampleEdge> p : acrossBlockPaths) {
final ReadSegmentCost readSegmentCost = new ReadSegmentCost(read, p, Double.NaN);
final ReadSegmentCost readSegmentCost = new ReadSegmentCost(read, p, 0);
pathBases[nextPath++] = readSegmentCost.bases = eventBlockPathBases(p, includePathEnds);
pathSizes.add(readSegmentCost.bases.length);
readSegmentCosts.add(readSegmentCost);
@ -652,8 +652,6 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
readSegmentCost.path = new Route<>(readSegmentCost.path,appendVertex);
addReadSegmentCost(destination,readSegmentCost);
}
}
/**

View File

@ -1151,7 +1151,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
for( final GATKSAMRecord read : reads ) {
returnMap.get(read.getReadGroup().getSample()).add(read);
}
return returnMap;
}

View File

@ -45,6 +45,7 @@
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Comparator;
@ -53,6 +54,10 @@ import java.util.Comparator;
* A pair read-likelihood (cost).
*/
public class ReadCost {
/**
* Reference to the read record this cost makes reference to.
*/
public final GATKSAMRecord read;
/**
@ -71,13 +76,12 @@ public class ReadCost {
*/
public ReadCost(final GATKSAMRecord r, final double initialCost) {
if (r == null) throw new NullPointerException();
if (Double.isNaN(initialCost) || Double.isInfinite(initialCost) || initialCost > 0)
if (!MathUtils.goodLog10Probability(initialCost))
throw new IllegalArgumentException("initial cost must be a finite 0 or negative value (" + initialCost + ")");
read = r;
cost = initialCost;
}
/**
* Comparator used to sort ReadCosts
*/
@ -90,15 +94,15 @@ public class ReadCost {
}
};
/**
* Add to the cost.
* @param value value to add.
*/
public void addCost(final double value) {
if (cost + value > 0)
throw new IllegalArgumentException("value brings cost over 0. Current cost " + cost + " value " + value);
final double previousCost = cost;
cost += value;
if (!MathUtils.goodLog10Probability(cost))
throw new IllegalArgumentException("invalid log10 likelihood value (" + cost + ") after adding (" + value + ") to (" + previousCost + ")");
}
/**
@ -108,5 +112,4 @@ public class ReadCost {
public double getCost() {
return cost;
}
}

View File

@ -45,10 +45,10 @@
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.concurrent.atomic.AtomicLong;
@ -63,7 +63,17 @@ import java.util.concurrent.atomic.AtomicLong;
*/
class ReadSegmentCost {
/**
* Reference to the path the cost makes reference to. Public and writable for convenience, notice that this is not
* a public class.
*/
public Route<MultiDeBruijnVertex, MultiSampleEdge> path;
/**
* Reference to the read the cost makes reference to. Public and writable for convenience, notice that this is not
* a public class.
*/
public GATKSAMRecord read;
/**
@ -78,23 +88,38 @@ class ReadSegmentCost {
/**
* Construct a new path cost.
*
* @param read the corresponding read.
* @param path the corresponding path.
* @param cost initial cost estimate. Might be updated later.
*/
@Requires("route != null")
public ReadSegmentCost(final GATKSAMRecord read,
final Route<MultiDeBruijnVertex, MultiSampleEdge> path, double cost) {
final Route<MultiDeBruijnVertex, MultiSampleEdge> path, final double cost) {
if (read == null)
throw new IllegalArgumentException("the read provided cannot be null");
if (path == null)
throw new IllegalArgumentException("the path provided cannot be null");
this.read = read;
this.path = path;
setCost(cost);
}
/**
* Returns the cost of a read vs a haplotype.
*
* @return a valid log10 likelihood
*/
public double getCost() {
return cost;
}
/**
* Changes the cost of the current path vs read combination.
* @param value the new cost, expressed as a log10 likelihood.
*/
public void setCost(final double value) {
if (!MathUtils.goodLog10Probability(value))
throw new IllegalArgumentException("infinite cost are not allowed");
cost = value;
}
@ -110,7 +135,7 @@ class ReadSegmentCost {
/**
* Returns this path-cost's unique identifier.
* @return
* @return the unique identifier of this path-cost.
*/
public long uniqueId() {
if (uniqueId == null)

View File

@ -62,7 +62,6 @@ import static org.broadinstitute.sting.utils.pairhmm.PairHMMModel.*;
*/
public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
/**
* Initial read length capacity.
*/
@ -89,6 +88,10 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
private int maxToCol;
private int haplotypeLength;
private double[][] logTransition;
private boolean indelToIndelIsConstant;
/**
* Returns the currently loaded read base qualities.
*
@ -140,7 +143,6 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
return readGepQuals;
}
/**
* Creates a new pair-hmm calculator instance give the gap continuation penalty.
*
@ -374,8 +376,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
*/
public void loadRead(final byte[] readBases, final byte[] readQuals, final byte[] readInsQuals, final byte[] readDelQuals, int mq) {
// TODO This is a copy&paste from PairHMM*Engine read data preparation code.
// TODO It is simply to difficult to share the code without changing that class and I don't want
// TODO to do so for now.
// TODO It is simply too difficult to share the code without changing that class
if (readBases.length != readQuals.length) throw new IllegalArgumentException("the read quality array length does not match the read base array length");
if (readBases.length != readInsQuals.length) throw new IllegalArgumentException("the read insert quality array length does not match the read base array length");
if (readBases.length != readDelQuals.length) throw new IllegalArgumentException("the read deletion quality length does not match the read base array length");
@ -402,12 +403,28 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
this.readInsQuals = readInsQuals;
this.readDelQuals = readDelQuals;
this.readGepQuals = overallGCP;
initializeProbabilities(transition,readInsQuals, readDelQuals, overallGCP);
PairHMMModel.qualToTransProbsLog10(logTransition,readInsQuals,readDelQuals,overallGCP);
indelToIndelIsConstant = true;
for (final double d : overallGCP)
if (d != overallGCP[0]) {
indelToIndelIsConstant = false;
break;
}
if (haplotypeBases != null)
fillPriorsTable(0);
cachedResults.clear();
}
@Override
public void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
super.initialize(readMaxLength,haplotypeMaxLength);
logTransition = PairHMMModel.createTransitionMatrix(readMaxLength);
}
@Override
public void loadHaplotypeBases(final byte[] haplotypeBases) {
if (readBases == null)
@ -587,30 +604,27 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
* Fast likelihood when the pair-hmm represents a deletion in the read.
*/
private double calculateLocalLikelihoodDeletion(final int readStart, final int hapStart, final int hapEnd) {
double result = INITIAL_CONDITION;
if (readStart > 0) { // no penalty if at the beginning.
result *= transition[readStart][matchToDeletion];
result *=
StrictMath.pow(transition[readStart][deletionToDeletion],hapEnd - hapStart - 1);
result *= transition[readStart][indelToMatch];
}
return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
if (readStart == 0 || readStart >= readBases.length) // Deletion at the beginning or end does not have a cost.
return 0;
return logTransition[readStart + 1][matchToDeletion]
+ logTransition[readStart + 1][deletionToDeletion] * (hapEnd - hapStart - 1);
}
/**
* Fast likelihood when the pair-hmm represents a insertion in the read.
*/
private double calculateLocalLikelihoodInsertion(final int readStart, final int readEnd) {
double result = INITIAL_CONDITION;
result *= transition[readStart + 1][matchToInsertion];
for (int i = readStart + 1; i < readEnd; i++) {
result *= transition[i + 1][insertionToInsertion];
}
if (readEnd < readBases.length) {
result *= transition[readEnd + 1][indelToMatch];
}
return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
double result = logTransition[readStart + 1][matchToInsertion];
if (indelToIndelIsConstant)
result += logTransition[readStart + 1][insertionToInsertion] * (readEnd - readStart - 1);
else
for (int i = readStart + 1; i < readEnd; i++)
result += logTransition[i + 1][insertionToInsertion];
if (readEnd < readBases.length)
result += logTransition[readEnd + 1][indelToMatch];
return result;
}
/**

View File

@ -99,12 +99,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerGraphBasedSingleSample() {
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "83fe0694621bc1e0240f6f79eb6d6999");
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "f87bb723bfb9b8f604db1d53624d24e3");
}
@Test
public void testHaplotypeCallerGraphBasedMultiSample() {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "d45b2b26434dd3bd48df5a43b3d2954a");
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "891295d8795c0ac894a2aec873e2a1d4");
}
@Test(enabled = false) // can't annotate the rsID's yet
@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("6f6213bfc62e9cd9db56a7277ebe04e0"));
Arrays.asList("d2bb1a904ad419c565b00c3ac98048d1"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}

View File

@ -55,6 +55,7 @@ abstract class N2MemoryPairHMM extends PairHMM {
* @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM
* @param readMaxLength the max length of reads we want to use with this PairHMM
*/
@Override
public void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
super.initialize(readMaxLength, haplotypeMaxLength);