a)New variant eval stratification module for indel size. b) Next iteration on indel caller runtime optimization: when computing likelihood of each haplotype for a given read, many computations will be redundant since pieces of haplotypes will be common to both REF and ALT haplotypes. So, we keep HMM matrices from one haplotype to the next one and recompute starting at the part where either haplotype is different or GOP/GCP are different.

This commit is contained in:
Guillermo del Angel 2011-10-25 09:56:43 -04:00
parent bf61393a7d
commit b559936b7a
2 changed files with 227 additions and 109 deletions

View File

@ -32,11 +32,17 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.io.File;
import java.io.FileWriter;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
@ -45,9 +51,6 @@ import java.util.LinkedHashMap;
public class PairHMMIndelErrorModel { public class PairHMMIndelErrorModel {
public static final int BASE_QUAL_THRESHOLD = 20; public static final int BASE_QUAL_THRESHOLD = 20;
private final double logGapOpenProbability;
private final double logGapContinuationProbability;
private boolean DEBUG = false; private boolean DEBUG = false;
private boolean bandedLikelihoods = false; private boolean bandedLikelihoods = false;
@ -89,8 +92,6 @@ public class PairHMMIndelErrorModel {
} }
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean bandedLikelihoods) { public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean bandedLikelihoods) {
this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob
this.logGapContinuationProbability = -indelGCP/10.0; // QUAL to log prob
this.DEBUG = deb; this.DEBUG = deb;
this.bandedLikelihoods = bandedLikelihoods; this.bandedLikelihoods = bandedLikelihoods;
@ -98,13 +99,14 @@ public class PairHMMIndelErrorModel {
this.GAP_CONT_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; this.GAP_CONT_PROB_TABLE = new double[MAX_HRUN_GAP_IDX];
this.GAP_OPEN_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; this.GAP_OPEN_PROB_TABLE = new double[MAX_HRUN_GAP_IDX];
double gop = -indelGOP/10.0;
double gcp = -indelGCP/10.0;
for (int i = 0; i < START_HRUN_GAP_IDX; i++) { for (int i = 0; i < START_HRUN_GAP_IDX; i++) {
GAP_OPEN_PROB_TABLE[i] = logGapOpenProbability; GAP_OPEN_PROB_TABLE[i] = gop;
GAP_CONT_PROB_TABLE[i] = logGapContinuationProbability; GAP_CONT_PROB_TABLE[i] = gcp;
} }
double gop = logGapOpenProbability;
double gcp = logGapContinuationProbability;
double step = GAP_PENALTY_HRUN_STEP/10.0; double step = GAP_PENALTY_HRUN_STEP/10.0;
double maxGOP = -MIN_GAP_OPEN_PENALTY/10.0; // phred to log prob double maxGOP = -MIN_GAP_OPEN_PENALTY/10.0; // phred to log prob
@ -185,19 +187,15 @@ public class PairHMMIndelErrorModel {
} }
private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals,
double[] currentGOP, double[] currentGCP, int eventLength) { double[] currentGOP, double[] currentGCP, int indToStart,
double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) {
final int X_METRIC_LENGTH = readBases.length+1; final int X_METRIC_LENGTH = readBases.length+1;
final int Y_METRIC_LENGTH = haplotypeBases.length+1; final int Y_METRIC_LENGTH = haplotypeBases.length+1;
// initialize path metric and traceback memories for likelihood computation if (indToStart == 0) {
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double DIAG_TOL = 20; // means that max - min element in diags have to be > this number for banding to take effect.
// default initialization for all arrays // default initialization for all arrays
for (int i=0; i < X_METRIC_LENGTH; i++) { for (int i=0; i < X_METRIC_LENGTH; i++) {
Arrays.fill(matchMetricArray[i],Double.NEGATIVE_INFINITY); Arrays.fill(matchMetricArray[i],Double.NEGATIVE_INFINITY);
Arrays.fill(YMetricArray[i],Double.NEGATIVE_INFINITY); Arrays.fill(YMetricArray[i],Double.NEGATIVE_INFINITY);
@ -215,16 +213,18 @@ public class PairHMMIndelErrorModel {
} }
matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY; matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY;
XMetricArray[0][0]= YMetricArray[0][0] = 0; XMetricArray[0][0]= YMetricArray[0][0] = 0;
}
if (bandedLikelihoods) {
final double DIAG_TOL = 20; // means that max - min element in diags have to be > this number for banding to take effect.
final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH -1; final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH -1;
final int elemsInDiag = Math.min(X_METRIC_LENGTH, Y_METRIC_LENGTH); final int elemsInDiag = Math.min(X_METRIC_LENGTH, Y_METRIC_LENGTH);
int idxWithMaxElement = 0; int idxWithMaxElement = 0;
double maxElementInDiag = Double.NEGATIVE_INFINITY; for (int diag=indToStart; diag < numDiags; diag++) {
for (int diag=0; diag < numDiags; diag++) {
// compute default I and J start positions at edge of diagonals // compute default I and J start positions at edge of diagonals
int indI = 0; int indI = 0;
int indJ = diag; int indJ = diag;
@ -234,11 +234,10 @@ public class PairHMMIndelErrorModel {
} }
// first pass: from max element to edge // first pass: from max element to edge
int idxLow = bandedLikelihoods? idxWithMaxElement : 0; int idxLow = idxWithMaxElement;
// reset diag max value before starting // reset diag max value before starting
if (bandedLikelihoods) { double maxElementInDiag = Double.NEGATIVE_INFINITY;
maxElementInDiag = Double.NEGATIVE_INFINITY;
// set indI, indJ to correct values // set indI, indJ to correct values
indI += idxLow; indI += idxLow;
indJ -= idxLow; indJ -= idxLow;
@ -248,13 +247,11 @@ public class PairHMMIndelErrorModel {
indJ++; indJ++;
} }
}
for (int el = idxLow; el < elemsInDiag; el++) { for (int el = idxLow; el < elemsInDiag; el++) {
updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases,
currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray);
// update max in diagonal // update max in diagonal
if (bandedLikelihoods) {
final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]);
// check if we've fallen off diagonal value by threshold // check if we've fallen off diagonal value by threshold
@ -262,9 +259,8 @@ public class PairHMMIndelErrorModel {
maxElementInDiag = bestMetric; maxElementInDiag = bestMetric;
idxWithMaxElement = el; idxWithMaxElement = el;
} }
else if (bestMetric < maxElementInDiag - DIAG_TOL) else if (bestMetric < maxElementInDiag - DIAG_TOL && idxWithMaxElement > 0)
break; // done w/current diagonal break; // done w/current diagonal
}
indI++; indI++;
if (indI >=X_METRIC_LENGTH ) if (indI >=X_METRIC_LENGTH )
@ -273,7 +269,7 @@ public class PairHMMIndelErrorModel {
if (indJ <= 0) if (indJ <= 0)
break; break;
} }
if (bandedLikelihoods && idxLow > 0) { if (idxLow > 0) {
// now do second part in opposite direction // now do second part in opposite direction
indI = 0; indI = 0;
indJ = diag; indJ = diag;
@ -310,20 +306,34 @@ public class PairHMMIndelErrorModel {
// if (DEBUG) // if (DEBUG)
// System.out.format("Max:%4.1f el:%d\n",maxElementInDiag, idxWithMaxElement); // System.out.format("Max:%4.1f el:%d\n",maxElementInDiag, idxWithMaxElement);
} }
}
else {
// simplified rectangular version of update loop
for (int indI=1; indI < X_METRIC_LENGTH; indI++) {
for (int indJ=indToStart+1; indJ < Y_METRIC_LENGTH; indJ++) {
updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases,
currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray);
}
}
}
final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1; final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1;
final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ], final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ],
XMetricArray[bestI][bestJ], XMetricArray[bestI][bestJ],
YMetricArray[bestI][bestJ]); YMetricArray[bestI][bestJ]);
/* /*
if (DEBUG) { if (DEBUG) {
PrintStream outx, outy, outm, outs; PrintStream outx, outy, outm, outs;
double[][] sumMetrics = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; double[][] sumMetrics = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
try { try {
outx = new PrintStream("../../UGOptim/datax.txt"); outx = new PrintStream("datax.txt");
outy = new PrintStream("../../UGOptim/datay.txt"); outy = new PrintStream("datay.txt");
outm = new PrintStream("../../UGOptim/datam.txt"); outm = new PrintStream("datam.txt");
outs = new PrintStream("../../UGOptim/datas.txt"); outs = new PrintStream("datas.txt");
double metrics[] = new double[3]; double metrics[] = new double[3];
for (int indI=0; indI < X_METRIC_LENGTH; indI++) { for (int indI=0; indI < X_METRIC_LENGTH; indI++) {
for (int indJ=0; indJ < Y_METRIC_LENGTH; indJ++) { for (int indJ=0; indJ < Y_METRIC_LENGTH; indJ++) {
@ -410,12 +420,14 @@ public class PairHMMIndelErrorModel {
if (read == null) if (read == null)
continue; continue;
if ( isReduced ) {
read = ReadUtils.reducedReadWithReducedQuals(read);
}
if(ReadUtils.is454Read(read)) { if(ReadUtils.is454Read(read)) {
continue; continue;
} }
double[] recalQuals = null;
// get bases of candidate haplotypes that overlap with reads // get bases of candidate haplotypes that overlap with reads
final int trailingBases = 3; final int trailingBases = 3;
@ -534,6 +546,12 @@ public class PairHMMIndelErrorModel {
unclippedReadBases.length-numEndClippedBases); unclippedReadBases.length-numEndClippedBases);
int j=0; int j=0;
// initialize path metric and traceback memories for likelihood computation
double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null;
byte[] previousHaplotypeSeen = null;
double[] previousGOP = null;
int startIdx;
for (Allele a: haplotypeMap.keySet()) { for (Allele a: haplotypeMap.keySet()) {
@ -551,11 +569,37 @@ public class PairHMMIndelErrorModel {
byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(), byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
(int)indStart, (int)indStop); (int)indStart, (int)indStop);
double readLikelihood;
if (matchMetricArray == null) {
final int X_METRIC_LENGTH = readBases.length+1;
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
}
final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
final double readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, if (previousHaplotypeSeen == null)
currentContextGOP, currentContextGCP, eventLength); startIdx = 0;
else {
int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP);
startIdx = Math.min(s1,s2);
}
previousHaplotypeSeen = haplotypeBases.clone();
previousGOP = currentContextGOP.clone();
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals,
currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray);
if (DEBUG) {
System.out.println("H:"+new String(haplotypeBases));
System.out.println("R:"+new String(readBases));
System.out.format("L:%4.2f\n",readLikelihood);
System.out.format("StPos:%d\n", startIdx);
}
readEl.put(a,readLikelihood); readEl.put(a,readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood; readLikelihoods[readIdx][j++] = readLikelihood;
} }
@ -579,6 +623,28 @@ public class PairHMMIndelErrorModel {
return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
} }
private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
if (b1.length != b2.length)
return 0; // sanity check
for (int i=0; i < b1.length; i++ ){
if ( b1[i]!= b2[i])
return i;
}
return 0; // sanity check
}
private int computeFirstDifferingPosition(double[] b1, double[] b2) {
if (b1.length != b2.length)
return 0; // sanity check
for (int i=0; i < b1.length; i++ ){
if ( b1[i]!= b2[i])
return i;
}
return 0; // sanity check
}
private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];

View File

@ -0,0 +1,52 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.List;
/**
* Stratifies the eval RODs by the indel size
*
* Indel sizes are stratified from sizes -100 to +100. Sizes greater than this are lumped in the +/- 100 bin
* This stratification ignores multi-allelic indels (whose size is not defined uniquely)
*/
public class IndelSize extends VariantStratifier {
static final int MAX_INDEL_SIZE = 100;
@Override
public void initialize() {
states = new ArrayList<String>();
for( int a=-MAX_INDEL_SIZE; a <=MAX_INDEL_SIZE; a++ ) {
states.add(String.format("%d", a));
}
}
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
ArrayList<String> relevantStates = new ArrayList<String>();
if (eval != null && eval.isIndel() && eval.isBiallelic()) {
try {
int eventLength = 0;
if ( eval.isSimpleInsertion() ) {
eventLength = eval.getAlternateAllele(0).length();
} else if ( eval.isSimpleDeletion() ) {
eventLength = -eval.getReference().length();
}
if (eventLength > MAX_INDEL_SIZE)
eventLength = MAX_INDEL_SIZE;
else if (eventLength < -MAX_INDEL_SIZE)
eventLength = -MAX_INDEL_SIZE;
relevantStates.add(String.format("%d",eventLength));
} catch (Exception e) {
return relevantStates;
}
}
return relevantStates;
}
}