a) New variant eval stratification module for indel size. b) Next iteration on indel caller runtime optimization: when computing the likelihood of each haplotype for a given read, many computations are redundant, since pieces of the haplotypes are common to both the REF and ALT haplotypes. So we keep the HMM matrices from one haplotype to the next and recompute starting at the first position where either the haplotype bases or the GOP/GCP values differ.

This commit is contained in:
Guillermo del Angel 2011-10-25 09:56:43 -04:00
parent bf61393a7d
commit b559936b7a
2 changed files with 227 additions and 109 deletions

View File

@ -32,11 +32,17 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.io.File;
import java.io.FileWriter;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
@ -45,9 +51,6 @@ import java.util.LinkedHashMap;
public class PairHMMIndelErrorModel {
public static final int BASE_QUAL_THRESHOLD = 20;
private final double logGapOpenProbability;
private final double logGapContinuationProbability;
private boolean DEBUG = false;
private boolean bandedLikelihoods = false;
@ -89,8 +92,6 @@ public class PairHMMIndelErrorModel {
}
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean bandedLikelihoods) {
this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob
this.logGapContinuationProbability = -indelGCP/10.0; // QUAL to log prob
this.DEBUG = deb;
this.bandedLikelihoods = bandedLikelihoods;
@ -98,13 +99,14 @@ public class PairHMMIndelErrorModel {
this.GAP_CONT_PROB_TABLE = new double[MAX_HRUN_GAP_IDX];
this.GAP_OPEN_PROB_TABLE = new double[MAX_HRUN_GAP_IDX];
double gop = -indelGOP/10.0;
double gcp = -indelGCP/10.0;
for (int i = 0; i < START_HRUN_GAP_IDX; i++) {
GAP_OPEN_PROB_TABLE[i] = logGapOpenProbability;
GAP_CONT_PROB_TABLE[i] = logGapContinuationProbability;
GAP_OPEN_PROB_TABLE[i] = gop;
GAP_CONT_PROB_TABLE[i] = gcp;
}
double gop = logGapOpenProbability;
double gcp = logGapContinuationProbability;
double step = GAP_PENALTY_HRUN_STEP/10.0;
double maxGOP = -MIN_GAP_OPEN_PENALTY/10.0; // phred to log prob
@ -185,60 +187,57 @@ public class PairHMMIndelErrorModel {
}
private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals,
double[] currentGOP, double[] currentGCP, int eventLength) {
double[] currentGOP, double[] currentGCP, int indToStart,
double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) {
final int X_METRIC_LENGTH = readBases.length+1;
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
// initialize path metric and traceback memories for likelihood computation
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
if (indToStart == 0) {
// default initialization for all arrays
final double DIAG_TOL = 20; // means that max - min element in diags have to be > this number for banding to take effect.
// default initialization for all arrays
for (int i=0; i < X_METRIC_LENGTH; i++) {
Arrays.fill(matchMetricArray[i],Double.NEGATIVE_INFINITY);
Arrays.fill(YMetricArray[i],Double.NEGATIVE_INFINITY);
Arrays.fill(XMetricArray[i],Double.NEGATIVE_INFINITY);
}
for (int i=1; i < X_METRIC_LENGTH; i++) {
//initialize first column
XMetricArray[i][0] = END_GAP_COST*(i);
}
for (int j=1; j < Y_METRIC_LENGTH; j++) {
// initialize first row
YMetricArray[0][j] = END_GAP_COST*(j);
}
matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY;
XMetricArray[0][0]= YMetricArray[0][0] = 0;
final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH -1;
final int elemsInDiag = Math.min(X_METRIC_LENGTH, Y_METRIC_LENGTH);
int idxWithMaxElement = 0;
double maxElementInDiag = Double.NEGATIVE_INFINITY;
for (int diag=0; diag < numDiags; diag++) {
// compute default I and J start positions at edge of diagonals
int indI = 0;
int indJ = diag;
if (diag >= Y_METRIC_LENGTH ) {
indI = diag-(Y_METRIC_LENGTH-1);
indJ = Y_METRIC_LENGTH-1;
for (int i=0; i < X_METRIC_LENGTH; i++) {
Arrays.fill(matchMetricArray[i],Double.NEGATIVE_INFINITY);
Arrays.fill(YMetricArray[i],Double.NEGATIVE_INFINITY);
Arrays.fill(XMetricArray[i],Double.NEGATIVE_INFINITY);
}
// first pass: from max element to edge
int idxLow = bandedLikelihoods? idxWithMaxElement : 0;
for (int i=1; i < X_METRIC_LENGTH; i++) {
//initialize first column
XMetricArray[i][0] = END_GAP_COST*(i);
}
// reset diag max value before starting
if (bandedLikelihoods) {
maxElementInDiag = Double.NEGATIVE_INFINITY;
for (int j=1; j < Y_METRIC_LENGTH; j++) {
// initialize first row
YMetricArray[0][j] = END_GAP_COST*(j);
}
matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY;
XMetricArray[0][0]= YMetricArray[0][0] = 0;
}
if (bandedLikelihoods) {
final double DIAG_TOL = 20; // means that max - min element in diags have to be > this number for banding to take effect.
final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH -1;
final int elemsInDiag = Math.min(X_METRIC_LENGTH, Y_METRIC_LENGTH);
int idxWithMaxElement = 0;
for (int diag=indToStart; diag < numDiags; diag++) {
// compute default I and J start positions at edge of diagonals
int indI = 0;
int indJ = diag;
if (diag >= Y_METRIC_LENGTH ) {
indI = diag-(Y_METRIC_LENGTH-1);
indJ = Y_METRIC_LENGTH-1;
}
// first pass: from max element to edge
int idxLow = idxWithMaxElement;
// reset diag max value before starting
double maxElementInDiag = Double.NEGATIVE_INFINITY;
// set indI, indJ to correct values
indI += idxLow;
indJ -= idxLow;
@ -248,46 +247,10 @@ public class PairHMMIndelErrorModel {
indJ++;
}
}
for (int el = idxLow; el < elemsInDiag; el++) {
updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases,
currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray);
// update max in diagonal
if (bandedLikelihoods) {
final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]);
// check if we've fallen off diagonal value by threshold
if (bestMetric > maxElementInDiag) {
maxElementInDiag = bestMetric;
idxWithMaxElement = el;
}
else if (bestMetric < maxElementInDiag - DIAG_TOL)
break; // done w/current diagonal
}
indI++;
if (indI >=X_METRIC_LENGTH )
break;
indJ--;
if (indJ <= 0)
break;
}
if (bandedLikelihoods && idxLow > 0) {
// now do second part in opposite direction
indI = 0;
indJ = diag;
if (diag >= Y_METRIC_LENGTH ) {
indI = diag-(Y_METRIC_LENGTH-1);
indJ = Y_METRIC_LENGTH-1;
}
indI += idxLow-1;
indJ -= idxLow-1;
for (int el = idxLow-1; el >= 0; el--) {
for (int el = idxLow; el < elemsInDiag; el++) {
updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases,
currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray);
currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray);
// update max in diagonal
final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]);
@ -296,34 +259,81 @@ public class PairHMMIndelErrorModel {
maxElementInDiag = bestMetric;
idxWithMaxElement = el;
}
else if (bestMetric < maxElementInDiag - DIAG_TOL)
else if (bestMetric < maxElementInDiag - DIAG_TOL && idxWithMaxElement > 0)
break; // done w/current diagonal
indJ++;
if (indJ >= Y_METRIC_LENGTH )
indI++;
if (indI >=X_METRIC_LENGTH )
break;
indI--;
if (indI <= 0)
indJ--;
if (indJ <= 0)
break;
}
if (idxLow > 0) {
// now do second part in opposite direction
indI = 0;
indJ = diag;
if (diag >= Y_METRIC_LENGTH ) {
indI = diag-(Y_METRIC_LENGTH-1);
indJ = Y_METRIC_LENGTH-1;
}
indI += idxLow-1;
indJ -= idxLow-1;
for (int el = idxLow-1; el >= 0; el--) {
updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases,
currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray);
// update max in diagonal
final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]);
// check if we've fallen off diagonal value by threshold
if (bestMetric > maxElementInDiag) {
maxElementInDiag = bestMetric;
idxWithMaxElement = el;
}
else if (bestMetric < maxElementInDiag - DIAG_TOL)
break; // done w/current diagonal
indJ++;
if (indJ >= Y_METRIC_LENGTH )
break;
indI--;
if (indI <= 0)
break;
}
}
// if (DEBUG)
// System.out.format("Max:%4.1f el:%d\n",maxElementInDiag, idxWithMaxElement);
}
// if (DEBUG)
// System.out.format("Max:%4.1f el:%d\n",maxElementInDiag, idxWithMaxElement);
}
else {
// simplified rectangular version of update loop
for (int indI=1; indI < X_METRIC_LENGTH; indI++) {
for (int indJ=indToStart+1; indJ < Y_METRIC_LENGTH; indJ++) {
updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases,
currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray);
}
}
}
final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1;
final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ],
XMetricArray[bestI][bestJ],
YMetricArray[bestI][bestJ]);
/*
/*
if (DEBUG) {
PrintStream outx, outy, outm, outs;
double[][] sumMetrics = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
try {
outx = new PrintStream("../../UGOptim/datax.txt");
outy = new PrintStream("../../UGOptim/datay.txt");
outm = new PrintStream("../../UGOptim/datam.txt");
outs = new PrintStream("../../UGOptim/datas.txt");
outx = new PrintStream("datax.txt");
outy = new PrintStream("datay.txt");
outm = new PrintStream("datam.txt");
outs = new PrintStream("datas.txt");
double metrics[] = new double[3];
for (int indI=0; indI < X_METRIC_LENGTH; indI++) {
for (int indJ=0; indJ < Y_METRIC_LENGTH; indJ++) {
@ -410,12 +420,14 @@ public class PairHMMIndelErrorModel {
if (read == null)
continue;
if ( isReduced ) {
read = ReadUtils.reducedReadWithReducedQuals(read);
}
if(ReadUtils.is454Read(read)) {
continue;
}
double[] recalQuals = null;
// get bases of candidate haplotypes that overlap with reads
final int trailingBases = 3;
@ -534,6 +546,12 @@ public class PairHMMIndelErrorModel {
unclippedReadBases.length-numEndClippedBases);
int j=0;
// initialize path metric and traceback memories for likelihood computation
double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null;
byte[] previousHaplotypeSeen = null;
double[] previousGOP = null;
int startIdx;
for (Allele a: haplotypeMap.keySet()) {
@ -551,11 +569,37 @@ public class PairHMMIndelErrorModel {
byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
(int)indStart, (int)indStop);
double readLikelihood;
if (matchMetricArray == null) {
final int X_METRIC_LENGTH = readBases.length+1;
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
}
final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
final double readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals,
currentContextGOP, currentContextGCP, eventLength);
if (previousHaplotypeSeen == null)
startIdx = 0;
else {
int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP);
startIdx = Math.min(s1,s2);
}
previousHaplotypeSeen = haplotypeBases.clone();
previousGOP = currentContextGOP.clone();
readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals,
currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray);
if (DEBUG) {
System.out.println("H:"+new String(haplotypeBases));
System.out.println("R:"+new String(readBases));
System.out.format("L:%4.2f\n",readLikelihood);
System.out.format("StPos:%d\n", startIdx);
}
readEl.put(a,readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood;
}
@ -579,6 +623,28 @@ public class PairHMMIndelErrorModel {
return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
}
/**
 * Returns the index of the first position at which the two byte arrays differ.
 * Used to decide how much of the cached HMM matrices can be reused between
 * consecutive haplotypes: cells before this index are unchanged.
 *
 * Returns 0 (forcing the caller to recompute everything) when the arrays have
 * different lengths.
 *
 * NOTE(review): byte-for-byte identical arrays also yield 0, i.e. a full
 * recomputation, even though no cell actually changed — confirm this is the
 * intended behavior for the identical-haplotype case.
 *
 * @param b1 first array (e.g. current haplotype bases)
 * @param b2 second array (e.g. previously seen haplotype bases)
 * @return index of the first mismatch, or 0 when lengths differ or the arrays are equal
 */
private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
    if (b1.length != b2.length) {
        return 0; // length mismatch: caller must start from scratch
    }
    int idx = 0;
    while (idx < b1.length) {
        if (b1[idx] != b2[idx]) {
            return idx;
        }
        idx++;
    }
    return 0; // arrays identical: fall back to full recomputation
}
/**
 * Returns the index of the first position at which the two double arrays
 * differ (overload used for per-position gap-open penalty arrays).
 *
 * Returns 0 (forcing the caller to recompute everything) when the arrays have
 * different lengths, and also when they compare equal at every position.
 *
 * NOTE(review): uses exact floating-point inequality; the cached penalties are
 * copied verbatim from the same tables, so exact comparison is presumably
 * intended here — confirm.
 *
 * @param b1 first array (e.g. current per-base gap-open penalties)
 * @param b2 second array (e.g. previously seen gap-open penalties)
 * @return index of the first mismatch, or 0 when lengths differ or the arrays are equal
 */
private int computeFirstDifferingPosition(double[] b1, double[] b2) {
    if (b1.length != b2.length) {
        return 0; // length mismatch: caller must start from scratch
    }
    for (int idx = 0; idx < b1.length; idx++) {
        if (b1[idx] != b2[idx]) {
            return idx;
        }
    }
    return 0; // arrays identical: fall back to full recomputation
}
private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];

View File

@ -0,0 +1,52 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.List;
/**
* Stratifies the eval RODs by the indel size
*
* Indel sizes are stratified from sizes -100 to +100. Sizes greater than this are lumped in the +/- 100 bin
* This stratification ignores multi-allelic indels (whose size is not defined uniquely)
*/
/**
 * Stratifies the eval RODs by indel size.
 *
 * Sizes run from -MAX_INDEL_SIZE to +MAX_INDEL_SIZE (insertions positive,
 * deletions negative); events larger in magnitude are clamped into the
 * +/-MAX_INDEL_SIZE bin. Multi-allelic indels are ignored, since their size
 * is not uniquely defined.
 */
public class IndelSize extends VariantStratifier {
    static final int MAX_INDEL_SIZE = 100;

    @Override
    public void initialize() {
        // one state per possible (clamped) indel size
        states = new ArrayList<String>();
        for (int size = -MAX_INDEL_SIZE; size <= MAX_INDEL_SIZE; size++) {
            states.add(String.format("%d", size));
        }
    }

    public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
        ArrayList<String> relevantStates = new ArrayList<String>();
        // only biallelic indels have a uniquely defined size
        if (eval == null || !eval.isIndel() || !eval.isBiallelic())
            return relevantStates;
        try {
            int eventLength = 0;
            if (eval.isSimpleInsertion()) {
                eventLength = eval.getAlternateAllele(0).length();
            } else if (eval.isSimpleDeletion()) {
                eventLength = -eval.getReference().length();
            }
            // clamp into [-MAX_INDEL_SIZE, MAX_INDEL_SIZE]
            eventLength = Math.max(-MAX_INDEL_SIZE, Math.min(MAX_INDEL_SIZE, eventLength));
            relevantStates.add(String.format("%d", eventLength));
        } catch (Exception ignored) {
            // deliberately best-effort: on any failure no state is added
        }
        return relevantStates;
    }
}