a) Minor optimization to softMax() computation to avoid redundant operations, results in about 5-10% increase in speed in indel calling.

b) Added (but left commented out since it may affect integration tests and to isolate commits) fix to per-sample DP reporting, so that deletions are included in count.
c) Bug fix to avoid having non-reference genotypes assigned to samples with PL=0,0,0. Correct behavior should be to no-call these samples, and to ignore these samples when computing AC distribution since their likelihoods are not informative.
This commit is contained in:
Guillermo del Angel 2011-09-09 18:00:23 -04:00
parent b318fcba35
commit a807205fc3
3 changed files with 95 additions and 63 deletions

View File

@ -63,7 +63,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private boolean SIMPLE_GREEDY_GENOTYPER = false;
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
final private ExactCalculation calcToUse;
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
@ -178,22 +178,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
private static final double[][] getGLs(Map<String, Genotype> GLs) {
double[][] genotypeLikelihoods = new double[GLs.size()+1][];
private static final ArrayList<double[]> getGLs(Map<String, Genotype> GLs) {
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
int j = 0;
//int j = 0;
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
for ( Genotype sample : GLs.values() ) {
j++;
if ( sample.hasLikelihoods() ) {
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector();
double[] gls = sample.getLikelihoods().getAsVector();
if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
genotypeLikelihoods.add(gls);
}
}
return genotypeLikelihoods;
}
// -------------------------------------------------------------------------------------
//
// Linearized, ~O(N), implementation.
@ -318,9 +321,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public int linearExact(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
final int numSamples = GLs.size();
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
final double[][] genotypeLikelihoods = getGLs(GLs);
final ExactACCache logY = new ExactACCache(numSamples+1);
logY.getkMinus0()[0] = 0.0; // the zero case
@ -334,14 +337,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( k == 0 ) { // special case for k = 0
for ( int j=1; j <= numSamples; j++ ) {
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA];
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA];
}
} else { // k > 0
final double[] kMinus1 = logY.getkMinus1();
final double[] kMinus2 = logY.getkMinus2();
for ( int j=1; j <= numSamples; j++ ) {
final double[] gl = genotypeLikelihoods[j];
final double[] gl = genotypeLikelihoods.get(j);
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
double aa = Double.NEGATIVE_INFINITY;
@ -434,10 +437,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( !vc.isVariant() )
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
boolean multiAllelicRecord = false;
if (vc.getAlternateAlleles().size() > 1)
multiAllelicRecord = true;
Map<String, Genotype> GLs = vc.getGenotypes();
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
@ -454,7 +453,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
pathMetricArray[0][0] = 0.0;
// todo = can't deal with optimal dynamic programming solution with multiallelic records
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) {
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
sampleIndices.addAll(GLs.keySet());
sampleIdx = GLs.size();
}
@ -465,6 +464,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
continue;
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
//System.out.print(sample.getKey()+":");
//for (int k=0; k < likelihoods.length; k++)
// System.out.format("%4.2f ",likelihoods[k]);
//System.out.println();
// all likelihoods are essentially the same: skip this sample and will later on force no call.
//sampleIdx++;
continue;
}
sampleIndices.add(sample.getKey());
for (int k=0; k <= AFofMaxLikelihood; k++) {
@ -504,22 +514,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() )
continue;
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord)
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
else {
int newIdx = tracebackArray[k][startIdx];
bestGTguess = startIdx - newIdx;
startIdx = newIdx;
}
// if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
// and will add no-call genotype to GL's in a second pass
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double qual = Double.NEGATIVE_INFINITY;
double[] likelihoods = g.getLikelihoods().getAsVector();
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
}
else {
int newIdx = tracebackArray[k][startIdx];;
bestGTguess = startIdx - newIdx;
startIdx = newIdx;
}
/* System.out.format("Sample: %s GL:",sample);
for (int i=0; i < likelihoods.length; i++)
System.out.format("%1.4f ",likelihoods[i]);
System.out.format("%1.4f, ",likelihoods[i]);
*/
for (int i=0; i < likelihoods.length; i++) {
@ -570,6 +583,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
if ( !sample.getValue().hasLikelihoods() )
continue;
Genotype g = GLs.get(sample.getKey());
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
continue; // regular likelihoods
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double qual = Genotype.NO_NEG_LOG_10PERROR;
myAlleles.add(Allele.NO_CALL);
myAlleles.add(Allele.NO_CALL);
//System.out.println(myAlleles.toString());
calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false));
}
return calls;
}

View File

@ -32,7 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.genotype.Haplotype;
@ -413,16 +415,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (pileup != null ) {
double[] genotypeLikelihoods;
if (useOldWrongHorribleHackedUpLikelihoodModel)
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
else
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
// which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example
// for 3 alleles it's 00 01 11 02 12 22
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
alleleList,
genotypeLikelihoods,
getFilteredDepth(pileup)));
@ -444,4 +444,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
return indelLikelihoodMap.get();
}
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
// so that per-sample DP will include deletions covering the event.
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for ( PileupElement p : pileup ) {
if (/*p.isDeletion() ||*/ BaseUtils.isRegularBase(p.getBase()) )
count++;
}
return count;
}
}

View File

@ -1056,42 +1056,30 @@ public class MathUtils {
}
static public double softMax(final double x, final double y) {
if (Double.isInfinite(x))
return y;
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
if (Double.isInfinite(y))
return x;
// slow exact version:
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
if (y >= x + MAX_JACOBIAN_TOLERANCE)
return y;
if (x >= y + MAX_JACOBIAN_TOLERANCE)
return x;
double diff = x-y;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
//double diff = Math.abs(x-y);
double diff = x-y;
double t1 =x;
if (diff<0) { //
t1 = y;
diff= -diff;
}
// t has max(x,y), diff has abs(x-y)
// we have pre-stored correction for 0,0.1,0.2,... 10.0
//int ind = (int)Math.round(diff*INV_JACOBIAN_LOG_TABLE_STEP);
int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
// gdebug+
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
//gdebug-
return t1+jacobianLogTable[ind];
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
}
if (diff > MAX_JACOBIAN_TOLERANCE)
return x;
else if (diff < -MAX_JACOBIAN_TOLERANCE)
return y;
else if (diff >= 0) {
int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
return x + jacobianLogTable[ind];
}
else {
int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
return y + jacobianLogTable[ind];
}
}
public static double phredScaleToProbability (byte q) {
return Math.pow(10,(-q)/10.0);