a) Minor optimization to softMax() computation to avoid redundant operations, results in about 5-10% increase in speed in indel calling.

b) Added (but left commented out since it may affect integration tests and to isolate commits) fix to per-sample DP reporting, so that deletions are included in count.
c) Bug fix to avoid having non-reference genotypes assigned to samples with PL=0,0,0. Correct behavior should be to no-call these samples, and to ignore these samples when computing AC distribution since their likelihoods are not informative.
This commit is contained in:
Guillermo del Angel 2011-09-09 18:00:23 -04:00
parent b318fcba35
commit a807205fc3
3 changed files with 95 additions and 63 deletions

View File

@ -63,7 +63,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private boolean SIMPLE_GREEDY_GENOTYPER = false; private boolean SIMPLE_GREEDY_GENOTYPER = false;
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
final private ExactCalculation calcToUse; final private ExactCalculation calcToUse;
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
@ -178,22 +178,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
} }
private static final double[][] getGLs(Map<String, Genotype> GLs) { private static final ArrayList<double[]> getGLs(Map<String, Genotype> GLs) {
double[][] genotypeLikelihoods = new double[GLs.size()+1][]; ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
int j = 0; //int j = 0;
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
for ( Genotype sample : GLs.values() ) { for ( Genotype sample : GLs.values() ) {
j++;
if ( sample.hasLikelihoods() ) { if ( sample.hasLikelihoods() ) {
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector(); double[] gls = sample.getLikelihoods().getAsVector();
if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
genotypeLikelihoods.add(gls);
} }
} }
return genotypeLikelihoods; return genotypeLikelihoods;
} }
// ------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------
// //
// Linearized, ~O(N), implementation. // Linearized, ~O(N), implementation.
@ -318,9 +321,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public int linearExact(Map<String, Genotype> GLs, public int linearExact(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
final int numSamples = GLs.size(); final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples; final int numChr = 2*numSamples;
final double[][] genotypeLikelihoods = getGLs(GLs);
final ExactACCache logY = new ExactACCache(numSamples+1); final ExactACCache logY = new ExactACCache(numSamples+1);
logY.getkMinus0()[0] = 0.0; // the zero case logY.getkMinus0()[0] = 0.0; // the zero case
@ -334,14 +337,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( k == 0 ) { // special case for k = 0 if ( k == 0 ) { // special case for k = 0
for ( int j=1; j <= numSamples; j++ ) { for ( int j=1; j <= numSamples; j++ ) {
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA]; kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA];
} }
} else { // k > 0 } else { // k > 0
final double[] kMinus1 = logY.getkMinus1(); final double[] kMinus1 = logY.getkMinus1();
final double[] kMinus2 = logY.getkMinus2(); final double[] kMinus2 = logY.getkMinus2();
for ( int j=1; j <= numSamples; j++ ) { for ( int j=1; j <= numSamples; j++ ) {
final double[] gl = genotypeLikelihoods[j]; final double[] gl = genotypeLikelihoods.get(j);
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
double aa = Double.NEGATIVE_INFINITY; double aa = Double.NEGATIVE_INFINITY;
@ -434,10 +437,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( !vc.isVariant() ) if ( !vc.isVariant() )
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart()); throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
boolean multiAllelicRecord = false;
if (vc.getAlternateAlleles().size() > 1)
multiAllelicRecord = true;
Map<String, Genotype> GLs = vc.getGenotypes(); Map<String, Genotype> GLs = vc.getGenotypes();
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1]; double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
@ -454,7 +453,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
pathMetricArray[0][0] = 0.0; pathMetricArray[0][0] = 0.0;
// todo = can't deal with optimal dynamic programming solution with multiallelic records // todo = can't deal with optimal dynamic programming solution with multiallelic records
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) { if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
sampleIndices.addAll(GLs.keySet()); sampleIndices.addAll(GLs.keySet());
sampleIdx = GLs.size(); sampleIdx = GLs.size();
} }
@ -465,6 +464,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
continue; continue;
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
//System.out.print(sample.getKey()+":");
//for (int k=0; k < likelihoods.length; k++)
// System.out.format("%4.2f ",likelihoods[k]);
//System.out.println();
// all likelihoods are essentially the same: skip this sample and will later on force no call.
//sampleIdx++;
continue;
}
sampleIndices.add(sample.getKey()); sampleIndices.add(sample.getKey());
for (int k=0; k <= AFofMaxLikelihood; k++) { for (int k=0; k <= AFofMaxLikelihood; k++) {
@ -504,22 +514,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
Genotype g = GLs.get(sample); Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() ) if ( !g.hasLikelihoods() )
continue; continue;
// if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) // and will add no-call genotype to GL's in a second pass
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
else {
int newIdx = tracebackArray[k][startIdx];
bestGTguess = startIdx - newIdx;
startIdx = newIdx;
}
ArrayList<Allele> myAlleles = new ArrayList<Allele>(); ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double qual = Double.NEGATIVE_INFINITY; double qual = Double.NEGATIVE_INFINITY;
double[] likelihoods = g.getLikelihoods().getAsVector(); double[] likelihoods = g.getLikelihoods().getAsVector();
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
}
else {
int newIdx = tracebackArray[k][startIdx];;
bestGTguess = startIdx - newIdx;
startIdx = newIdx;
}
/* System.out.format("Sample: %s GL:",sample); /* System.out.format("Sample: %s GL:",sample);
for (int i=0; i < likelihoods.length; i++) for (int i=0; i < likelihoods.length; i++)
System.out.format("%1.4f ",likelihoods[i]); System.out.format("%1.4f, ",likelihoods[i]);
*/ */
for (int i=0; i < likelihoods.length; i++) { for (int i=0; i < likelihoods.length; i++) {
@ -570,6 +583,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
} }
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
if ( !sample.getValue().hasLikelihoods() )
continue;
Genotype g = GLs.get(sample.getKey());
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
continue; // regular likelihoods
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double qual = Genotype.NO_NEG_LOG_10PERROR;
myAlleles.add(Allele.NO_CALL);
myAlleles.add(Allele.NO_CALL);
//System.out.println(myAlleles.toString());
calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false));
}
return calls; return calls;
} }

View File

@ -32,7 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.genotype.Haplotype; import org.broadinstitute.sting.utils.genotype.Haplotype;
@ -413,16 +415,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (pileup != null ) { if (pileup != null ) {
double[] genotypeLikelihoods; double[] genotypeLikelihoods;
if (useOldWrongHorribleHackedUpLikelihoodModel) if (useOldWrongHorribleHackedUpLikelihoodModel)
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap); genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
else else
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
// which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example
// for 3 alleles it's 00 01 11 02 12 22
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
alleleList, alleleList,
genotypeLikelihoods, genotypeLikelihoods,
getFilteredDepth(pileup))); getFilteredDepth(pileup)));
@ -444,4 +444,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
return indelLikelihoodMap.get(); return indelLikelihoodMap.get();
} }
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
// so that per-sample DP will include deletions covering the event.
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for ( PileupElement p : pileup ) {
if (/*p.isDeletion() ||*/ BaseUtils.isRegularBase(p.getBase()) )
count++;
}
return count;
}
} }

View File

@ -1056,42 +1056,30 @@ public class MathUtils {
} }
static public double softMax(final double x, final double y) { static public double softMax(final double x, final double y) {
if (Double.isInfinite(x)) // we need to compute log10(10^x + 10^y)
return y; // By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
if (Double.isInfinite(y)) // slow exact version:
return x; // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
if (y >= x + MAX_JACOBIAN_TOLERANCE) double diff = x-y;
return y;
if (x >= y + MAX_JACOBIAN_TOLERANCE)
return x;
// OK, so |y-x| < tol: we use the following identity then: if (diff > MAX_JACOBIAN_TOLERANCE)
// we need to compute log10(10^x + 10^y) return x;
// By Jacobian logarithm identity, this is equal to else if (diff < -MAX_JACOBIAN_TOLERANCE)
// max(x,y) + log10(1+10^-abs(x-y)) return y;
// we compute the second term as a table lookup else if (diff >= 0) {
// with integer quantization int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
return x + jacobianLogTable[ind];
//double diff = Math.abs(x-y); }
double diff = x-y; else {
double t1 =x; int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
if (diff<0) { // return y + jacobianLogTable[ind];
t1 = y; }
diff= -diff; }
}
// t has max(x,y), diff has abs(x-y)
// we have pre-stored correction for 0,0.1,0.2,... 10.0
//int ind = (int)Math.round(diff*INV_JACOBIAN_LOG_TABLE_STEP);
int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
// gdebug+
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
//gdebug-
return t1+jacobianLogTable[ind];
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
}
public static double phredScaleToProbability (byte q) { public static double phredScaleToProbability (byte q) {
return Math.pow(10,(-q)/10.0); return Math.pow(10,(-q)/10.0);