diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index cd006a3cf..6ae437b27 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -63,7 +63,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private boolean SIMPLE_GREEDY_GENOTYPER = false; - + private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. final private ExactCalculation calcToUse; protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { @@ -178,22 +178,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } - private static final double[][] getGLs(Map GLs) { - double[][] genotypeLikelihoods = new double[GLs.size()+1][]; + private static final ArrayList getGLs(Map GLs) { + ArrayList genotypeLikelihoods = new ArrayList(); - int j = 0; + //int j = 0; + genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy for ( Genotype sample : GLs.values() ) { - j++; - if ( sample.hasLikelihoods() ) { //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); - genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector(); + double[] gls = sample.getLikelihoods().getAsVector(); + + if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL) + genotypeLikelihoods.add(gls); } } return genotypeLikelihoods; } + // ------------------------------------------------------------------------------------- // // Linearized, ~O(N), implementation. @@ -318,9 +321,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { public int linearExact(Map GLs, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { - final int numSamples = GLs.size(); + final ArrayList genotypeLikelihoods = getGLs(GLs); + final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; - final double[][] genotypeLikelihoods = getGLs(GLs); final ExactACCache logY = new ExactACCache(numSamples+1); logY.getkMinus0()[0] = 0.0; // the zero case @@ -334,14 +337,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( k == 0 ) { // special case for k = 0 for ( int j=1; j <= numSamples; j++ ) { - kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA]; + kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA]; } } else { // k > 0 final double[] kMinus1 = logY.getkMinus1(); final double[] kMinus2 = logY.getkMinus2(); for ( int j=1; j <= numSamples; j++ ) { - final double[] gl = genotypeLikelihoods[j]; + final double[] gl = genotypeLikelihoods.get(j); final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; double aa = Double.NEGATIVE_INFINITY; @@ -434,10 +437,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( !vc.isVariant() ) throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart()); - boolean multiAllelicRecord = false; - - if (vc.getAlternateAlleles().size() > 1) - multiAllelicRecord = true; Map GLs = vc.getGenotypes(); double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1]; @@ -454,7 +453,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { pathMetricArray[0][0] = 0.0; // todo = can't deal with optimal dynamic programming solution with multiallelic records - if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) { + if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { sampleIndices.addAll(GLs.keySet()); sampleIdx = GLs.size(); } @@ -465,6 +464,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { continue; double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); + + if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) { + //System.out.print(sample.getKey()+":"); + //for (int k=0; k < likelihoods.length; k++) + // System.out.format("%4.2f ",likelihoods[k]); + //System.out.println(); + // all likelihoods are essentially the same: skip this sample and will later on force no call. + //sampleIdx++; + continue; + } + sampleIndices.add(sample.getKey()); for (int k=0; k <= AFofMaxLikelihood; k++) { @@ -504,22 +514,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { Genotype g = GLs.get(sample); if ( !g.hasLikelihoods() ) continue; - - if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) - bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector()); - else { - int newIdx = tracebackArray[k][startIdx]; - bestGTguess = startIdx - newIdx; - startIdx = newIdx; - } - + // if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now, + // and will add no-call genotype to GL's in a second pass ArrayList myAlleles = new ArrayList(); double qual = Double.NEGATIVE_INFINITY; double[] likelihoods = g.getLikelihoods().getAsVector(); + + if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { + bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector()); + } + else { + int newIdx = tracebackArray[k][startIdx];; + bestGTguess = startIdx - newIdx; + startIdx = newIdx; + } + /* System.out.format("Sample: %s GL:",sample); for (int i=0; i < likelihoods.length; i++) - System.out.format("%1.4f ",likelihoods[i]); + System.out.format("%1.4f, ",likelihoods[i]); */ for (int i=0; i < likelihoods.length; i++) { @@ -570,6 +583,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } + for ( Map.Entry sample : GLs.entrySet() ) { + + if ( !sample.getValue().hasLikelihoods() ) + continue; + Genotype g = GLs.get(sample.getKey()); + + double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); + + if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL) + continue; // regular likelihoods + + ArrayList myAlleles = new ArrayList(); + + double qual = Genotype.NO_NEG_LOG_10PERROR; + myAlleles.add(Allele.NO_CALL); + myAlleles.add(Allele.NO_CALL); + //System.out.println(myAlleles.toString()); + calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false)); + } return calls; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 07f02de57..2a99f1aad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -32,7 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.genotype.Haplotype; @@ -413,16 +415,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (pileup != null ) { double[] genotypeLikelihoods; + if (useOldWrongHorribleHackedUpLikelihoodModel) genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap); else genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); - - // which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example - // for 3 alleles it's 00 01 11 02 12 22 - GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), + GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), alleleList, genotypeLikelihoods, getFilteredDepth(pileup))); @@ -444,4 +444,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return indelLikelihoodMap.get(); } + // Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup, + // so that per-sample DP will include deletions covering the event. + protected int getFilteredDepth(ReadBackedPileup pileup) { + int count = 0; + for ( PileupElement p : pileup ) { + if (/*p.isDeletion() ||*/ BaseUtils.isRegularBase(p.getBase()) ) + count++; + } + + return count; + } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index f4c057c15..0d85f9606 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1056,42 +1056,30 @@ public class MathUtils { } static public double softMax(final double x, final double y) { - if (Double.isInfinite(x)) - return y; + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup + // with integer quantization - if (Double.isInfinite(y)) - return x; + // slow exact version: + // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); - if (y >= x + MAX_JACOBIAN_TOLERANCE) - return y; - if (x >= y + MAX_JACOBIAN_TOLERANCE) - return x; + double diff = x-y; - // OK, so |y-x| < tol: we use the following identity then: - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup - // with integer quantization - - //double diff = Math.abs(x-y); - double diff = x-y; - double t1 =x; - if (diff<0) { // - t1 = y; - diff= -diff; - } - // t has max(x,y), diff has abs(x-y) - // we have pre-stored correction for 0,0.1,0.2,... 10.0 - //int ind = (int)Math.round(diff*INV_JACOBIAN_LOG_TABLE_STEP); - int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); - // gdebug+ - //double z =Math.log10(1+Math.pow(10.0,-diff)); - //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); - //gdebug- - return t1+jacobianLogTable[ind]; - // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); - } + if (diff > MAX_JACOBIAN_TOLERANCE) + return x; + else if (diff < -MAX_JACOBIAN_TOLERANCE) + return y; + else if (diff >= 0) { + int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + return x + jacobianLogTable[ind]; + } + else { + int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + return y + jacobianLogTable[ind]; + } + } public static double phredScaleToProbability (byte q) { return Math.pow(10,(-q)/10.0);