diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 01e696237..7d3e7047d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -72,16 +72,4 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { double[][] log10AlleleFrequencyPriors, double[][] log10AlleleFrequencyLikelihoods, double[][] log10AlleleFrequencyPosteriors); - - /** - * Can be overridden by concrete subclasses - * @param vc variant context with genotype likelihoods - * @param log10AlleleFrequencyLikelihoods allele frequency results - * @param AFofMaxLikelihood allele frequency of max likelihood - * - * @return calls - */ - protected abstract GenotypesContext assignGenotypes(VariantContext vc, - double[][] log10AlleleFrequencyLikelihoods, - int AFofMaxLikelihood); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index dccc2c02c..8fbc9f178 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -27,9 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; @@ -40,9 +38,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private final static boolean DEBUG = false; private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. - - private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); private final boolean USE_MULTI_ALLELIC_CALCULATION; @@ -73,7 +68,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( sample.hasLikelihoods() ) { double[] gls = sample.getLikelihoods().getAsVector(); - if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL) + if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL ) genotypeLikelihoods.add(gls); } } @@ -584,87 +579,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { return coeff; } - - /** - * Can be overridden by concrete subclasses - * @param vc variant context with genotype likelihoods - * @param log10AlleleFrequencyLikelihoods likelihoods - * @param AFofMaxLikelihood allele frequency of max likelihood - * - * @return calls - */ - public GenotypesContext assignGenotypes(VariantContext vc, - double[][] log10AlleleFrequencyLikelihoods, - int AFofMaxLikelihood) { - if ( !vc.isVariant() ) - throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart()); - - GenotypesContext GLs = vc.getGenotypes(); - double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1]; - - ArrayList sampleIndices = new ArrayList(); - - // todo - optimize initialization - for (int k=0; k <= AFofMaxLikelihood; k++) - for (int j=0; j <= GLs.size(); j++) - pathMetricArray[j][k] = -1e30; - - pathMetricArray[0][0] = 0.0; - - sampleIndices.addAll(GLs.getSampleNamesOrderedByName()); - - GenotypesContext calls = GenotypesContext.create(); - - for (int k = GLs.size(); k > 0; k--) { - int bestGTguess; - String sample = sampleIndices.get(k-1); - Genotype g = GLs.get(sample); - if ( !g.hasLikelihoods() ) - continue; - - ArrayList myAlleles = new ArrayList(); - - double[] likelihoods = g.getLikelihoods().getAsVector(); - - // if there is no mass on the likelihoods, then just no-call the sample - if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) { - calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); - continue; - } - - bestGTguess = Utils.findIndexOfMaxEntry(likelihoods); - - // likelihoods are stored row-wise in lower triangular matrix. IE - // for 2 alleles they have ordering AA,AB,BB - // for 3 alleles they are ordered AA,AB,BB,AC,BC,CC - // Get now alleles corresponding to best index - int kk=0; - boolean done = false; - for (int j=0; j < vc.getNAlleles(); j++) { - for (int i=0; i <= j; i++){ - if (kk++ == bestGTguess) { - if (i==0) - myAlleles.add(vc.getReference()); - else - myAlleles.add(vc.getAlternateAllele(i-1)); - - if (j==0) - myAlleles.add(vc.getReference()); - else - myAlleles.add(vc.getAlternateAllele(j-1)); - done = true; - break; - } - - } - if (done) - break; - } - - final double qual = GenotypeLikelihoods.getQualFromLikelihoods(bestGTguess, likelihoods); - calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); - } - - return calls; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 4821b3eb8..918f83514 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -59,6 +59,9 @@ public class UnifiedGenotyperEngine { EMIT_ALL_SITES } + protected static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + protected static final double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. + // the unified argument collection private final UnifiedArgumentCollection UAC; public UnifiedArgumentCollection getUAC() { return UAC; } @@ -327,18 +330,21 @@ public class UnifiedGenotyperEngine { clearAFarray(log10AlleleFrequencyPosteriors.get()); afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); - // find the most likely frequency - int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[0]); + // TODO -- this is not the right thing mathematically to do! In a case of B=1,C=0 the likelihoods would get added to both AC=0 and AC=1 + double[] collapsedPosteriors = collapseAFarrays(log10AlleleFrequencyPosteriors.get(), vc.getAlternateAlleles().size()); + + // is the most likely frequency conformation AC=0 for all alternate alleles? + boolean bestGuessIsRef = MathUtils.maxElementIndex(collapsedPosteriors) == 0; // calculate p(f>0) - double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()[0]); + double[] normalizedPosteriors = MathUtils.normalizeFromLog10(collapsedPosteriors); double sum = 0.0; for (int i = 1; i <= N; i++) sum += normalizedPosteriors[i]; double PofF = Math.min(sum, 1.0); // deal with precision errors double phredScaledConfidence; - if ( bestAFguess != 0 || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); if ( Double.isInfinite(phredScaledConfidence) ) phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0][0]; @@ -356,21 +362,46 @@ public class UnifiedGenotyperEngine { } // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero - if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) { + if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { // technically, at this point our confidence in a reference call isn't accurately estimated // because it didn't take into account samples with no data, so let's get a better estimate return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF); } - // strip out the alternate allele(s) if we're making a ref call - Set myAlleles = new HashSet(vc.getAlleles()); - if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { - myAlleles = new HashSet(1); + // strip out any alleles that aren't going to be used + Set myAlleles; + boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()]; + if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { + myAlleles = new HashSet(vc.getAlleles().size()); myAlleles.add(vc.getReference()); + + // if we're making a reference call then we keep just the ref allele, otherwise we need to determine which ones are okay + if ( !bestGuessIsRef ) { + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { + if ( MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]) != 0 ) { + myAlleles.add(vc.getAlternateAllele(i)); + altAllelesToUse[i] = true; + } + } + } + } else { + // use all of the alleles if we are given them by the user + myAlleles = new HashSet(vc.getAlleles()); + for ( int i = 0; i < altAllelesToUse.length; i++ ) + altAllelesToUse[i] = true; } + // start constructing the resulting VC + GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); + VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); + builder.log10PError(phredScaledConfidence/-10.0); + if ( ! passesCallThreshold(phredScaledConfidence) ) + builder.filters(filter); + if ( !limitedContext ) + builder.referenceBaseForIndel(refContext.getBase()); + // create the genotypes - GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyLikelihoods.get(), bestAFguess); + GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse); // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) @@ -383,7 +414,7 @@ public class UnifiedGenotyperEngine { if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); - if ( UAC.COMPUTE_SLOD && !limitedContext && bestAFguess != 0 ) { + if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { //final boolean DEBUG_SLOD = false; // the overall lod @@ -428,15 +459,9 @@ public class UnifiedGenotyperEngine { attributes.put("SB", strandScore); } - GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); - - VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); + // finish constructing the resulting VC builder.genotypes(genotypes); - builder.log10PError(phredScaledConfidence/-10.0); - if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter); builder.attributes(attributes); - if ( !limitedContext ) - builder.referenceBaseForIndel(refContext.getBase()); VariantContext vcCall = builder.make(); if ( annotationEngine != null && !limitedContext ) { @@ -454,6 +479,21 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } + private static double[] collapseAFarrays(double[][] original, int numDimensions) { + int size = original[0].length; + double[] newArray = new double[size]; + for ( int i = 0; i < size; i++) + newArray[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + + for ( int i = 0; i < numDimensions; i++ ) { + for ( int j = 0; j < size; j++ ) { + newArray[j] = ExactAFCalculationModel.approximateLog10SumLog10(newArray[j], original[i][j]); + } + } + + return newArray; + } + private int calculateEndPos(Collection alleles, Allele refAllele, GenomeLoc loc) { // TODO - temp fix until we can deal with extended events properly // for indels, stop location is one more than ref allele length @@ -634,8 +674,8 @@ public class UnifiedGenotyperEngine { verboseWriter.println(); } - protected boolean passesEmitThreshold(double conf, int bestAFguess) { - return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); + protected boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) { + return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); } protected boolean passesCallThreshold(double conf) { @@ -780,4 +820,88 @@ public class UnifiedGenotyperEngine { return vc; } + + /** + * @param vc variant context with genotype likelihoods + * @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use + * + * @return genotypes + */ + public GenotypesContext assignGenotypes(VariantContext vc, + boolean[] allelesToUse) { + + final GenotypesContext GLs = vc.getGenotypes(); + + final List sampleIndices = GLs.getSampleNamesOrderedByName(); + + final GenotypesContext calls = GenotypesContext.create(); + + for ( int k = GLs.size() - 1; k >= 0; k-- ) { + final String sample = sampleIndices.get(k); + final Genotype g = GLs.get(sample); + if ( !g.hasLikelihoods() ) + continue; + + final double[] likelihoods = g.getLikelihoods().getAsVector(); + + // if there is no mass on the likelihoods, then just no-call the sample + if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) { + calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); + continue; + } + + // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. + // so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD. + + final int numAltAlleles = allelesToUse.length; + + // start with the assumption that the ideal genotype is homozygous reference + Allele maxAllele1 = vc.getReference(), maxAllele2 = vc.getReference(); + double maxLikelihoodSeen = likelihoods[0]; + int indexOfMax = 0; + + // keep track of some state + Allele firstAllele = vc.getReference(); + int subtractor = numAltAlleles + 1; + int subtractionsMade = 0; + + for ( int i = 1, PLindex = 1; i < likelihoods.length; i++, PLindex++ ) { + if ( PLindex == subtractor ) { + firstAllele = vc.getAlternateAllele(subtractionsMade); + PLindex -= subtractor; + subtractor--; + subtractionsMade++; + + // we can skip this allele if it's not usable + if ( !allelesToUse[subtractionsMade-1] ) { + i += subtractor - 1; + PLindex += subtractor - 1; + continue; + } + } + + // we don't care about the entry if we've already seen better + if ( likelihoods[i] <= maxLikelihoodSeen ) + continue; + + // if it's usable then update the alleles + int alleleIndex = subtractionsMade + PLindex - 1; + if ( allelesToUse[alleleIndex] ) { + maxAllele1 = firstAllele; + maxAllele2 = vc.getAlternateAllele(alleleIndex); + maxLikelihoodSeen = likelihoods[i]; + indexOfMax = i; + } + } + + ArrayList myAlleles = new ArrayList(); + myAlleles.add(maxAllele1); + myAlleles.add(maxAllele2); + + final double qual = GenotypeLikelihoods.getQualFromLikelihoods(indexOfMax, likelihoods); + calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); + } + + return calls; + } }