From 0f728a0604c7c531b0419361673e8791b403f48b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 9 Feb 2012 14:02:34 -0500 Subject: [PATCH] The Exact model now subsets the VC to the first N alleles when the VC contains more than the maximum number of alleles (instead of throwing it out completely as it did previously). [Perhaps the culling should be done by the UG engine? But theoretically the Exact model can be called outside of the UG and we'd still want the context subsetted.] --- .../AlleleFrequencyCalculationModel.java | 18 ++- .../genotyper/ExactAFCalculationModel.java | 26 +++- .../genotyper/UnifiedGenotyperEngine.java | 147 +++++++++--------- 3 files changed, 106 insertions(+), 85 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 681cc1fa6..9f2403bbf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; import java.util.List; @@ -41,10 +41,11 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { public enum Model { /** The default model with the best performance in all cases */ - EXACT, + EXACT } protected int N; + protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE; protected Logger logger; protected PrintStream verboseWriter; @@ -53,20 +54,21 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; - protected AlleleFrequencyCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { + protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) { this.N = N; + this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES; this.logger = logger; this.verboseWriter = verboseWriter; } /** * Must be overridden by concrete subclasses - * @param GLs genotype likelihoods - * @param Alleles Alleles corresponding to GLs + * @param vc variant context with alleles and genotype likelihoods * @param log10AlleleFrequencyPriors priors * @param result (pre-allocated) object to store likelihoods results + * @return the alleles used for genotyping */ - protected abstract void getLog10PNonRef(GenotypesContext GLs, List Alleles, - double[][] log10AlleleFrequencyPriors, - AlleleFrequencyCalculationResult result); + protected abstract List getLog10PNonRef(final VariantContext vc, + final double[][] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index d604e8d62..f9518a35c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -43,14 +43,28 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { super(UAC, N, logger, verboseWriter); } - public void getLog10PNonRef(final GenotypesContext GLs, - final List alleles, - final double[][] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - final int numAlleles = alleles.size(); + public List getLog10PNonRef(final VariantContext vc, + final double[][] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + final GenotypesContext GLs = vc.getGenotypes(); + List alleles = vc.getAlleles(); + + // don't try to genotype too many alternate alleles + if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) { + logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + + alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); + alleles.add(vc.getReference()); + for ( int i = 0; i < MAX_ALTERNATE_ALLELES_TO_GENOTYPE; i++ ) + alleles.add(vc.getAlternateAllele(i)); + UnifiedGenotyperEngine.subsetAlleles(vc, alleles, false); + } //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); - linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false); + linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result, false); + + return alleles; } private static final ArrayList getGLs(GenotypesContext GLs) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 2eba6d884..aa5776007 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -295,12 +295,6 @@ public class UnifiedGenotyperEngine { } AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > UAC.MAX_ALTERNATE_ALLELES ) { - logger.warn("the Unified Genotyper is currently set to genotype at most " + UAC.MAX_ALTERNATE_ALLELES + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + vc.getAlternateAlleles().size() + " alternate alleles; see the --max_alternate_alleles argument"); - return null; - } - // estimate our confidence in a reference call and return if ( vc.getNSamples() == 0 ) { if ( limitedContext ) @@ -313,25 +307,32 @@ public class UnifiedGenotyperEngine { // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); clearAFarray(AFresult.log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + List allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); // is the most likely frequency conformation AC=0 for all alternate alleles? boolean bestGuessIsRef = true; // determine which alternate alleles have AF>0 - boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()]; + final List myAlleles = new ArrayList(vc.getAlleles().size()); + myAlleles.add(vc.getReference()); for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { - int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[i]); + final Allele alternateAllele = vc.getAlternateAllele(i); + final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele); + // the genotyping model may have stripped it out + if ( indexOfAllele == -1 ) + continue; + + int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1]); // if the most likely AC is not 0, then this is a good alternate allele to use; // make sure to test against log10PosteriorOfAFzero since that no longer is an entry in the array - if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[i][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) { - altAllelesToUse[i] = true; + if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) { + myAlleles.add(alternateAllele); bestGuessIsRef = false; } // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - altAllelesToUse[i] = true; + myAlleles.add(alternateAllele); } } @@ -367,20 +368,6 @@ public class UnifiedGenotyperEngine { return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF); } - // strip out any alleles that aren't going to be used in the VariantContext - final List myAlleles; - if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { - myAlleles = new ArrayList(vc.getAlleles().size()); - myAlleles.add(vc.getReference()); - for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { - if ( altAllelesToUse[i] ) - myAlleles.add(vc.getAlternateAllele(i)); - } - } else { - // use all of the alleles if we are given them by the user - myAlleles = vc.getAlleles(); - } - // start constructing the resulting VC final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); @@ -394,7 +381,7 @@ public class UnifiedGenotyperEngine { } // create the genotypes - final GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse); + final GenotypesContext genotypes = subsetAlleles(vc, myAlleles, true); // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) @@ -414,7 +401,7 @@ public class UnifiedGenotyperEngine { VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); clearAFarray(AFresult.log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + afcm.get().getLog10PNonRef(vcOverall, getAlleleFrequencyPriors(model), AFresult); //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); @@ -423,7 +410,7 @@ public class UnifiedGenotyperEngine { VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); clearAFarray(AFresult.log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero; double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); @@ -433,7 +420,7 @@ public class UnifiedGenotyperEngine { VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); clearAFarray(AFresult.log10AlleleFrequencyPosteriors); - afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero; double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); @@ -772,30 +759,36 @@ public class UnifiedGenotyperEngine { /** * @param vc variant context with genotype likelihoods - * @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use * @return genotypes */ - public static GenotypesContext assignGenotypes(final VariantContext vc, - final boolean[] allelesToUse) { + public static GenotypesContext assignGenotypes(final VariantContext vc) { + return subsetAlleles(vc, vc.getAlleles(), true); + } - // the no-called genotypes - final GenotypesContext GLs = vc.getGenotypes(); + /** + * @param vc variant context with genotype likelihoods + * @param allelesToUse which alleles from the vc are okay to use + * @param assignGenotypes true if we should change the genotypes based on the (subsetted) PLs + * @return genotypes + */ + public static GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes) { + + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); // samples - final List sampleIndices = GLs.getSampleNamesOrderedByName(); + final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); - // the new called genotypes to create - final GenotypesContext calls = GenotypesContext.create(); + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final int numOriginalAltAlleles = allelesToUse.length; - final List newAlleles = new ArrayList(numOriginalAltAlleles+1); - newAlleles.add(vc.getReference()); - for ( int i = 0; i < numOriginalAltAlleles; i++ ) { - if ( allelesToUse[i] ) - newAlleles.add(vc.getAlternateAllele(i)); - } - final int numNewAltAlleles = newAlleles.size() - 1; + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int numNewAltAlleles = allelesToUse.size() - 1; + + // which PLs should be carried forward? ArrayList likelihoodIndexesToUse = null; // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, @@ -804,20 +797,27 @@ public class UnifiedGenotyperEngine { likelihoodIndexesToUse = new ArrayList(30); final int[][] PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles]; + final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + if ( allelesToUse.contains(vc.getAlternateAllele(i)) ) + altAlleleIndexToUse[i] = true; + } + for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) { - int[] alleles = PLcache[PLindex]; + final int[] alleles = PLcache[PLindex]; // consider this entry only if both of the alleles are good - if ( (alleles[0] == 0 || allelesToUse[alleles[0] - 1]) && (alleles[1] == 0 || allelesToUse[alleles[1] - 1]) ) + if ( (alleles[0] == 0 || altAlleleIndexToUse[alleles[0] - 1]) && (alleles[1] == 0 || altAlleleIndexToUse[alleles[1] - 1]) ) likelihoodIndexesToUse.add(PLindex); } } // create the new genotypes - for ( int k = GLs.size() - 1; k >= 0; k-- ) { - final String sample = sampleIndices.get(k); - final Genotype g = GLs.get(sample); - if ( !g.hasLikelihoods() ) + for ( int k = 0; k < oldGTs.size(); k++ ) { + final Genotype g = oldGTs.get(sampleIndices.get(k)); + if ( !g.hasLikelihoods() ) { + newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); continue; + } // create the new likelihoods array from the alleles we are allowed to use final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); @@ -834,29 +834,34 @@ public class UnifiedGenotyperEngine { newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); } - // if there is no mass on the (new) likelihoods and we actually have alternate alleles, then just no-call the sample - if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { - calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); + // if there is no mass on the (new) likelihoods or we weren't asked to assign a genotype, then just no-call the sample + if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { + newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); continue; } - // find the genotype with maximum likelihoods - int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); - int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex]; - - ArrayList myAlleles = new ArrayList(); - myAlleles.add(newAlleles.get(alleles[0])); - myAlleles.add(newAlleles.get(alleles[1])); - - final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); - Map attrs = new HashMap(g.getAttributes()); - if ( numNewAltAlleles == 0 ) - attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); - else - attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); - calls.add(new Genotype(sample, myAlleles, qual, null, attrs, false)); + final Genotype newGT = assignGenotype(g, newLikelihoods, allelesToUse, numNewAltAlleles); + newGTs.add(newGT); } + + return newGTs; + } + + protected static Genotype assignGenotype(Genotype originalGT, double[] newLikelihoods, List allelesToUse, int numNewAltAlleles) { + // find the genotype with maximum likelihoods + int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); + int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex]; - return calls; + ArrayList myAlleles = new ArrayList(); + myAlleles.add(allelesToUse.get(alleles[0])); + myAlleles.add(allelesToUse.get(alleles[1])); + + final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); + Map attrs = new HashMap(originalGT.getAttributes()); + if ( numNewAltAlleles == 0 ) + attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + else + attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); + return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false); } }