From e4a225ed092cfc2ed96d9da06c6f92299c193ac5 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 29 Mar 2012 11:07:37 -0400 Subject: [PATCH] Move the code to subset a Variant Context to fewer alleles (including restructuring the PLs appropriately) into VariantContextUtils where it can be used generally. --- .../genotyper/ExactAFCalculationModel.java | 4 +- .../genotyper/UnifiedGenotyperEngine.java | 118 +-------------- .../variantcontext/VariantContextUtils.java | 136 +++++++++++++++++- 3 files changed, 135 insertions(+), 123 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 4bda3282e..8f3e78328 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -56,7 +56,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); alleles.add(vc.getReference()); alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE)); - GLs = UnifiedGenotyperEngine.subsetAlleles(vc, alleles, false); + GLs = VariantContextUtils.subsetAlleles(vc, alleles, false); } linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result); @@ -120,7 +120,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( sample.hasLikelihoods() ) { double[] gls = sample.getLikelihoods().getAsVector(); - if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL ) + if ( MathUtils.sum(gls) < VariantContextUtils.SUM_GL_THRESH_NOCALL ) genotypeLikelihoods.add(gls); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 39745507c..3117963fb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -61,10 +61,6 @@ public class UnifiedGenotyperEngine { * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by * no means produce a comprehensive set of indels in DISCOVERY mode */ EMIT_ALL_SITES - } - - protected static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - protected static final double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. // the unified argument collection private final UnifiedArgumentCollection UAC; @@ -348,7 +344,7 @@ public class UnifiedGenotyperEngine { } // create the genotypes - final GenotypesContext genotypes = subsetAlleles(vc, myAlleles, true); + final GenotypesContext genotypes = VariantContextUtils.subsetAlleles(vc, myAlleles, true); // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) @@ -730,116 +726,4 @@ public class UnifiedGenotyperEngine { return vc; } - - /** - * @param vc variant context with genotype likelihoods - * @return genotypes - */ - public static GenotypesContext assignGenotypes(final VariantContext vc) { - return subsetAlleles(vc, vc.getAlleles(), true); - } - - /** - * @param vc variant context with genotype likelihoods - * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** - * @param assignGenotypes true if we should change the genotypes based on the (subsetted) PLs - * @return genotypes - */ - public static GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes) { - - // the genotypes with PLs - final GenotypesContext oldGTs = vc.getGenotypes(); - - // samples - final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(); - - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final int numNewAltAlleles = allelesToUse.size() - 1; - - // which PLs should be carried forward? - ArrayList likelihoodIndexesToUse = null; - - // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, - // then we can keep the PLs as is; otherwise, we determine which ones to keep - if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { - likelihoodIndexesToUse = new ArrayList(30); - - final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) { - if ( allelesToUse.contains(vc.getAlternateAllele(i)) ) - altAlleleIndexToUse[i] = true; - } - - final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles); - for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - // consider this entry only if both of the alleles are good - if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) ) - likelihoodIndexesToUse.add(PLindex); - } - } - - // create the new genotypes - for ( int k = 0; k < oldGTs.size(); k++ ) { - final Genotype g = oldGTs.get(sampleIndices.get(k)); - if ( !g.hasLikelihoods() ) { - newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); - continue; - } - - // create the new likelihoods array from the alleles we are allowed to use - final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - double[] newLikelihoods; - if ( likelihoodIndexesToUse == null ) { - newLikelihoods = originalLikelihoods; - } else { - newLikelihoods = new double[likelihoodIndexesToUse.size()]; - int newIndex = 0; - for ( int oldIndex : likelihoodIndexesToUse ) - newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; - - // might need to re-normalize - newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); - } - - // if there is no mass on the (new) likelihoods, then just no-call the sample - if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { - newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); - } - else { - Map attrs = new HashMap(g.getAttributes()); - if ( numNewAltAlleles == 0 ) - attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); - else - attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); - - // if we weren't asked to assign a genotype, then just no-call the sample - if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) - newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, attrs, false)); - else - newGTs.add(assignGenotype(g, newLikelihoods, allelesToUse, numNewAltAlleles, attrs)); - } - } - - return newGTs; - } - - protected static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List allelesToUse, final int numNewAltAlleles, final Map attrs) { - // find the genotype with maximum likelihoods - int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - - ArrayList myAlleles = new ArrayList(); - myAlleles.add(allelesToUse.get(alleles.alleleIndex1)); - myAlleles.add(allelesToUse.get(alleles.alleleIndex2)); - - final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); - return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false); - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index e9a12ff26..07e222906 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -30,10 +30,7 @@ import org.apache.commons.jexl2.JexlEngine; import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -1066,4 +1063,135 @@ public class VariantContextUtils { names.add(g.getSampleName()); return names; } + + /** + * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * + * @param vc variant context with genotype likelihoods + * @return genotypes context + */ + public static GenotypesContext assignGenotypes(final VariantContext vc) { + return subsetAlleles(vc, vc.getAlleles(), true); + } + + private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + public static final double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. + + /** + * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) + * + * @param vc variant context with genotype likelihoods + * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** + * @param assignGenotypes true if we should update the genotypes based on the (subsetted) PLs + * @return genotypes + */ + public static GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes) { + + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); + + // samples + final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int numNewAltAlleles = allelesToUse.size() - 1; + + // which PLs should be carried forward? + ArrayList likelihoodIndexesToUse = null; + + // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, + // then we can keep the PLs as is; otherwise, we determine which ones to keep + if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { + likelihoodIndexesToUse = new ArrayList(30); + + final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + if ( allelesToUse.contains(vc.getAlternateAllele(i)) ) + altAlleleIndexToUse[i] = true; + } + + final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles); + for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + // consider this entry only if both of the alleles are good + if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) ) + likelihoodIndexesToUse.add(PLindex); + } + } + + // create the new genotypes + for ( int k = 0; k < oldGTs.size(); k++ ) { + final Genotype g = oldGTs.get(sampleIndices.get(k)); + if ( !g.hasLikelihoods() ) { + newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); + continue; + } + + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + double[] newLikelihoods; + if ( likelihoodIndexesToUse == null ) { + newLikelihoods = originalLikelihoods; + } else { + newLikelihoods = new double[likelihoodIndexesToUse.size()]; + int newIndex = 0; + for ( int oldIndex : likelihoodIndexesToUse ) + newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + // if there is no mass on the (new) likelihoods, then just no-call the sample + if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { + newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); + } + else { + Map attrs = new HashMap(g.getAttributes()); + if ( numNewAltAlleles == 0 ) + attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + else + attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); + + // if we weren't asked to assign a genotype, then just no-call the sample + if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) + newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, attrs, false)); + else + newGTs.add(assignGenotype(g, newLikelihoods, allelesToUse, attrs)); + } + } + + return newGTs; + } + + /** + * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * + * @param originalGT the original genotype + * @param newLikelihoods the PL array + * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) + * @param attrs the annotations to use when creating the genotype + * + * @return genotype + */ + private static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List allelesToUse, final Map attrs) { + final int numNewAltAlleles = allelesToUse.size() - 1; + + // find the genotype with maximum likelihoods + int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + + ArrayList myAlleles = new ArrayList(); + myAlleles.add(allelesToUse.get(alleles.alleleIndex1)); + myAlleles.add(allelesToUse.get(alleles.alleleIndex2)); + + final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); + return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false); + } }