From d20a25d68175e14c972980426156fb035d3424d4 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Dec 2011 16:50:38 -0500 Subject: [PATCH] A much better way of choosing the alternate allele(s) to genotype in the SNP model of UG: instead of looking at the sum of base qualities (which can and did lead to us over-genotyping esp. when allowing multiple alternate alleles), we look at the likelihoods themselves (free since we are already calculating likelihoods for all 10 genotypes). Now, even if the base quals exceed some arbitrary threshold, we only bother genotyping an alternate allele when there's a sample for which it is more likely than ref/ref (I can generate weird edge cases where this falls apart, but none that model truly variable sites that we actually want to call). This leads to a huge efficiency improvement esp. for exomes (and esp. for many samples) where we almost always were trying to genotype all 3 alternate alleles. Integration tests change only because ref calls have slight QUAL differences (because the best alt allele is still chosen arbitrarily, but differently). --- ...NPGenotypeLikelihoodsCalculationModel.java | 128 +++++++++--------- .../UnifiedGenotyperIntegrationTest.java | 4 +- 2 files changed, 69 insertions(+), 63 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 57cc5594a..eee89674a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -46,8 +46,6 @@ import java.util.*; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { - private static final int MIN_QUAL_SUM_FOR_ALT_ALLELE = 50; - private boolean ALLOW_MULTIPLE_ALLELES; private final boolean useAlleleFromVCF; @@ -56,15 +54,19 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC super(UAC, logger); ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC; useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + + // make sure the PL cache has been initialized with enough alleles + if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null || UnifiedGenotyperEngine.PLIndexToAlleleIndex.length < 4 ) // +1 for 0 alt alleles + UnifiedGenotyperEngine.calculatePLcache(3); } - public VariantContext getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, - GenotypePriors priors, - Allele alternateAlleleToUse, - boolean useBAQedPileup) { + public VariantContext getLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final GenotypePriors priors, + final Allele alternateAlleleToUse, + final boolean useBAQedPileup) { if ( !(priors instanceof DiploidSNPGenotypePriors) ) throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model"); @@ -79,6 +81,20 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC alleles.add(Allele.create(refBase, true)); final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles); + // calculate the GLs + ArrayList GLs = new ArrayList(contexts.size()); + for ( Map.Entry sample : contexts.entrySet() ) { + ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + if ( useBAQedPileup ) + pileup = createBAQedPileup( pileup ); + + // create the GenotypeLikelihoods object + final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); + final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); + if ( nGoodBases > 0 ) + GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup))); + } + // find the alternate allele(s) that we should be using if ( alternateAlleleToUse != null ) { basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true; @@ -93,7 +109,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true; } else { - determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup); + determineAlternateAlleles(basesToUse, refBase, GLs); // how many alternate alleles are we using? int alleleCounter = Utils.countSetBits(basesToUse); @@ -125,22 +141,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC builder.alleles(alleles); // create the genotypes; no-call everyone for now - GenotypesContext genotypes = GenotypesContext.create(); + final GenotypesContext genotypes = GenotypesContext.create(); final List noCall = new ArrayList(); noCall.add(Allele.NO_CALL); - for ( Map.Entry sample : contexts.entrySet() ) { - ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); - if ( useBAQedPileup ) - pileup = createBAQedPileup( pileup ); - - // create the GenotypeLikelihoods object - final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); - final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); - if ( nGoodBases == 0 ) - continue; - - final double[] allLikelihoods = GL.getLikelihoods(); + for ( SampleGenotypeData sampleData : GLs ) { + final double[] allLikelihoods = sampleData.GL.getLikelihoods(); final double[] myLikelihoods = new double[numLikelihoods]; int myLikelihoodsIndex = 0; @@ -151,60 +157,48 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC } // normalize in log space so that max element is zero. - GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true)); + final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true)); - HashMap attributes = new HashMap(); - attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); + final HashMap attributes = new HashMap(); + attributes.put(VCFConstants.DEPTH_KEY, sampleData.depth); attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); - genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); + genotypes.add(new Genotype(sampleData.name, noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); } return builder.genotypes(genotypes).make(); } // fills in the allelesToUse array - protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map contexts, boolean useBAQedPileup) { - int[] qualCounts = new int[4]; + protected void determineAlternateAlleles(final boolean[] allelesToUse, final byte ref, final List sampleDataList) { - for ( Map.Entry sample : contexts.entrySet() ) { - // calculate the sum of quality scores for each base - ReadBackedPileup pileup = useBAQedPileup ? createBAQedPileup( sample.getValue().getBasePileup() ) : sample.getValue().getBasePileup(); - for ( PileupElement p : pileup ) { - // ignore deletions - if ( p.isDeletion() || (!p.isReducedRead() && p.getQual() < UAC.MIN_BASE_QUALTY_SCORE) ) - continue; + final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref); + final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal(); + final double[] likelihoodCounts = new double[4]; - final int index = BaseUtils.simpleBaseToBaseIndex(p.getBase()); - if ( index >= 0 ) { - qualCounts[index] += p.getQual(); - } + // based on the GLs, find the alternate alleles with the most probability + for ( SampleGenotypeData sampleData : sampleDataList ) { + final double[] likelihoods = sampleData.GL.getLikelihoods(); + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + if ( PLindexOfBestGL != PLindexOfRef ) { + int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL]; + if ( alleles[0] != baseIndexOfRef ) + likelihoodCounts[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; + // don't double-count it + if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] ) + likelihoodCounts[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; } } if ( ALLOW_MULTIPLE_ALLELES ) { - for ( byte altAllele : BaseUtils.BASES ) { - if ( altAllele == ref ) - continue; - int index = BaseUtils.simpleBaseToBaseIndex(altAllele); - if ( qualCounts[index] >= MIN_QUAL_SUM_FOR_ALT_ALLELE ) { - allelesToUse[index] = true; + for ( int i = 0; i < 4; i++ ) { + if ( likelihoodCounts[i] > 0.0 ) { + allelesToUse[i] = true; } } } else { - // set the non-ref base which has the maximum quality score sum - int maxCount = 0; - int indexOfMax = 0; - for ( byte altAllele : BaseUtils.BASES ) { - if ( altAllele == ref ) - continue; - int index = BaseUtils.simpleBaseToBaseIndex(altAllele); - if ( qualCounts[index] > maxCount ) { - maxCount = qualCounts[index]; - indexOfMax = index; - } - } - - if ( maxCount > 0 ) + // set the non-ref base which has the maximum sum of non-ref GLs + final int indexOfMax = MathUtils.maxElementIndex(likelihoodCounts); + if ( likelihoodCounts[indexOfMax] > 0.0 ) allelesToUse[indexOfMax] = true; } } @@ -227,4 +221,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); } } -} \ No newline at end of file + private static class SampleGenotypeData { + + public final String name; + public final DiploidSNPGenotypeLikelihoods GL; + public final int depth; + + public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoods GL, final int depth) { + this.name = name; + this.GL = GL; + this.depth = depth; + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 3c6131d6c..d4518078b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -129,8 +129,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testOutputParameter() { HashMap e = new HashMap(); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "42e4ea7878ef8d96215accb3ba4e97b7" ); - e.put( "--output_mode EMIT_ALL_SITES", "e0443c720149647469f2a2f3fb73942f" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" ); + e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(