A much better way of choosing the alternate allele(s) to genotype in the SNP model of UG: instead of looking at the sum of base qualities (which could, and did, lead us to over-genotype, especially when allowing multiple alternate alleles), we look at the likelihoods themselves (which is free, since we are already calculating likelihoods for all 10 genotypes). Now, even if the base qualities exceed some arbitrary threshold, we only bother genotyping an alternate allele when there's a sample for which it is more likely than ref/ref (I can generate weird edge cases where this falls apart, but none that model truly variable sites that we actually want to call). This leads to a huge efficiency improvement, especially for exomes (and especially for many samples), where we almost always were trying to genotype all 3 alternate alleles. Integration tests change only because ref calls have slight QUAL differences (because the best alt allele is still chosen arbitrarily, but differently).

This commit is contained in:
Eric Banks 2011-12-27 16:50:38 -05:00
parent adff40ff58
commit d20a25d681
2 changed files with 69 additions and 63 deletions

View File

@ -46,8 +46,6 @@ import java.util.*;
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private static final int MIN_QUAL_SUM_FOR_ALT_ALLELE = 50;
private boolean ALLOW_MULTIPLE_ALLELES;
private final boolean useAlleleFromVCF;
@ -56,15 +54,19 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
super(UAC, logger);
ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC;
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
// make sure the PL cache has been initialized with enough alleles
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null || UnifiedGenotyperEngine.PLIndexToAlleleIndex.length < 4 ) // +1 for 0 alt alleles
UnifiedGenotyperEngine.calculatePLcache(3);
}
public VariantContext getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Allele alternateAlleleToUse,
boolean useBAQedPileup) {
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final GenotypePriors priors,
final Allele alternateAlleleToUse,
final boolean useBAQedPileup) {
if ( !(priors instanceof DiploidSNPGenotypePriors) )
throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");
@ -79,6 +81,20 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
alleles.add(Allele.create(refBase, true));
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);
// calculate the GLs
ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
if ( useBAQedPileup )
pileup = createBAQedPileup( pileup );
// create the GenotypeLikelihoods object
final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error);
final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE);
if ( nGoodBases > 0 )
GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup)));
}
// find the alternate allele(s) that we should be using
if ( alternateAlleleToUse != null ) {
basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true;
@ -93,7 +109,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true;
} else {
determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup);
determineAlternateAlleles(basesToUse, refBase, GLs);
// how many alternate alleles are we using?
int alleleCounter = Utils.countSetBits(basesToUse);
@ -125,22 +141,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
builder.alleles(alleles);
// create the genotypes; no-call everyone for now
GenotypesContext genotypes = GenotypesContext.create();
final GenotypesContext genotypes = GenotypesContext.create();
final List<Allele> noCall = new ArrayList<Allele>();
noCall.add(Allele.NO_CALL);
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
if ( useBAQedPileup )
pileup = createBAQedPileup( pileup );
// create the GenotypeLikelihoods object
final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error);
final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE);
if ( nGoodBases == 0 )
continue;
final double[] allLikelihoods = GL.getLikelihoods();
for ( SampleGenotypeData sampleData : GLs ) {
final double[] allLikelihoods = sampleData.GL.getLikelihoods();
final double[] myLikelihoods = new double[numLikelihoods];
int myLikelihoodsIndex = 0;
@ -151,60 +157,48 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
}
// normalize in log space so that max element is zero.
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
final HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, sampleData.depth);
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
genotypes.add(new Genotype(sampleData.name, noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
}
return builder.genotypes(genotypes).make();
}
// fills in the allelesToUse array
protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map<String, AlignmentContext> contexts, boolean useBAQedPileup) {
int[] qualCounts = new int[4];
protected void determineAlternateAlleles(final boolean[] allelesToUse, final byte ref, final List<SampleGenotypeData> sampleDataList) {
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
// calculate the sum of quality scores for each base
ReadBackedPileup pileup = useBAQedPileup ? createBAQedPileup( sample.getValue().getBasePileup() ) : sample.getValue().getBasePileup();
for ( PileupElement p : pileup ) {
// ignore deletions
if ( p.isDeletion() || (!p.isReducedRead() && p.getQual() < UAC.MIN_BASE_QUALTY_SCORE) )
continue;
final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref);
final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal();
final double[] likelihoodCounts = new double[4];
final int index = BaseUtils.simpleBaseToBaseIndex(p.getBase());
if ( index >= 0 ) {
qualCounts[index] += p.getQual();
}
// based on the GLs, find the alternate alleles with the most probability
for ( SampleGenotypeData sampleData : sampleDataList ) {
final double[] likelihoods = sampleData.GL.getLikelihoods();
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
if ( PLindexOfBestGL != PLindexOfRef ) {
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL];
if ( alleles[0] != baseIndexOfRef )
likelihoodCounts[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
// don't double-count it
if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] )
likelihoodCounts[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
}
}
if ( ALLOW_MULTIPLE_ALLELES ) {
for ( byte altAllele : BaseUtils.BASES ) {
if ( altAllele == ref )
continue;
int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
if ( qualCounts[index] >= MIN_QUAL_SUM_FOR_ALT_ALLELE ) {
allelesToUse[index] = true;
for ( int i = 0; i < 4; i++ ) {
if ( likelihoodCounts[i] > 0.0 ) {
allelesToUse[i] = true;
}
}
} else {
// set the non-ref base which has the maximum quality score sum
int maxCount = 0;
int indexOfMax = 0;
for ( byte altAllele : BaseUtils.BASES ) {
if ( altAllele == ref )
continue;
int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
if ( qualCounts[index] > maxCount ) {
maxCount = qualCounts[index];
indexOfMax = index;
}
}
if ( maxCount > 0 )
// set the non-ref base which has the maximum sum of non-ref GLs
final int indexOfMax = MathUtils.maxElementIndex(likelihoodCounts);
if ( likelihoodCounts[indexOfMax] > 0.0 )
allelesToUse[indexOfMax] = true;
}
}
@ -227,4 +221,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
}
}
// Immutable per-sample holder pairing a sample's computed genotype likelihoods
// with its filtered pileup depth, so the GLs calculated once up front can be
// reused both for alternate-allele selection and for building the genotypes.
private static class SampleGenotypeData {

// sample name; used as the key when constructing this sample's Genotype
public final String name;

// the diploid SNP genotype likelihoods computed for this sample's pileup
public final DiploidSNPGenotypeLikelihoods GL;

// filtered read depth of the pileup; emitted as the VCF DP attribute
public final int depth;

public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoods GL, final int depth) {
this.name = name;
this.GL = GL;
this.depth = depth;
}
}
}
}

View File

@ -129,8 +129,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testOutputParameter() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "42e4ea7878ef8d96215accb3ba4e97b7" );
e.put( "--output_mode EMIT_ALL_SITES", "e0443c720149647469f2a2f3fb73942f" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" );
e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(