Don't collapse likelihoods over all alt alleles - that's just not right. For now, the QUAL is calculated for just the most likely of the alt alleles; I need to think about the right way to handle this properly.

This commit is contained in:
Eric Banks 2011-12-10 23:57:14 -05:00
parent 364f1a030b
commit 044f211a30
2 changed files with 38 additions and 38 deletions

View File

@ -330,14 +330,38 @@ public class UnifiedGenotyperEngine {
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
// TODO -- this is not the right thing mathematically to do! In a case of B=1,C=0 the likelihoods would get added to both AC=0 and AC=1
double[] collapsedPosteriors = collapseAFarrays(log10AlleleFrequencyPosteriors.get(), vc.getAlternateAlleles().size());
// is the most likely frequency conformation AC=0 for all alternate alleles?
boolean bestGuessIsRef = MathUtils.maxElementIndex(collapsedPosteriors) == 0;
boolean bestGuessIsRef = true;
// which alternate allele has the highest MLE AC?
int indexOfHighestAlt = -1;
int alleleCountOfHighestAlt = -1;
// determine which alternate alleles have AF>0
boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()];
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
int indexOfBestAC = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]);
// if the most likely AC is not 0, then this is a good alternate allele to use
if ( indexOfBestAC != 0 ) {
altAllelesToUse[i] = true;
bestGuessIsRef = false;
}
// if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
altAllelesToUse[i] = true;
}
// keep track of the "best" alternate allele to use
if ( indexOfBestAC > alleleCountOfHighestAlt) {
alleleCountOfHighestAlt = indexOfBestAC;
indexOfHighestAlt = i;
}
}
// calculate p(f>0)
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(collapsedPosteriors);
// TODO -- right now we just calculate it for the alt allele with highest AF, but the likelihoods need to be combined correctly over all AFs
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()[indexOfHighestAlt]);
double sum = 0.0;
for (int i = 1; i <= N; i++)
sum += normalizedPosteriors[i];
@ -368,27 +392,18 @@ public class UnifiedGenotyperEngine {
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF);
}
// strip out any alleles that aren't going to be used
Set<Allele> myAlleles;
boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()];
// strip out any alleles that aren't going to be used in the VariantContext
List<Allele> myAlleles;
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
myAlleles = new HashSet<Allele>(vc.getAlleles().size());
myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
myAlleles.add(vc.getReference());
// if we're making a reference call then we keep just the ref allele, otherwise we need to determine which ones are okay
if ( !bestGuessIsRef ) {
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
if ( MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]) != 0 ) {
myAlleles.add(vc.getAlternateAllele(i));
altAllelesToUse[i] = true;
}
}
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
if ( altAllelesToUse[i] )
myAlleles.add(vc.getAlternateAllele(i));
}
} else {
// use all of the alleles if we are given them by the user
myAlleles = new HashSet<Allele>(vc.getAlleles());
for ( int i = 0; i < altAllelesToUse.length; i++ )
altAllelesToUse[i] = true;
myAlleles = vc.getAlleles();
}
// start constructing the resulting VC
@ -479,21 +494,6 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
}
private static double[] collapseAFarrays(double[][] original, int numDimensions) {
int size = original[0].length;
double[] newArray = new double[size];
for ( int i = 0; i < size; i++)
newArray[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
for ( int i = 0; i < numDimensions; i++ ) {
for ( int j = 0; j < size; j++ ) {
newArray[j] = ExactAFCalculationModel.approximateLog10SumLog10(newArray[j], original[i][j]);
}
}
return newArray;
}
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
// TODO - temp fix until we can deal with extended events properly
// for indels, stop location is one more than ref allele length

View File

@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
Arrays.asList("75e49dff01763aff2984dc86a72eb229"));
Arrays.asList("98a4d1e1e0a363ba37518563ac6cbead"));
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
}
@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
Arrays.asList("8209a308d95659c6da7dab8733c736f9"));
Arrays.asList("915e7a3e7cbfd995dbc41fdd382d0d51"));
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
}