Don't collapse likelihoods over all alt alleles — that's just not right. For now, the QUAL is calculated from the posteriors of just one alt allele (the one whose most likely allele count is highest); I need to think about the right way to combine the likelihoods over all alt alleles properly.
This commit is contained in:
parent
364f1a030b
commit
044f211a30
|
|
@ -330,14 +330,38 @@ public class UnifiedGenotyperEngine {
|
||||||
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
||||||
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
|
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
|
||||||
|
|
||||||
// TODO -- this is not the right thing mathematically to do! In a case of B=1,C=0 the likelihoods would get added to both AC=0 and AC=1
|
|
||||||
double[] collapsedPosteriors = collapseAFarrays(log10AlleleFrequencyPosteriors.get(), vc.getAlternateAlleles().size());
|
|
||||||
|
|
||||||
// is the most likely frequency conformation AC=0 for all alternate alleles?
|
// is the most likely frequency conformation AC=0 for all alternate alleles?
|
||||||
boolean bestGuessIsRef = MathUtils.maxElementIndex(collapsedPosteriors) == 0;
|
boolean bestGuessIsRef = true;
|
||||||
|
|
||||||
|
// which alternate allele has the highest MLE AC?
|
||||||
|
int indexOfHighestAlt = -1;
|
||||||
|
int alleleCountOfHighestAlt = -1;
|
||||||
|
|
||||||
|
// determine which alternate alleles have AF>0
|
||||||
|
boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()];
|
||||||
|
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
|
||||||
|
int indexOfBestAC = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]);
|
||||||
|
|
||||||
|
// if the most likely AC is not 0, then this is a good alternate allele to use
|
||||||
|
if ( indexOfBestAC != 0 ) {
|
||||||
|
altAllelesToUse[i] = true;
|
||||||
|
bestGuessIsRef = false;
|
||||||
|
}
|
||||||
|
// if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
|
||||||
|
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||||
|
altAllelesToUse[i] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// keep track of the "best" alternate allele to use
|
||||||
|
if ( indexOfBestAC > alleleCountOfHighestAlt) {
|
||||||
|
alleleCountOfHighestAlt = indexOfBestAC;
|
||||||
|
indexOfHighestAlt = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// calculate p(f>0)
|
// calculate p(f>0)
|
||||||
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(collapsedPosteriors);
|
// TODO -- right now we just calculate it for the alt allele with highest AF, but the likelihoods need to be combined correctly over all AFs
|
||||||
|
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()[indexOfHighestAlt]);
|
||||||
double sum = 0.0;
|
double sum = 0.0;
|
||||||
for (int i = 1; i <= N; i++)
|
for (int i = 1; i <= N; i++)
|
||||||
sum += normalizedPosteriors[i];
|
sum += normalizedPosteriors[i];
|
||||||
|
|
@ -368,27 +392,18 @@ public class UnifiedGenotyperEngine {
|
||||||
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF);
|
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF);
|
||||||
}
|
}
|
||||||
|
|
||||||
// strip out any alleles that aren't going to be used
|
// strip out any alleles that aren't going to be used in the VariantContext
|
||||||
Set<Allele> myAlleles;
|
List<Allele> myAlleles;
|
||||||
boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()];
|
|
||||||
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
|
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
|
||||||
myAlleles = new HashSet<Allele>(vc.getAlleles().size());
|
myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
|
||||||
myAlleles.add(vc.getReference());
|
myAlleles.add(vc.getReference());
|
||||||
|
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
|
||||||
// if we're making a reference call then we keep just the ref allele, otherwise we need to determine which ones are okay
|
if ( altAllelesToUse[i] )
|
||||||
if ( !bestGuessIsRef ) {
|
myAlleles.add(vc.getAlternateAllele(i));
|
||||||
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
|
|
||||||
if ( MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]) != 0 ) {
|
|
||||||
myAlleles.add(vc.getAlternateAllele(i));
|
|
||||||
altAllelesToUse[i] = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// use all of the alleles if we are given them by the user
|
// use all of the alleles if we are given them by the user
|
||||||
myAlleles = new HashSet<Allele>(vc.getAlleles());
|
myAlleles = vc.getAlleles();
|
||||||
for ( int i = 0; i < altAllelesToUse.length; i++ )
|
|
||||||
altAllelesToUse[i] = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// start constructing the resulting VC
|
// start constructing the resulting VC
|
||||||
|
|
@ -479,21 +494,6 @@ public class UnifiedGenotyperEngine {
|
||||||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
|
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static double[] collapseAFarrays(double[][] original, int numDimensions) {
|
|
||||||
int size = original[0].length;
|
|
||||||
double[] newArray = new double[size];
|
|
||||||
for ( int i = 0; i < size; i++)
|
|
||||||
newArray[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
|
||||||
|
|
||||||
for ( int i = 0; i < numDimensions; i++ ) {
|
|
||||||
for ( int j = 0; j < size; j++ ) {
|
|
||||||
newArray[j] = ExactAFCalculationModel.approximateLog10SumLog10(newArray[j], original[i][j]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return newArray;
|
|
||||||
}
|
|
||||||
|
|
||||||
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
|
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
|
||||||
// TODO - temp fix until we can deal with extended events properly
|
// TODO - temp fix until we can deal with extended events properly
|
||||||
// for indels, stop location is one more than ref allele length
|
// for indels, stop location is one more than ref allele length
|
||||||
|
|
|
||||||
|
|
@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
|
||||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
|
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
|
||||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
|
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
|
||||||
Arrays.asList("75e49dff01763aff2984dc86a72eb229"));
|
Arrays.asList("98a4d1e1e0a363ba37518563ac6cbead"));
|
||||||
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
|
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
|
||||||
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
|
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
|
||||||
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
|
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
|
||||||
Arrays.asList("8209a308d95659c6da7dab8733c736f9"));
|
Arrays.asList("915e7a3e7cbfd995dbc41fdd382d0d51"));
|
||||||
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
|
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue