diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 918f83514..3a86743de 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -330,14 +330,38 @@ public class UnifiedGenotyperEngine { clearAFarray(log10AlleleFrequencyPosteriors.get()); afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); - // TODO -- this is not the right thing mathematically to do! In a case of B=1,C=0 the likelihoods would get added to both AC=0 and AC=1 - double[] collapsedPosteriors = collapseAFarrays(log10AlleleFrequencyPosteriors.get(), vc.getAlternateAlleles().size()); - // is the most likely frequency conformation AC=0 for all alternate alleles? - boolean bestGuessIsRef = MathUtils.maxElementIndex(collapsedPosteriors) == 0; + boolean bestGuessIsRef = true; + + // which alternate allele has the highest MLE AC? + int indexOfHighestAlt = -1; + int alleleCountOfHighestAlt = -1; + + // determine which alternate alleles have AF>0 + boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()]; + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { + int indexOfBestAC = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]); + + // if the most likely AC is not 0, then this is a good alternate allele to use + if ( indexOfBestAC != 0 ) { + altAllelesToUse[i] = true; + bestGuessIsRef = false; + } + // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele + else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + altAllelesToUse[i] = true; + } + + // keep track of the "best" alternate allele to use + if ( indexOfBestAC > alleleCountOfHighestAlt) { + alleleCountOfHighestAlt = indexOfBestAC; + indexOfHighestAlt = i; + } + } // calculate p(f>0) - double[] normalizedPosteriors = MathUtils.normalizeFromLog10(collapsedPosteriors); + // TODO -- right now we just calculate it for the alt allele with highest AF, but the likelihoods need to be combined correctly over all AFs + double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()[indexOfHighestAlt]); double sum = 0.0; for (int i = 1; i <= N; i++) sum += normalizedPosteriors[i]; @@ -368,27 +392,18 @@ public class UnifiedGenotyperEngine { return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF); } - // strip out any alleles that aren't going to be used - Set myAlleles; - boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()]; + // strip out any alleles that aren't going to be used in the VariantContext + List myAlleles; if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { - myAlleles = new HashSet(vc.getAlleles().size()); + myAlleles = new ArrayList(vc.getAlleles().size()); myAlleles.add(vc.getReference()); - - // if we're making a reference call then we keep just the ref allele, otherwise we need to determine which ones are okay - if ( !bestGuessIsRef ) { - for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { - if ( MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]) != 0 ) { - myAlleles.add(vc.getAlternateAllele(i)); - altAllelesToUse[i] = true; - } - } + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { + if ( altAllelesToUse[i] ) + myAlleles.add(vc.getAlternateAllele(i)); } } else { // use all of the alleles if we are given them by the user - myAlleles = new HashSet(vc.getAlleles()); - for ( int i = 0; i < altAllelesToUse.length; i++ ) - altAllelesToUse[i] = true; + myAlleles = vc.getAlleles(); } // start constructing the resulting VC @@ -479,21 +494,6 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } - private static double[] collapseAFarrays(double[][] original, int numDimensions) { - int size = original[0].length; - double[] newArray = new double[size]; - for ( int i = 0; i < size; i++) - newArray[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - - for ( int i = 0; i < numDimensions; i++ ) { - for ( int j = 0; j < size; j++ ) { - newArray[j] = ExactAFCalculationModel.approximateLog10SumLog10(newArray[j], original[i][j]); - } - } - - return newArray; - } - private int calculateEndPos(Collection alleles, Allele refAllele, GenomeLoc loc) { // TODO - temp fix until we can deal with extended events properly // for indels, stop location is one more than ref allele length diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index c04b0085c..605d6051c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("75e49dff01763aff2984dc86a72eb229")); + Arrays.asList("98a4d1e1e0a363ba37518563ac6cbead")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("8209a308d95659c6da7dab8733c736f9")); + Arrays.asList("915e7a3e7cbfd995dbc41fdd382d0d51")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); }