diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index d833e9f8e..ed737064d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -56,8 +56,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); alleles.add(vc.getReference()); - for ( int i = 0; i < MAX_ALTERNATE_ALLELES_TO_GENOTYPE; i++ ) - alleles.add(vc.getAlternateAllele(i)); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE)); GLs = UnifiedGenotyperEngine.subsetAlleles(vc, alleles, false); } @@ -67,6 +66,58 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { return alleles; } + private static final class LikelihoodSum implements Comparable { + public double sum = 0.0; + public Allele allele; + + public LikelihoodSum(Allele allele) { this.allele = allele; } + + public int compareTo(LikelihoodSum other) { + final double diff = sum - other.sum; + return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0; + } + } + + private static final int PL_INDEX_OF_HOM_REF = 0; + private static final List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) + likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); + + // make sure that we've cached enough data + if ( numOriginalAltAlleles > UnifiedGenotyperEngine.PLIndexToAlleleIndex.length - 1 ) + UnifiedGenotyperEngine.calculatePLcache(numOriginalAltAlleles); + + // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype + final ArrayList GLs = getGLs(vc.getGenotypes()); + for ( final double[] likelihoods : GLs ) { + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { + int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numOriginalAltAlleles][PLindexOfBestGL]; + if ( alleles[0] != 0 ) + likelihoodSums[alleles[0]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + // don't double-count it + if ( alleles[1] != 0 && alleles[1] != alleles[0] ) + likelihoodSums[alleles[1]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + } + } + + // sort them by probability mass and choose the best ones + Collections.sort(Arrays.asList(likelihoodSums)); + final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); + for ( int i = 0; i < numAllelesToChoose; i++ ) + bestAlleles.add(likelihoodSums[i].allele); + + final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); + for ( Allele allele : vc.getAlternateAlleles() ) { + if ( bestAlleles.contains(allele) ) + orderedBestAlleles.add(allele); + } + + return orderedBestAlleles; + } + private static final ArrayList getGLs(GenotypesContext GLs) { ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 6f1f86c6d..c078be2f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -45,20 +45,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC private final boolean useAlleleFromVCF; - final LikelihoodSum[] likelihoodSums = new LikelihoodSum[4]; - - private final class LikelihoodSum implements Comparable { - public double sum = 0.0; - public Allele base; - - public LikelihoodSum(Allele base) { this.base = base; } - - public int compareTo(LikelihoodSum other) { - final double diff = sum - other.sum; - return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0; - } - } - + private final double[] likelihoodSums = new double[4]; + protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; @@ -176,27 +164,26 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref); final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal(); for ( int i = 0; i < 4; i++ ) - likelihoodSums[i] = new LikelihoodSum(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false)); - - // based on the GLs, find the alternate alleles with the most probability + likelihoodSums[i] = 0.0; + + // based on the GLs, find the alternate alleles with enough probability for ( SampleGenotypeData sampleData : sampleDataList ) { final double[] likelihoods = sampleData.GL.getLikelihoods(); final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); if ( PLindexOfBestGL != PLindexOfRef ) { int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL]; if ( alleles[0] != baseIndexOfRef ) - likelihoodSums[alleles[0]].sum += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; + likelihoodSums[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; // don't double-count it if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] ) - likelihoodSums[alleles[1]].sum += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; + likelihoodSums[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; } } - Collections.sort(Arrays.asList(likelihoodSums)); final List allelesToUse = new ArrayList(3); - for ( LikelihoodSum sum : likelihoodSums ) { - if ( sum.sum > 0.0 ) - allelesToUse.add(sum.base); + for ( int i = 0; i < 4; i++ ) { + if ( likelihoodSums[i] > 0.0 ) + allelesToUse.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false)); } return allelesToUse; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index c84c944b8..0156890ac 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -767,7 +767,7 @@ public class UnifiedGenotyperEngine { /** * @param vc variant context with genotype likelihoods - * @param allelesToUse which alleles from the vc are okay to use + * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** * @param assignGenotypes true if we should change the genotypes based on the (subsetted) PLs * @return genotypes */ @@ -860,7 +860,7 @@ public class UnifiedGenotyperEngine { return newGTs; } - protected static Genotype assignGenotype(Genotype originalGT, double[] newLikelihoods, List allelesToUse, int numNewAltAlleles, Map attrs) { + protected static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List allelesToUse, final int numNewAltAlleles, final Map attrs) { // find the genotype with maximum likelihoods int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex]; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 125242a2f..fc4f0f46b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("9ab4e98ce437a1c5e1eee338de49ee7e")); + Arrays.asList("202b337ebbea3def1be8495eb363dfa8")); executeTest("test MultiSample Pilot1", spec); } @@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("aabc4b3a312aba18b78e14750d8c8e62")); + Arrays.asList("b53cb55a5f868663068812b13578af57")); executeTest("test Multiple SNP alleles", spec); } @@ -300,7 +300,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("e4d2904b406f37d99fbe8f52ae75254f")); + Arrays.asList("c9897b80615c53a4ea10a4b193d56d9c")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -309,7 +309,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("21f7b6c8b7eaccad1754a832bac79a65")); + Arrays.asList("5282fdb1711a532d726c13507bf80a21")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); }