Based on Ryan's suggestion, there's a new contract for genotyping multiple alleles. Now the requester submits alleles in any arbitrary order - rankings aren't needed. If the Exact model decides that it needs to subset the alleles because too many were requested, it does so based on PL mass (in other words, I moved this code from the SNPGenotypeLikelihoodsCalculationModel to the Exact model). Now subsetting alleles is consistent.
This commit is contained in:
parent
7a937dd1eb
commit
f53cd3de1b
|
|
@ -56,8 +56,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
|
||||
alleles.add(vc.getReference());
|
||||
for ( int i = 0; i < MAX_ALTERNATE_ALLELES_TO_GENOTYPE; i++ )
|
||||
alleles.add(vc.getAlternateAllele(i));
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE));
|
||||
GLs = UnifiedGenotyperEngine.subsetAlleles(vc, alleles, false);
|
||||
}
|
||||
|
||||
|
|
@ -67,6 +66,58 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
return alleles;
|
||||
}
|
||||
|
||||
private static final class LikelihoodSum implements Comparable<LikelihoodSum> {
|
||||
public double sum = 0.0;
|
||||
public Allele allele;
|
||||
|
||||
public LikelihoodSum(Allele allele) { this.allele = allele; }
|
||||
|
||||
public int compareTo(LikelihoodSum other) {
|
||||
final double diff = sum - other.sum;
|
||||
return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
private static final int PL_INDEX_OF_HOM_REF = 0;
|
||||
private static final List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
|
||||
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
|
||||
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
|
||||
for ( int i = 0; i < numOriginalAltAlleles; i++ )
|
||||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||
|
||||
// make sure that we've cached enough data
|
||||
if ( numOriginalAltAlleles > UnifiedGenotyperEngine.PLIndexToAlleleIndex.length - 1 )
|
||||
UnifiedGenotyperEngine.calculatePLcache(numOriginalAltAlleles);
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
|
||||
for ( final double[] likelihoods : GLs ) {
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
|
||||
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numOriginalAltAlleles][PLindexOfBestGL];
|
||||
if ( alleles[0] != 0 )
|
||||
likelihoodSums[alleles[0]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
// don't double-count it
|
||||
if ( alleles[1] != 0 && alleles[1] != alleles[0] )
|
||||
likelihoodSums[alleles[1]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
}
|
||||
}
|
||||
|
||||
// sort them by probability mass and choose the best ones
|
||||
Collections.sort(Arrays.asList(likelihoodSums));
|
||||
final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( int i = 0; i < numAllelesToChoose; i++ )
|
||||
bestAlleles.add(likelihoodSums[i].allele);
|
||||
|
||||
final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( Allele allele : vc.getAlternateAlleles() ) {
|
||||
if ( bestAlleles.contains(allele) )
|
||||
orderedBestAlleles.add(allele);
|
||||
}
|
||||
|
||||
return orderedBestAlleles;
|
||||
}
|
||||
|
||||
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
|
||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(GLs.size());
|
||||
|
||||
|
|
|
|||
|
|
@ -45,20 +45,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
|
||||
private final boolean useAlleleFromVCF;
|
||||
|
||||
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[4];
|
||||
|
||||
private final class LikelihoodSum implements Comparable<LikelihoodSum> {
|
||||
public double sum = 0.0;
|
||||
public Allele base;
|
||||
|
||||
public LikelihoodSum(Allele base) { this.base = base; }
|
||||
|
||||
public int compareTo(LikelihoodSum other) {
|
||||
final double diff = sum - other.sum;
|
||||
return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
private final double[] likelihoodSums = new double[4];
|
||||
|
||||
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||
|
|
@ -176,27 +164,26 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref);
|
||||
final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal();
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
likelihoodSums[i] = new LikelihoodSum(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false));
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability
|
||||
likelihoodSums[i] = 0.0;
|
||||
|
||||
// based on the GLs, find the alternate alleles with enough probability
|
||||
for ( SampleGenotypeData sampleData : sampleDataList ) {
|
||||
final double[] likelihoods = sampleData.GL.getLikelihoods();
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
if ( PLindexOfBestGL != PLindexOfRef ) {
|
||||
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL];
|
||||
if ( alleles[0] != baseIndexOfRef )
|
||||
likelihoodSums[alleles[0]].sum += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
||||
likelihoodSums[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
||||
// don't double-count it
|
||||
if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] )
|
||||
likelihoodSums[alleles[1]].sum += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
||||
likelihoodSums[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
||||
}
|
||||
}
|
||||
|
||||
Collections.sort(Arrays.asList(likelihoodSums));
|
||||
final List<Allele> allelesToUse = new ArrayList<Allele>(3);
|
||||
for ( LikelihoodSum sum : likelihoodSums ) {
|
||||
if ( sum.sum > 0.0 )
|
||||
allelesToUse.add(sum.base);
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
if ( likelihoodSums[i] > 0.0 )
|
||||
allelesToUse.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false));
|
||||
}
|
||||
|
||||
return allelesToUse;
|
||||
|
|
|
|||
|
|
@ -767,7 +767,7 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
/**
|
||||
* @param vc variant context with genotype likelihoods
|
||||
* @param allelesToUse which alleles from the vc are okay to use
|
||||
* @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC ***
|
||||
* @param assignGenotypes true if we should change the genotypes based on the (subsetted) PLs
|
||||
* @return genotypes
|
||||
*/
|
||||
|
|
@ -860,7 +860,7 @@ public class UnifiedGenotyperEngine {
|
|||
return newGTs;
|
||||
}
|
||||
|
||||
protected static Genotype assignGenotype(Genotype originalGT, double[] newLikelihoods, List<Allele> allelesToUse, int numNewAltAlleles, Map<String, Object> attrs) {
|
||||
protected static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List<Allele> allelesToUse, final int numNewAltAlleles, final Map<String, Object> attrs) {
|
||||
// find the genotype with maximum likelihoods
|
||||
int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
|
||||
int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex];
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
|
||||
Arrays.asList("9ab4e98ce437a1c5e1eee338de49ee7e"));
|
||||
Arrays.asList("202b337ebbea3def1be8495eb363dfa8"));
|
||||
executeTest("test MultiSample Pilot1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("aabc4b3a312aba18b78e14750d8c8e62"));
|
||||
Arrays.asList("b53cb55a5f868663068812b13578af57"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -300,7 +300,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
|
||||
Arrays.asList("e4d2904b406f37d99fbe8f52ae75254f"));
|
||||
Arrays.asList("c9897b80615c53a4ea10a4b193d56d9c"));
|
||||
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
|
||||
}
|
||||
|
||||
|
|
@ -309,7 +309,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
|
||||
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
|
||||
Arrays.asList("21f7b6c8b7eaccad1754a832bac79a65"));
|
||||
Arrays.asList("5282fdb1711a532d726c13507bf80a21"));
|
||||
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue