Optimization: cache up front the PL index to the pair of alleles it represents for all possible numbers of alternate alleles.

This commit is contained in:
Eric Banks 2011-12-14 13:38:20 -05:00
parent 988d60091f
commit 1e90d602a4
3 changed files with 40 additions and 24 deletions

View File

@ -55,7 +55,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
} }
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) { private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(); // TODO -- initialize with size of GLs ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(GLs.size());
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
@ -198,6 +198,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final AlleleFrequencyCalculationResult result, final AlleleFrequencyCalculationResult result,
final boolean preserveData) { final boolean preserveData) {
// make sure the PL cache has been initialized
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null )
UnifiedGenotyperEngine.calculatePLcache(5);
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs); final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1; final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples; final int numChr = 2*numSamples;
@ -415,10 +419,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
} }
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
// todo -- arent' there a small number of fixed values that this function can adopt?
// todo -- at a minimum it'd be good to partially compute some of these in ACCounts for performance
// todo -- need to cache PLIndex -> two alleles, compute looping over each PLIndex. Note all other operations are efficient
// todo -- this can be computed once at the start of the all operations
// the closed form representation generalized for multiple alleles is as follows: // the closed form representation generalized for multiple alleles is as follows:
// AA: (2j - totalK) * (2j - totalK - 1) // AA: (2j - totalK) * (2j - totalK - 1)
@ -434,25 +434,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( PLindex <= numAltAlleles ) if ( PLindex <= numAltAlleles )
return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK]; return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK];
int subtractor = numAltAlleles+1; // find the 2 alternate alleles that are represented by this PL index
int subtractions = 0; int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex];
do {
PLindex -= subtractor;
subtractor--;
subtractions++;
}
while ( PLindex >= subtractor );
final int k_i = ACcounts[subtractions-1]; final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele
// the hom var case (e.g. BB, CC, DD) // the hom var case (e.g. BB, CC, DD)
final double coeff; final double coeff;
if ( PLindex == 0 ) { if ( alleles[0] == alleles[1] ) {
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
} }
// the het non-ref case (e.g. BC, BD, CD) // the het non-ref case (e.g. BC, BD, CD)
else { else {
final int k_j = ACcounts[subtractions+PLindex-1]; final int k_j = ACcounts[alleles[1]-1];
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
} }

View File

@ -102,6 +102,8 @@ public class UnifiedGenotyperEngine {
private final GenomeLocParser genomeLocParser; private final GenomeLocParser genomeLocParser;
private final boolean BAQEnabledOnCMDLine; private final boolean BAQEnabledOnCMDLine;
// a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
protected static int[][][] PLIndexToAlleleIndex;
// --------------------------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------------------------
@ -135,6 +137,27 @@ public class UnifiedGenotyperEngine {
genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL); genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL);
filter.add(LOW_QUAL_FILTER_NAME); filter.add(LOW_QUAL_FILTER_NAME);
calculatePLcache(UAC.MAX_ALTERNATE_ALLELES);
}
protected static void calculatePLcache(int maxAltAlleles) {
PLIndexToAlleleIndex = new int[maxAltAlleles+1][][];
PLIndexToAlleleIndex[0] = null;
int numLikelihoods = 1;
// for each count of alternate alleles
for ( int altAlleles = 1; altAlleles <= maxAltAlleles; altAlleles++ ) {
numLikelihoods += altAlleles + 1;
PLIndexToAlleleIndex[altAlleles] = new int[numLikelihoods][];
int PLindex = 0;
// for all possible combinations of the 2 alt alleles
for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) {
for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) {
PLIndexToAlleleIndex[altAlleles][PLindex++] = new int[]{ allele1, allele2 };
}
}
}
} }
/** /**
@ -751,8 +774,8 @@ public class UnifiedGenotyperEngine {
* *
* @return genotypes * @return genotypes
*/ */
public GenotypesContext assignGenotypes(VariantContext vc, public GenotypesContext assignGenotypes(final VariantContext vc,
boolean[] allelesToUse) { final boolean[] allelesToUse) {
final GenotypesContext GLs = vc.getGenotypes(); final GenotypesContext GLs = vc.getGenotypes();

View File

@ -83,21 +83,20 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
@Test(dataProvider = "getGLs") @Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) { public void testGLs(GetGLsTest cfg) {
final double[][] log10AlleleFrequencyLikelihoods = new double[2][2*numSamples+1]; final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples);
final double[][] log10AlleleFrequencyPosteriors = new double[2][2*numSamples+1];
for ( int i = 0; i < 2; i++ ) { for ( int i = 0; i < 2; i++ ) {
for ( int j = 0; j < 2*numSamples+1; j++ ) { for ( int j = 0; j < 2*numSamples+1; j++ ) {
log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
} }
} }
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false); ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false);
int nameIndex = 1; int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors[allele]); int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]);
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
} }
} }