Optimization: cache up front the PL index to the pair of alleles it represents for all possible numbers of alternate alleles.

This commit is contained in:
Eric Banks 2011-12-14 13:38:20 -05:00
parent 988d60091f
commit 1e90d602a4
3 changed files with 40 additions and 24 deletions

View File

@ -55,7 +55,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(); // TODO -- initialize with size of GLs
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(GLs.size());
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
@ -198,6 +198,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final AlleleFrequencyCalculationResult result,
final boolean preserveData) {
// make sure the PL cache has been initialized
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null )
UnifiedGenotyperEngine.calculatePLcache(5);
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
@ -415,10 +419,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
// todo -- arent' there a small number of fixed values that this function can adopt?
// todo -- at a minimum it'd be good to partially compute some of these in ACCounts for performance
// todo -- need to cache PLIndex -> two alleles, compute looping over each PLIndex. Note all other operations are efficient
// todo -- this can be computed once at the start of the all operations
// the closed form representation generalized for multiple alleles is as follows:
// AA: (2j - totalK) * (2j - totalK - 1)
@ -434,25 +434,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( PLindex <= numAltAlleles )
return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK];
int subtractor = numAltAlleles+1;
int subtractions = 0;
do {
PLindex -= subtractor;
subtractor--;
subtractions++;
}
while ( PLindex >= subtractor );
// find the 2 alternate alleles that are represented by this PL index
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex];
final int k_i = ACcounts[subtractions-1];
final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele
// the hom var case (e.g. BB, CC, DD)
final double coeff;
if ( PLindex == 0 ) {
if ( alleles[0] == alleles[1] ) {
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
}
// the het non-ref case (e.g. BC, BD, CD)
else {
final int k_j = ACcounts[subtractions+PLindex-1];
final int k_j = ACcounts[alleles[1]-1];
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
}

View File

@ -102,6 +102,8 @@ public class UnifiedGenotyperEngine {
private final GenomeLocParser genomeLocParser;
private final boolean BAQEnabledOnCMDLine;
// a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
protected static int[][][] PLIndexToAlleleIndex;
// ---------------------------------------------------------------------------------------------------------
@ -135,6 +137,27 @@ public class UnifiedGenotyperEngine {
genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL);
filter.add(LOW_QUAL_FILTER_NAME);
calculatePLcache(UAC.MAX_ALTERNATE_ALLELES);
}
protected static void calculatePLcache(int maxAltAlleles) {
PLIndexToAlleleIndex = new int[maxAltAlleles+1][][];
PLIndexToAlleleIndex[0] = null;
int numLikelihoods = 1;
// for each count of alternate alleles
for ( int altAlleles = 1; altAlleles <= maxAltAlleles; altAlleles++ ) {
numLikelihoods += altAlleles + 1;
PLIndexToAlleleIndex[altAlleles] = new int[numLikelihoods][];
int PLindex = 0;
// for all possible combinations of the 2 alt alleles
for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) {
for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) {
PLIndexToAlleleIndex[altAlleles][PLindex++] = new int[]{ allele1, allele2 };
}
}
}
}
/**
@ -751,8 +774,8 @@ public class UnifiedGenotyperEngine {
*
* @return genotypes
*/
public GenotypesContext assignGenotypes(VariantContext vc,
boolean[] allelesToUse) {
public GenotypesContext assignGenotypes(final VariantContext vc,
final boolean[] allelesToUse) {
final GenotypesContext GLs = vc.getGenotypes();

View File

@ -83,21 +83,20 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
@Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) {
final double[][] log10AlleleFrequencyLikelihoods = new double[2][2*numSamples+1];
final double[][] log10AlleleFrequencyPosteriors = new double[2][2*numSamples+1];
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples);
for ( int i = 0; i < 2; i++ ) {
for ( int j = 0; j < 2*numSamples+1; j++ ) {
log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
}
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors[allele]);
int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]);
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
}
}