From 1e90d602a4c1628a0dde9764466f2c41c8aa8f9b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 13:38:20 -0500 Subject: [PATCH] Optimization: cache up front the PL index to the pair of alleles it represents for all possible numbers of alternate alleles. --- .../genotyper/ExactAFCalculationModel.java | 26 +++++++----------- .../genotyper/UnifiedGenotyperEngine.java | 27 +++++++++++++++++-- .../ExactAFCalculationModelUnitTest.java | 11 ++++---- 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 33634ce28..bc27916dd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -55,7 +55,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } private static final ArrayList getGLs(GenotypesContext GLs) { - ArrayList genotypeLikelihoods = new ArrayList(); // TODO -- initialize with size of GLs + ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { @@ -198,6 +198,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final AlleleFrequencyCalculationResult result, final boolean preserveData) { + // make sure the PL cache has been initialized + if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null ) + UnifiedGenotyperEngine.calculatePLcache(5); + final ArrayList genotypeLikelihoods = getGLs(GLs); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; @@ -415,10 +419,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { - // todo -- arent' there a small number of fixed values that this function can adopt? - // todo -- at a minimum it'd be good to partially compute some of these in ACCounts for performance - // todo -- need to cache PLIndex -> two alleles, compute looping over each PLIndex. Note all other operations are efficient - // todo -- this can be computed once at the start of the all operations // the closed form representation generalized for multiple alleles is as follows: // AA: (2j - totalK) * (2j - totalK - 1) @@ -434,25 +434,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( PLindex <= numAltAlleles ) return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK]; - int subtractor = numAltAlleles+1; - int subtractions = 0; - do { - PLindex -= subtractor; - subtractor--; - subtractions++; - } - while ( PLindex >= subtractor ); + // find the 2 alternate alleles that are represented by this PL index + int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex]; - final int k_i = ACcounts[subtractions-1]; + final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele // the hom var case (e.g. BB, CC, DD) final double coeff; - if ( PLindex == 0 ) { + if ( alleles[0] == alleles[1] ) { coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; } // the het non-ref case (e.g. BC, BD, CD) else { - final int k_j = ACcounts[subtractions+PLindex-1]; + final int k_j = ACcounts[alleles[1]-1]; coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 5d271cdb1..221d00410 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -102,6 +102,8 @@ public class UnifiedGenotyperEngine { private final GenomeLocParser genomeLocParser; private final boolean BAQEnabledOnCMDLine; + // a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles + protected static int[][][] PLIndexToAlleleIndex; // --------------------------------------------------------------------------------------------------------- @@ -135,6 +137,27 @@ public class UnifiedGenotyperEngine { genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL); filter.add(LOW_QUAL_FILTER_NAME); + calculatePLcache(UAC.MAX_ALTERNATE_ALLELES); + } + + protected static void calculatePLcache(int maxAltAlleles) { + PLIndexToAlleleIndex = new int[maxAltAlleles+1][][]; + PLIndexToAlleleIndex[0] = null; + int numLikelihoods = 1; + + // for each count of alternate alleles + for ( int altAlleles = 1; altAlleles <= maxAltAlleles; altAlleles++ ) { + numLikelihoods += altAlleles + 1; + PLIndexToAlleleIndex[altAlleles] = new int[numLikelihoods][]; + int PLindex = 0; + + // for all possible combinations of the 2 alt alleles + for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) { + for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) { + PLIndexToAlleleIndex[altAlleles][PLindex++] = new int[]{ allele1, allele2 }; + } + } + } } /** @@ -751,8 +774,8 @@ public class UnifiedGenotyperEngine { * * @return genotypes */ - public GenotypesContext assignGenotypes(VariantContext vc, - boolean[] allelesToUse) { + public GenotypesContext assignGenotypes(final VariantContext vc, + final boolean[] allelesToUse) { final GenotypesContext GLs = vc.getGenotypes(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java index 9640a8963..dec6ecb79 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -83,21 +83,20 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - final double[][] log10AlleleFrequencyLikelihoods = new double[2][2*numSamples+1]; - final double[][] log10AlleleFrequencyPosteriors = new double[2][2*numSamples+1]; + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples); for ( int i = 0; i < 2; i++ ) { for ( int j = 0; j < 2*numSamples+1; j++ ) { - log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; } } - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false); + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors[allele]); + int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]); Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } }