Small optimizations to the joint calling code.

Thanks to profiling I noticed that the determineCoefficient() method was being called too often. Because it returns a constant result in half of the invocations, its value should be cached when possible. Also, the various calls to getLog10Likelihoods() showed up in the profiler, so I pulled those out too. All told, it speeds up the genotyping by about 10 percent according to the profiler.
2016-01-11 22:22:37 -05:00 · 2016-01-11 22:22:37 -05:00 · ab2f541d1f
parent 7f7ba446c0
commit ab2f541d1f
1 changed files with 18 additions and 11 deletions
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/DiploidExactAFCalculator.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/DiploidExactAFCalculator.java
@ -283,19 +283,29 @@ public abstract class DiploidExactAFCalculator extends ExactAFCalculator {
                          final int PLsetIndex,
                          final ArrayList<double[]> genotypeLikelihoods) {
        final int totalK = targetSet.getACsum();
+        final double[] targetLog10Likelihoods = targetSet.getLog10Likelihoods();
+        final double[] dependentLog10Likelihoods = dependentSet.getLog10Likelihoods();
+        final int[] targetACcounts = targetSet.getACcounts().getCounts();

-        for ( int j = 1; j < targetSet.getLog10Likelihoods().length; j++ ) {
+        // skip impossible conformations, namely those for which there aren't enough possible chromosomes (2*j in the if loop below)
+        // to fill up the totalK alternate alleles needed; we do want to ensure that there's at least one sample included (hence the Math.max)
+        final int firstIndex = Math.max(1, (totalK + 1) / 2);

-            if ( totalK <= 2*j ) { // skip impossible conformations
-                final double[] gl = genotypeLikelihoods.get(j);
-                final double conformationValue =
-                        determineCoefficient(PLsetIndex, j, targetSet.getACcounts().getCounts(), totalK) + dependentSet.getLog10Likelihoods()[j-1] + gl[PLsetIndex];
-                targetSet.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(targetSet.getLog10Likelihoods()[j], conformationValue);
-            }
+        // find the 2 alleles that are represented by this PL index
+        final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLsetIndex);
+        // if neither allele is reference then the coefficient is constant throughout this invocation
+        final Double constantCoefficient = (alleles.alleleIndex1 == 0) ? null : determineCoefficient(alleles, 0, targetACcounts, totalK);
+
+        for ( int j = firstIndex; j < targetLog10Likelihoods.length; j++ ) {
+            final double[] gl = genotypeLikelihoods.get(j);
+            final double coefficient = (constantCoefficient != null) ? constantCoefficient : determineCoefficient(alleles, j, targetACcounts, totalK);
+
+            final double conformationValue = coefficient + dependentLog10Likelihoods[j-1] + gl[PLsetIndex];
+            targetLog10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetLog10Likelihoods[j], conformationValue);
        }
    }

-    private double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
+    private double determineCoefficient(final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles, final int j, final int[] ACcounts, final int totalK) {
        // the closed form representation generalized for multiple alleles is as follows:
        // AA: (2j - totalK) * (2j - totalK - 1)
        // AB: 2k_b * (2j - totalK)
@ -304,9 +314,6 @@ public abstract class DiploidExactAFCalculator extends ExactAFCalculator {
        // BC: 2 * k_b * k_c
        // CC: k_c * (k_c - 1)

-        // find the 2 alleles that are represented by this PL index
-        GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
-
        // *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
        // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does.   ***