Optimizations:

1. Only do calculations in UG for the alternate allele with the highest sum of quality scores; this also fixes a precision problem we were having. (A minimal sketch of the idea follows below.)
2. Avoid using Strings in DiploidGenotype when we can; String creation was taking 1.5% of total compute according to JProfiler. (Also sketched below.)

UG now runs in half the time for the JOINT_ESTIMATE model.
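Idea 1 in isolation, as a minimal self-contained sketch. The array-based pileup and all names here are illustrative stand-ins, not the real GATK classes:

    // Pick the single non-ref base with the highest summed base quality,
    // rather than evaluating likelihoods for all three alternate alleles.
    public class BestAltAlleleSketch {
        private static final char[] BASES = { 'A', 'C', 'G', 'T' };

        // returns the non-ref base with the largest quality-score sum,
        // or null if no non-ref base was observed
        public static Character bestAlternateAllele(char ref, char[] bases, byte[] quals) {
            int[] qualSums = new int[4];
            for (int i = 0; i < bases.length; i++) {
                int index = baseIndex(bases[i]);
                if (index >= 0)
                    qualSums[index] += quals[i];
            }
            Character best = null;
            int maxSum = 0;
            for (char alt : BASES) {
                if (alt == ref)
                    continue;
                if (qualSums[baseIndex(alt)] > maxSum) {
                    maxSum = qualSums[baseIndex(alt)];
                    best = alt;
                }
            }
            return best;
        }

        private static int baseIndex(char base) {
            switch (Character.toUpperCase(base)) {
                case 'A': return 0;
                case 'C': return 1;
                case 'G': return 2;
                case 'T': return 3;
            }
            return -1;   // anything else (e.g. N) is ignored
        }

        public static void main(String[] args) {
            char[] bases = { 'A', 'A', 'C', 'C', 'C', 'G' };
            byte[] quals = { 30, 30, 20, 20, 20, 40 };
            // C's quality sum (60) beats G's (40), so downstream likelihood
            // calculations would run for C only
            System.out.println(bestAlternateAllele('A', bases, quals));   // prints C
        }
    }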
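And idea 2, a hedged before/after sketch (the toy enum stands in for DiploidGenotype): the old path builds and uppercases a fresh two-character String and resolves it via Enum.valueOf on every call, while the switch is a constant-time branch with no allocation.

    // Before/after comparison of the hom-genotype lookup.
    public class HomGenotypeLookupSketch {
        enum Genotype { AA, CC, GG, TT }

        // old style: allocates a String per call and resolves it by name
        static Genotype viaString(char hom) {
            return Genotype.valueOf((String.valueOf(hom) + String.valueOf(hom)).toUpperCase());
        }

        // new style: no allocation, just a branch
        static Genotype viaSwitch(char hom) {
            switch (Character.toUpperCase(hom)) {
                case 'A': return Genotype.AA;
                case 'C': return Genotype.CC;
                case 'G': return Genotype.GG;
                case 'T': return Genotype.TT;
            }
            throw new IllegalArgumentException(hom + " is not a valid base character");
        }

        public static void main(String[] args) {
            // both agree on valid input, but only viaSwitch avoids per-call
            // garbage -- which matters in a per-sample, per-locus inner loop
            System.out.println(viaString('g') == viaSwitch('g'));   // prints true
            System.out.println(viaSwitch('t'));                     // prints TT
        }
    }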



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2141 348d0f76-0448-11de-a6fe-93d51630548a
Author: ebanks
Date:   2009-11-24 16:27:39 +00:00
Parent: a59e5b5e1a
Commit: b3f561710f

3 changed files with 78 additions and 43 deletions

JointEstimateGenotypeCalculationModel.java

@@ -9,6 +9,8 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import java.util.*;
 import java.io.PrintWriter;
+import net.sf.samtools.SAMRecord;
 
 public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel {
     // because the null allele frequencies are constant for a given N,
@@ -17,7 +19,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
     // because the Hardy-Weinberg values for a given frequency are constant,
     // we cache the results to avoid having to recompute everything
-    private HashMap<Double, double[]> hardyWeinbergValueCache = new HashMap<Double, double[]>();
+    // private HashMap<Double, double[]> hardyWeinbergValueCache = new HashMap<Double, double[]>();
 
     // the allele frequency priors
     protected double[] log10AlleleFrequencyPriors;
@@ -27,6 +29,9 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
     protected double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][];
     protected double[] PofFs = new double[BaseUtils.BASES.length];
 
+    // the alternate allele with the largest sum of quality scores
+    protected Character bestAlternateAllele = null;
+
     protected JointEstimateGenotypeCalculationModel() {}
@@ -40,6 +45,12 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
         int numSamples = getNSamples(contexts);
         int frequencyEstimationPoints = (2 * numSamples) + 1;   // (add 1 for allele frequency of zero)
 
+        // find the alternate allele with the largest sum of quality scores
+        initializeBestAlternateAllele(ref, context);
+
+        // if there are no non-ref bases and we don't want all bases, then we can just return
+        if ( !ALL_BASE_MODE && bestAlternateAllele == null )
+            return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
+
         initializeAlleleFrequencies(frequencyEstimationPoints);
         initialize(ref, contexts, StratifiedContext.OVERALL);
@@ -61,6 +72,39 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
         return contexts.size();
     }
 
+    protected void initializeBestAlternateAllele(char ref, AlignmentContext context) {
+        int[] qualCounts = new int[4];
+
+        // calculate the sum of quality scores for each base
+        List<SAMRecord> reads = context.getReads();
+        List<Integer> offsets = context.getOffsets();
+        for (int i = 0; i < reads.size(); i++) {
+            int offset = offsets.get(i);
+            // ignore deletions
+            if ( offset == -1 )
+                continue;
+            SAMRecord read = reads.get(i);
+            char base = (char)read.getReadBases()[offset];
+            int index = BaseUtils.simpleBaseToBaseIndex(base);
+            if ( index >= 0 )
+                qualCounts[index] += read.getBaseQualities()[offset];
+        }
+
+        // set the non-ref base with maximum quality score sum
+        int maxCount = 0;
+        bestAlternateAllele = null;
+        for ( char altAllele : BaseUtils.BASES ) {
+            if ( altAllele == ref )
+                continue;
+            int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
+            if ( qualCounts[index] > maxCount ) {
+                maxCount = qualCounts[index];
+                bestAlternateAllele = altAllele;
+            }
+        }
+    }
+
     protected void initializeVerboseWriter(PrintWriter verboseWriter) {
         StringBuilder header = new StringBuilder("AFINFO\tLOC\tMAF\tF\tNullAFpriors\t");
         for ( char altAllele : BaseUtils.BASES ) {
@@ -122,13 +166,8 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
             log10PofDgivenAFi[baseIndex] = new double[frequencyEstimationPoints];
         }
 
-        // for each alternate allele
-        for ( char altAllele : BaseUtils.BASES ) {
-            if ( altAllele == ref )
-                continue;
-            calculatelog10PofDgivenAFforAllF(ref, altAllele, frequencyEstimationPoints-1, contexts, contextType);
-        }
+        // only calculate for the most likely alternate allele
+        calculatelog10PofDgivenAFforAllF(ref, bestAlternateAllele, frequencyEstimationPoints-1, contexts, contextType);
     }
 
     /********************************************************************************/
@@ -174,26 +213,22 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
      * @param frequencyEstimationPoints number of allele frequencies
      */
     protected void calculatePofFs(char ref, int frequencyEstimationPoints) {
-        // for each alternate allele
-        for ( char altAllele : BaseUtils.BASES ) {
-            if ( altAllele == ref )
-                continue;
+        // only calculate for the most likely alternate allele
+        int baseIndex = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);
-            int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
-            // multiply by null allele frequency priors to get AF posteriors, then normalize
-            for (int i = 0; i < frequencyEstimationPoints; i++)
-                alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i];
-            alleleFrequencyPosteriors[baseIndex] = MathUtils.normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]);
+        // multiply by null allele frequency priors to get AF posteriors, then normalize
+        for (int i = 0; i < frequencyEstimationPoints; i++)
+            alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i];
+        alleleFrequencyPosteriors[baseIndex] = MathUtils.normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]);
-            // calculate p(f>0)
-            double sum = 0.0;
-            for (int i = 1; i < frequencyEstimationPoints; i++)
-                sum += alleleFrequencyPosteriors[baseIndex][i];
-            PofFs[baseIndex] = Math.min(sum, 1.0);   // deal with precision errors
-        }
+        // calculate p(f>0)
+        double sum = 0.0;
+        for (int i = 1; i < frequencyEstimationPoints; i++)
+            sum += alleleFrequencyPosteriors[baseIndex][i];
+        PofFs[baseIndex] = Math.min(sum, 1.0);   // deal with precision errors
     }
 
+    /***
     protected double[] getHardyWeinbergValues(double f) {
         double[] HWvalues = hardyWeinbergValueCache.get(f);
@@ -219,7 +254,13 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
         return HWvalues;
     }
+    ***/
 
+    /**
+     * @param ref                          the ref base
+     * @param loc                          the GenomeLoc
+     * @param frequencyEstimationPoints    number of allele frequencies
+     */
     protected void printAlleleFrequencyData(char ref, GenomeLoc loc, int frequencyEstimationPoints) {
         for (int i = 0; i < frequencyEstimationPoints; i++) {
             StringBuilder AFline = new StringBuilder("AFINFO\t");
@@ -256,21 +297,8 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
     }
 
     protected Pair<List<Genotype>, GenotypeLocusData> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
-        // first, find the alt allele with maximum confidence
-        int indexOfMax = 0;
-        char baseOfMax = ref;
-        double maxConfidence = Double.MIN_VALUE;
-        for ( char altAllele : BaseUtils.BASES ) {
-            if ( altAllele != ref ) {
-                int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
-                if ( PofFs[baseIndex] > maxConfidence ||
-                     (MathUtils.compareDoubles(PofFs[baseIndex], maxConfidence) == 0 && log10PofDgivenAFi[baseIndex][0] < log10PofDgivenAFi[indexOfMax][0]) ) {
-                    indexOfMax = baseIndex;
-                    baseOfMax = altAllele;
-                    maxConfidence = PofFs[baseIndex];
-                }
-            }
-        }
+        // only need to look at the most likely alternate allele
+        int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);
 
         double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
         if ( Double.isInfinite(phredScaledConfidence) )
@@ -282,7 +310,7 @@ public abstract class JointEstimateGenotypeCalc
             return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
 
         // populate the sample-specific data
-        List<Genotype> calls = makeGenotypeCalls(ref, baseOfMax, contexts, loc);
+        List<Genotype> calls = makeGenotypeCalls(ref, bestAlternateAllele, contexts, loc);
 
         // next, the general locus data
         // note that calculating strand bias involves overwriting data structures, so we do that last
@@ -292,7 +320,7 @@ public abstract class JointEstimateGenotypeCalc
             ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
         }
 
         if ( locusdata instanceof AlternateAlleleBacked ) {
-            ((AlternateAlleleBacked)locusdata).setAlternateAllele(baseOfMax);
+            ((AlternateAlleleBacked)locusdata).setAlternateAllele(bestAlternateAllele);
         }
 
         if ( locusdata instanceof AlleleFrequencyBacked ) {
             ((AlleleFrequencyBacked)locusdata).setAlleleFrequency((double)bestAFguess / (double)(frequencyEstimationPoints-1));

DiploidGenotype.java

@@ -59,7 +59,14 @@ public enum DiploidGenotype {
      * @return the diploid genotype
      */
     public static DiploidGenotype createHomGenotype(char hom) {
-        return DiploidGenotype.valueOf((String.valueOf(hom) + String.valueOf(hom)).toUpperCase());
+        hom = Character.toUpperCase(hom);
+        switch (hom) {
+            case 'A': return DiploidGenotype.AA;
+            case 'C': return DiploidGenotype.CC;
+            case 'G': return DiploidGenotype.GG;
+            case 'T': return DiploidGenotype.TT;
+        }
+        throw new IllegalArgumentException(hom + " is not a valid base character");
     }
 
     /**

UnifiedGenotyperIntegrationTest.java

@@ -89,7 +89,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
     public void testMultiSamplePilot2Joint() {
         WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                 "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
-                Arrays.asList("cec6081d89ada9b2924fe38c7a021921"));
+                Arrays.asList("7eb10aec21497da727eeb67bb5c5a743"));
         executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
     }