From 993c567bd88cfe8bce589a8462fc349fa99c1938 Mon Sep 17 00:00:00 2001 From: ebanks Date: Fri, 16 Oct 2009 00:59:32 +0000 Subject: [PATCH] I had to remove some of my more aggressive optimizations, as they were causing us to get slightly different results than MSG. Results in only a small cost to running time. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1856 348d0f76-0448-11de-a6fe-93d51630548a --- .../AllMAFsGenotypeCalculationModel.java | 2 +- .../genotyper/EMGenotypeCalculationModel.java | 50 ++++++------------- ...PointEstimateGenotypeCalculationModel.java | 17 +++---- 3 files changed, 23 insertions(+), 46 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java index 3a9f943e2..5ee1962e1 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java @@ -11,7 +11,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel protected AllMAFsGenotypeCalculationModel() {} - protected double[] initializeAlleleFrequencies(int numSamples, int[] baseCounts) { + protected double[] initializeAlleleFrequencies(int numSamples, char ref) { double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index 9d22fc3e7..d036425ef 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -11,10 +11,10 @@ import java.util.*; public abstract class EMGenotypeCalculationModel 
extends GenotypeCalculationModel { // We need to set a limit on the EM iterations in case something flukey goes on - protected static final int MAX_EM_ITERATIONS = 6; + protected static final int MAX_EM_ITERATIONS = 8; - // We consider the EM stable when the MAF doesn't change more than 1/10N - protected static final double EM_STABILITY_METRIC = 0.1; + // We consider the EM stable when the MAF doesn't change more than 1/1000 + protected static final double EM_STABILITY_METRIC = 1e-3; protected EMGenotypeCalculationModel() {} @@ -22,23 +22,18 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode public Pair, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { // keep track of the context for each sample, overall and separated by strand - int[] baseCounts = new int[4]; - HashMap contexts = splitContextBySample(context, baseCounts); + HashMap contexts = splitContextBySample(context); if ( contexts == null ) return null; - // debugging output - for (int i = 0; i < 4; i++) - logger.debug("Base count for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i]); - // run the EM calculation - EMOutput overall = runEM(ref, contexts, priors, baseCounts, StratifiedContext.OVERALL); + EMOutput overall = runEM(ref, contexts, priors, StratifiedContext.OVERALL); double lod = overall.getPofD() - overall.getPofNull(); logger.debug("lod=" + lod); // calculate strand score - EMOutput forward = runEM(ref, contexts, priors, baseCounts, StratifiedContext.FORWARD); - EMOutput reverse = runEM(ref, contexts, priors, baseCounts, StratifiedContext.REVERSE); + EMOutput forward = runEM(ref, contexts, priors, StratifiedContext.FORWARD); + EMOutput reverse = runEM(ref, contexts, priors, StratifiedContext.REVERSE); double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull(); double reverseLod = (reverse.getPofD() + forward.getPofNull()) - 
overall.getPofNull(); logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); @@ -54,11 +49,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode protected List genotypeCallsFromGenotypeLikelihoods(EMOutput results, char ref, HashMap contexts) { HashMap GLs = results.getGenotypeLikelihoods(); - // an optimization - double expectedChromosomes = 2.0 * (double)GLs.size() * results.getMAF(); - if ( expectedChromosomes < 1.0 ) - return new ArrayList(); - ArrayList calls = new ArrayList(); int variantCalls = 0; @@ -67,7 +57,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL); ReadBackedPileup pileup = new ReadBackedPileup(ref, context); pileup.setIncludeDeletionsInPileupString(true); - // create the call GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup); calls.add(call); @@ -79,15 +68,15 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode // if everyone is ref, don't emit any calls if ( variantCalls == 0 ) - calls = null; + calls.clear(); return calls; } - public EMOutput runEM(char ref, HashMap contexts, DiploidGenotypePriors priors, int[] baseCounts, StratifiedContext contextType) { + public EMOutput runEM(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType) { // get initial allele frequencies - double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), baseCounts); + double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), ref); for (int i = 0; i < alleleFrequencies.length; i++) logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]); @@ -119,7 +108,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode return computePofF(ref, GLs, alleleFrequencies, contexts.size()); } - protected abstract double[] 
initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts); + protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, char ref); protected abstract HashMap initializeGenotypeLikelihoods(char ref, HashMap contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType); protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap GLs); protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap GLs, double[] alleleFrequencies); @@ -127,15 +116,12 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) { - // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N - double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)numSamplesInContext); - - // determine whether we're stable + // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC double AF_delta = 0.0; for (int i = 0; i < oldAlleleFrequencies.length; i++) AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]); - return (AF_delta < EM_STABILITY_VALUE); + return (AF_delta < EM_STABILITY_METRIC); } protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) { @@ -161,11 +147,9 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode * Create the mapping from sample to alignment context; also, fill in the base counts along the way. * Returns null iff there are too many deletions at this position. 
*/ - protected HashMap splitContextBySample(AlignmentContext context, int[] baseCounts) { + protected HashMap splitContextBySample(AlignmentContext context) { HashMap contexts = new HashMap(); - for (int i = 0; i < 4; i++) - baseCounts[i] = 0; int deletionsInPile = 0; List reads = context.getReads(); @@ -197,14 +181,10 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode // are there too many deletions in the pileup? if ( ++deletionsInPile > maxDeletionsInPileup ) return null; - } else { - // add bad bases to the context (for DoC calculations), but don't count them - int baseIndex = BaseUtils.simpleBaseToBaseIndex(read.getReadString().charAt(offset)); - if ( baseIndex != -1 ) - baseCounts[baseIndex]++; } // add the read to this sample's context + // note that bad bases are added to the context (for DoC calculations later) myContext.add(read, offset); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java index fe3dee8c2..f92ef57e2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -19,7 +19,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation if ( samples.size() == 1 ) { // split the context (so we can get forward/reverse contexts for free) - HashMap contexts = splitContextBySample(context, new int[4]); + HashMap contexts = splitContextBySample(context); if ( contexts == null ) return null; @@ -53,16 +53,13 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation return new Pair(pileup, GL); } - protected double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts) { - // An intelligent guess would be the observed base ratios - // 
(plus some small number to account for sampling issues). - int totalObservedBases = 0; - for (int i = 0; i < 4; i++) - totalObservedBases += baseCounts[i]; + protected double[] initializeAlleleFrequencies(int numSamplesInContext, char ref) { + // Initialization values from Jared's original MSG code + double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1-heterozygosity)) + double REF = 0.9994999; // sqrt(1-heterozygosity) - double[] alleleFrequencies = new double[4]; - for (int i = 0; i < 4; i++) - alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases; + double[] alleleFrequencies = {NON_REF, NON_REF, NON_REF, NON_REF}; + alleleFrequencies[BaseUtils.simpleBaseToBaseIndex(ref)] = REF; return alleleFrequencies; }