I had to remove some of my more agressive optimizations, as they were causing us to get slightly different results as MSG. Results in only small cost to running time.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1856 348d0f76-0448-11de-a6fe-93d51630548a
2009-10-16 00:59:32 +00:00 · 2009-10-16 00:59:32 +00:00 · 993c567bd8
parent 7d7ff09f54
commit 993c567bd8
3 changed files with 23 additions and 46 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java
@ -11,7 +11,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
    protected AllMAFsGenotypeCalculationModel() {}
-    protected double[] initializeAlleleFrequencies(int numSamples,  int[] baseCounts) {
+    protected double[] initializeAlleleFrequencies(int numSamples, char ref) {
        double[] alleleFrequencies = new double[2 * numSamples + 1];   // 2N + 1 possible minor alleles
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java
@ -11,10 +11,10 @@ import java.util.*;
 public abstract class EMGenotypeCalculationModel extends GenotypeCalculationModel {
    // We need to set a limit on the EM iterations in case something flukey goes on
-    protected static final int MAX_EM_ITERATIONS = 6;
+    protected static final int MAX_EM_ITERATIONS = 8;
-    // We consider the EM stable when the MAF doesn't change more than 1/10N
+    // We consider the EM stable when the MAF doesn't change more than 1/1000
-    protected static final double EM_STABILITY_METRIC = 0.1;
+    protected static final double EM_STABILITY_METRIC = 1e-3;
    protected EMGenotypeCalculationModel() {}
@ -22,23 +22,18 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
    public Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
        // keep track of the context for each sample, overall and separated by strand
-        int[] baseCounts = new int[4];
+        HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context);
        HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, baseCounts);
        if ( contexts == null )
            return null;
        // debugging output
        for (int i = 0; i < 4; i++)
            logger.debug("Base count for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i]);
        // run the EM calculation
-        EMOutput overall = runEM(ref, contexts, priors, baseCounts, StratifiedContext.OVERALL);
+        EMOutput overall = runEM(ref, contexts, priors, StratifiedContext.OVERALL);
        double lod = overall.getPofD() - overall.getPofNull();
        logger.debug("lod=" + lod);
        // calculate strand score
-        EMOutput forward = runEM(ref, contexts, priors, baseCounts, StratifiedContext.FORWARD);
+        EMOutput forward = runEM(ref, contexts, priors, StratifiedContext.FORWARD);
-        EMOutput reverse = runEM(ref, contexts, priors, baseCounts, StratifiedContext.REVERSE);
+        EMOutput reverse = runEM(ref, contexts, priors, StratifiedContext.REVERSE);
        double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull();
        double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull();
        logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
@ -54,11 +49,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
    protected List<GenotypeCall> genotypeCallsFromGenotypeLikelihoods(EMOutput results, char ref, HashMap<String, AlignmentContextBySample> contexts) {
        HashMap<String, GenotypeLikelihoods> GLs = results.getGenotypeLikelihoods();
        // an optimization
        double expectedChromosomes = 2.0 * (double)GLs.size() * results.getMAF();
        if ( expectedChromosomes < 1.0 )
            return new ArrayList<GenotypeCall>();
        ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>();
        int variantCalls = 0;
@ -67,7 +57,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
            AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL);
            ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
            pileup.setIncludeDeletionsInPileupString(true);
            // create the call
            GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup);
            calls.add(call);
@ -79,15 +68,15 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
        // if everyone is ref, don't emit any calls
        if ( variantCalls == 0 )
-            calls = null;
+            calls.clear();
        return calls;
    }
-    public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, int[] baseCounts, StratifiedContext contextType) {
+    public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
        // get initial allele frequencies
-        double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), baseCounts);
+        double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), ref);
        for (int i = 0; i < alleleFrequencies.length; i++)
            logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]);
@ -119,7 +108,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
        return computePofF(ref, GLs, alleleFrequencies, contexts.size());
    }
-    protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts);
+    protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, char ref);
    protected abstract HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType);
    protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs);
    protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies);
@ -127,15 +116,12 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
    protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) {
-        // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N
+        // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC
        double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)numSamplesInContext);
        // determine whether we're stable
        double AF_delta = 0.0;
        for (int i = 0; i < oldAlleleFrequencies.length; i++)
            AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]);
-        return (AF_delta < EM_STABILITY_VALUE);  
+        return (AF_delta < EM_STABILITY_METRIC);  
    }
    protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) {
@ -161,11 +147,9 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
     * Create the mapping from sample to alignment context; also, fill in the base counts along the way.
     * Returns null iff there are too many deletions at this position.
     */
-    protected HashMap<String, AlignmentContextBySample> splitContextBySample(AlignmentContext context, int[] baseCounts) {
+    protected HashMap<String, AlignmentContextBySample> splitContextBySample(AlignmentContext context) {
        HashMap<String, AlignmentContextBySample> contexts = new HashMap<String, AlignmentContextBySample>();
        for (int i = 0; i < 4; i++)
            baseCounts[i] = 0;
        int deletionsInPile = 0;
        List<SAMRecord> reads = context.getReads();
@ -197,14 +181,10 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
                // are there too many deletions in the pileup?
                if ( ++deletionsInPile > maxDeletionsInPileup )
                    return null;
            } else {
                // add bad bases to the context (for DoC calculations), but don't count them
                int baseIndex = BaseUtils.simpleBaseToBaseIndex(read.getReadString().charAt(offset));
                if ( baseIndex != -1 )
                    baseCounts[baseIndex]++;
            }
            // add the read to this sample's context
            // note that bad bases are added to the context (for DoC calculations later)
            myContext.add(read, offset);
        }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java
@ -19,7 +19,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
        if ( samples.size() == 1 ) {
            // split the context (so we can get forward/reverse contexts for free)
-            HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, new int[4]);
+            HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context);
            if ( contexts == null )
                return null;
@ -53,16 +53,13 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
        return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL);
    }
-    protected double[] initializeAlleleFrequencies(int numSamplesInContext,  int[] baseCounts) {
+    protected double[] initializeAlleleFrequencies(int numSamplesInContext, char ref) {
-        // An intelligent guess would be the observed base ratios
+        // Initialization values from Jared's original MSG code
-        // (plus some small number to account for sampling issues).
+        double NON_REF = 0.0005002502;  // heterozygosity / (2 * sqrt(1-heterozygosity)
-        int totalObservedBases = 0;
+        double REF = 0.9994999; //sqrt(1-heterozygosity)
        for (int i = 0; i < 4; i++)
            totalObservedBases += baseCounts[i];
-        double[] alleleFrequencies = new double[4];
+        double[] alleleFrequencies = {NON_REF, NON_REF, NON_REF, NON_REF};
-        for (int i = 0; i < 4; i++)
+        alleleFrequencies[BaseUtils.simpleBaseToBaseIndex(ref)] = REF;
            alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases;
        return alleleFrequencies;
    }