Better organization of Genotype likelihood calculations. NewHotness is now just GenotypeLikelihoods. There are 1, 3, and empirical base error models available as subclasses, along with a simple way to make this (see the factory).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1481 348d0f76-0448-11de-a6fe-93d51630548a
2009-08-30 19:16:30 +00:00 · 2009-08-30 19:16:30 +00:00 · 49a7babb2c
parent 522e4a77ae
commit 49a7babb2c
10 changed files with 625 additions and 74 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java
@ -0,0 +1,7 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+public enum BaseMismatchModel {
+    ONE_STATE,
+    THREE_STATE,
+    EMPIRICAL
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java
@ -3,21 +3,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
 import org.broadinstitute.sting.utils.BaseUtils;

 import static java.lang.Math.log10;
-import static java.lang.Math.pow;
 import java.util.TreeMap;
 import java.util.EnumMap;

 import net.sf.samtools.SAMRecord;
 import net.sf.samtools.SAMReadGroupRecord;

-/**
- * Created by IntelliJ IDEA.
- * User: mdepristo
- * Date: Aug 23, 2009
- * Time: 9:47:33 PM
- * To change this template use File | Settings | File Templates.
- */
-public class EmpiricalSubstitutionGenotypeLikelihoods extends NewHotnessGenotypeLikelihoods {
+public class EmpiricalSubstitutionGenotypeLikelihoods extends GenotypeLikelihoods {
    // --------------------------------------------------------------------------------------------------------------
    //
    // Static methods to manipulate machine platforms
@ -120,27 +112,6 @@ public class EmpiricalSubstitutionGenotypeLikelihoods extends NewHotnessGenotype
        addMisCall(pl, 'T', 'C', 22.1/100.0);
        addMisCall(pl, 'T', 'G', 32.0/100.0);
    }
-//
-//    // TODO -- delete me for testing only
-//    private static void addSolexa() {
-//        SequencerPlatform pl = SequencerPlatform.SOLEXA;
-//        addMisCall(pl, 'A', 'C', 59.2/100.0);
-//        addMisCall(pl, 'A', 'G', 15.3/100.0);
-//        addMisCall(pl, 'A', 'T', 25.6/100.0);
-//
-//        addMisCall(pl, 'C', 'A', 54.2/100.0);
-//        addMisCall(pl, 'C', 'G', 10.3/100.0);
-//        addMisCall(pl, 'C', 'T', 35.5/100.0);
-//
-//        addMisCall(pl, 'G', 'A', 26.4/100.0);
-//        addMisCall(pl, 'G', 'C',  5.6/100.0);
-//        addMisCall(pl, 'G', 'T', 68.0/100.0);
-//
-//        addMisCall(pl, 'T', 'A', 41.8/100.0);
-//        addMisCall(pl, 'T', 'C', 17.3/100.0);
-//        addMisCall(pl, 'T', 'G', 40.9/100.0);
-//
-//    }

    private static void addSOLiD() {
        SequencerPlatform pl = SequencerPlatform.SOLID;
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java
@ -1,18 +1,490 @@
 package org.broadinstitute.sting.gatk.walkers.genotyper;

-import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.MathUtils;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.utils.*;

 import static java.lang.Math.log10;
 import static java.lang.Math.pow;

-public abstract class GenotypeLikelihoods {
-    // precalculate these for performance (pow/log10 is expensive!)
+/**
+ * Stable, error checking version of the Bayesian genotyper.  Useful for calculating the likelihoods, priors,
+ * and posteriors given a pile of bases and quality scores
+ *
+ * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN.  This object
+ * calculates:
+ *
+ * P(G | D) = P(G) * P(D | G)
+ *
+ * where
+ *
+ * P(D | G) = sum_i log10 P(bi | G)
+ *
+ * and
+ *
+ * P(bi | G) = 1 - P(error | q1) if bi is in G
+ *           = P(error | q1) / 3 if bi is not in G
+ *
+ * for homozygous genotypes and for heterozygous genotypes:
+ *
+ * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G
+ *           = P(error | q1) / 3 if bi is not in G
+ *
+ * for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
+ *
+ * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
+ *
+ * The priors contain the relative probabilities of each genotype, and must be provided at object creation.
+ * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
+ * model.
+ */
+public abstract class GenotypeLikelihoods implements Cloneable {
+    private final static int FIXED_PLOIDY = 2;
+    private final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
+
+    //
+    // The three fundamental data arrays associated with a Genotype Likelhoods object
+    //
+    protected double[] likelihoods = null;
+    protected double[] posteriors = null;
+
+    private DiploidGenotypePriors priors = null;

    /**
-     * SOLID data uses Q0 bases to represent reference-fixed bases -- they shouldn't be counted
-     * during GL calculations.  If this field is true, Q0 bases will be removed in add().
+     * If true, lots of output will be generated about the likelihoods of individual genotypes at each site
     */
-    protected boolean filterQ0Bases = true;
-    //public abstract int add(char ref, char read, byte qual);
-}
+    private boolean verbose = false;
+
+    /**
+     * Bases with Q scores below this threshold aren't included in the likelihood calculation
+     */
+    private int minQScoreToInclude = 0;
+
+    /**
+     * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
+     */
+    public GenotypeLikelihoods() {
+        this.priors = new DiploidGenotypePriors();
+        initialize();
+    }
+
+    /**
+     * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
+     */
+    public GenotypeLikelihoods(DiploidGenotypePriors priors) {
+        this.priors = priors;
+        initialize();
+    }
+
+    /**
+     * Cloning of the object
+     * @return
+     * @throws CloneNotSupportedException
+     */
+    protected Object clone() throws CloneNotSupportedException {
+        GenotypeLikelihoods c = (GenotypeLikelihoods)super.clone();
+        c.priors = priors;
+        c.likelihoods = likelihoods.clone();
+        c.posteriors = posteriors.clone();
+        return c;
+    }
+
+    private void initialize() {
+        likelihoods = zeros.clone();            // likelihoods are all zeros
+        posteriors = priors.getPriors().clone();            // posteriors are all the priors
+    }
+
+
+    public void setVerbose(boolean v) {
+        verbose = v;
+    }
+
+    public boolean isVerbose() {
+        return verbose;
+    }
+
+    public int getMinQScoreToInclude() {
+        return minQScoreToInclude;
+    }
+
+    public void setMinQScoreToInclude(int minQScoreToInclude) {
+        this.minQScoreToInclude = minQScoreToInclude;
+    }
+
+    /**
+     * Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values()
+     * @return
+     */
+    public double[] getLikelihoods() {
+        return likelihoods;
+    }
+
+    /**
+     * Returns the likelihood associated with DiploidGenotype g
+     * @param g
+     * @return log10 likelihood as a double
+     */
+    public double getLikelihood(DiploidGenotype g) {
+        return getLikelihoods()[g.ordinal()];
+    }
+
+    /**
+     * Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
+     *
+     * @return raw log10 (not-normalized posteriors) as a double array
+     */
+    public double[] getPosteriors() {
+        return posteriors;
+    }
+
+    /**
+     * Returns the posterior associated with DiploidGenotype g
+     * @param g
+     * @return raw log10 (not-normalized posteror) as a double
+     */
+    public double getPosterior(DiploidGenotype g) {
+        return getPosteriors()[g.ordinal()];
+    }
+
+    public DiploidGenotypePriors getPriorObject() {
+        return priors;
+    }
+
+    /**
+     * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
+     *
+     * @return log10 prior as a double array
+     */
+    public double[] getPriors() {
+        return priors.getPriors();
+    }
+
+    /**
+     * Returns the prior associated with DiploidGenotype g
+     * @param g
+     * @return log10 prior as a double
+     */
+    public double getPrior(DiploidGenotype g) {
+        return getPriors()[g.ordinal()];
+    }
+
+    // -----------------------------------------------------------------------------------------------------------------
+    //
+    //
+    // add() -- the heart of 
+    //
+    //
+    // -----------------------------------------------------------------------------------------------------------------
+
+    /**
+     * Updates likelihoods and posteriors to reflect an additional observation of observedBase with
+     * qualityScore.
+     *
+     * @param observedBase
+     * @param qualityScore
+     * @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example)
+     */
+    public int add(char observedBase, byte qualityScore, SAMRecord read, int offset) {
+        return reallyAdd(observedBase, qualityScore, read, offset, true);
+    }
+
+    /**
+     * Updates likelihoods and posteriors to reflect the additional observations contained within the
+     * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
+     * pileup
+     *
+     * @param pileup
+     * @return the number of good bases found in the pileup
+     */
+    public int add(ReadBackedPileup pileup, boolean ignoreBadBases) {
+        int n = 0;
+
+        for (int i = 0; i < pileup.getReads().size(); i++) {
+            SAMRecord read = pileup.getReads().get(i);
+            int offset = pileup.getOffsets().get(i);
+            char base = read.getReadString().charAt(offset);
+            byte qual = read.getBaseQualities()[offset];
+            if ( ! ignoreBadBases || ! badBase(base) ) {
+                n += add(base, qual, read, offset);
+            }
+        }
+
+        return n;
+    }
+
+    public int add(ReadBackedPileup pileup) {
+        return add(pileup, false);
+    }
+
+
+    private int reallyAdd(char observedBase, byte qualityScore, SAMRecord read, int offset, boolean enableCache) {
+        if ( badBase(observedBase) ) {
+            throw new RuntimeException(String.format("BUG: unexpected base %c with Q%d passed to GenotypeLikelihoods", observedBase, qualityScore));
+        }
+
+        int nBasesAdded = 0; // the result -- how many bases did we add?
+
+        if ( qualityScore > getMinQScoreToInclude() ) {
+            // Handle caching if requested.  Just look up the cached result if its available, or compute and store it
+            GenotypeLikelihoods cached = null;
+            if ( enableCache ) {
+                if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read, offset ) ) {
+                    cached = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset );
+                } else {
+                    cached = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset );
+                }
+            }
+
+            for ( DiploidGenotype g : DiploidGenotype.values() ) {
+                double likelihoodCalc = ! enableCache ? log10PofObservingBaseGivenGenotype(observedBase, g, qualityScore, read, offset) : 0.0;
+                double likelihoodCached = enableCache ? cached.likelihoods[g.ordinal()] : 0.0;
+                double likelihood = enableCache ? likelihoodCached : likelihoodCalc;
+
+                //if ( enableCache && likelihoodCalc != 0.0 && MathUtils.compareDoubles(likelihoodCached, likelihoodCalc) != 0 ) {
+                //    System.out.printf("ERROR: Likelihoods not equal is CACHE=%f != calc=%f for %c %d %s%n",
+                //            likelihoodCached, likelihoodCalc, observedBase, qualityScore, g.toString());
+                //}
+
+                if ( isVerbose() ) {
+                    boolean fwdStrand = ! read.getReadNegativeStrandFlag();
+                    System.out.printf("  L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
+                            observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
+                }
+
+                likelihoods[g.ordinal()] += likelihood;
+                posteriors[g.ordinal()] += likelihood;
+            }
+
+            if ( isVerbose() ) {
+                for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
+                System.out.println();
+                for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", likelihoods[g.ordinal()]); }
+                System.out.println();
+            }
+
+            nBasesAdded = 1;
+        }
+
+        return nBasesAdded;
+    }
+
+    // -----------------------------------------------------------------------------------------------------------------
+    //
+    //
+    // caching routines
+    //
+    //
+    // -----------------------------------------------------------------------------------------------------------------
+    final static GenotypeLikelihoods[][][][][] CACHE =
+            new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2];
+    static int cacheSize = 0;
+
+    private GenotypeLikelihoods getSetCache( char observedBase, byte qualityScore, int ploidy,
+                                                       SAMRecord read, int offset, GenotypeLikelihoods val ) {
+
+        int a = EmpiricalSubstitutionGenotypeLikelihoods.getReadSequencerPlatform(read).ordinal();
+        int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
+        int j = qualityScore;
+        int k = ploidy;
+        int x = strandIndex(! read.getReadNegativeStrandFlag());
+
+        if ( val != null )
+            CACHE[a][i][j][k][x] = val;
+
+        return CACHE[a][i][j][k][x];
+    }
+
+    private GenotypeLikelihoods getCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
+        return getSetCache( observedBase, qualityScore, ploidy, read, offset, null );
+    }
+
+    private void setCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset, GenotypeLikelihoods val ) {
+        getSetCache( observedBase, qualityScore, ploidy, read, offset, val );
+    }
+
+    private int strandIndex(boolean fwdStrand) {
+        return fwdStrand ? 0 : 1;
+    }
+
+    private boolean inCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
+        return getCache(observedBase, qualityScore, ploidy, read, offset) != null;
+    }
+
+    private GenotypeLikelihoods getCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
+        if ( ! inCache(observedBase, qualityScore, ploidy, read, offset ) )
+            throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
+                    observedBase, qualityScore, ploidy, read));
+
+        return getCache(observedBase, qualityScore, ploidy, read, offset);
+    }
+
+    private GenotypeLikelihoods calculateCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
+        if ( inCache(observedBase, qualityScore, ploidy, read, offset ) )
+            throw new RuntimeException(String.format("BUG: trying to set an already set cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
+                    observedBase, qualityScore, ploidy, read));
+
+        // create a new genotype likelihoods object and add this single base to it -- now we have a cached result
+        try {
+            GenotypeLikelihoods g = (GenotypeLikelihoods)this.clone();
+            g.initialize();
+            g.reallyAdd(observedBase, qualityScore, read, offset, false);
+
+            setCache(observedBase, qualityScore, ploidy, read, offset, g);
+            cacheSize++;
+
+            //System.out.printf("Caching %c %d %d %s %s (%d total entries)%n", observedBase, qualityScore, ploidy, read.getReadName(), EmpiricalSubstitutionGenotypeLikelihoods.getReadSequencerPlatform(read), cacheSize);
+            return g;
+        } catch ( CloneNotSupportedException e ) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    // -----------------------------------------------------------------------------------------------------------------
+    //
+    //
+    // helper routines
+    //
+    //
+    // -----------------------------------------------------------------------------------------------------------------
+
+    /**
+     * Returns true when the observedBase is considered bad and shouldn't be processed by this object.  A base
+     * is considered bad if:
+     *
+     *   Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
+     *
+     * @param observedBase
+     * @return true if the base is a bad base
+     */
+    private boolean badBase(char observedBase) {
+        return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
+    }
+
+    /**
+     * Return a string representation of this object in a moderately usable form
+     *
+     * @return
+     */
+    public String toString() {
+        double sum = 0;
+        StringBuilder s = new StringBuilder();
+        for (DiploidGenotype g : DiploidGenotype.values()) {
+            s.append(String.format("%s %.10f ", g, likelihoods[g.ordinal()]));
+			sum += Math.pow(10,likelihoods[g.ordinal()]);
+        }
+		s.append(String.format(" %f", sum));
+        return s.toString();
+    }
+
+    // -----------------------------------------------------------------------------------------------------------------
+    //
+    //
+    // Validation routines
+    //
+    //
+    // -----------------------------------------------------------------------------------------------------------------
+
+    public boolean validate() {
+        return validate(true);
+    }
+
+    public boolean validate(boolean throwException) {
+        try {
+            priors.validate(throwException);
+
+            for ( DiploidGenotype g : DiploidGenotype.values() ) {
+                String bad = null;
+
+                int i = g.ordinal();
+                if ( ! MathUtils.wellFormedDouble(likelihoods[i]) || ! MathUtils.isNegativeOrZero(likelihoods[i]) ) {
+                    bad = String.format("Likelihood %f is badly formed", likelihoods[i]);
+                } else if ( ! MathUtils.wellFormedDouble(posteriors[i]) || ! MathUtils.isNegativeOrZero(posteriors[i]) ) {
+                    bad = String.format("Posterior %f is badly formed", posteriors[i]);
+                }
+
+                if ( bad != null ) {
+                    throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
+                }
+            }
+        } catch ( IllegalStateException e ) {
+            if ( throwException )
+                throw new RuntimeException(e);
+            else
+                return false;
+        }
+
+        return true;
+    }
+
+    // -----------------------------------------------------------------------------------------------------------------
+    //
+    //
+    // Hearty math calculations follow
+    //
+    //    -- these should not be messed with unless you know what you are doing
+    //
+    // -----------------------------------------------------------------------------------------------------------------
+
+    /**
+     * Works for any ploidy (in organization).
+     *
+     * @param observedBase
+     * @param g
+     * @param qual
+     * @return
+     */
+    protected double log10PofObservingBaseGivenGenotype(char observedBase, DiploidGenotype g, byte qual, SAMRecord read, int offset ) {
+        if (qual == 0) { // zero quals are wrong
+            throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %s %d at %d in %s",
+                    observedBase, g, qual, offset, read));
+        }
+
+        // todo assumes ploidy is 2 -- should be generalized.  Obviously the below code can be turned into a loop
+        double p_base = 0.0;
+        p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base1, qual, FIXED_PLOIDY, read, offset ));
+        p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base2, qual, FIXED_PLOIDY, read, offset ));
+        return log10(p_base);
+    }
+
+    protected double log10PofObservingBaseGivenChromosome(char observedBase, char chromBase, byte qual, int ploidy, SAMRecord read, int offset) {
+        double logP = 0.0;
+
+        if ( observedBase == chromBase ) {
+            // the base is consistent with the chromosome -- it's 1 - e
+            //logP = oneMinusData[qual];
+            double e = pow(10, (qual / -10.0));
+            logP = log10(1.0 - e);
+        } else {
+            // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
+            logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset);
+        }
+
+        // adjust for ploidy.  We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
+        //logP -= log10N[ploidy];
+        logP -= log10(ploidy);
+
+        //System.out.printf("%c %c %d %d => %f%n", observedBase, chromBase, qual, ploidy, logP);
+        return logP;
+    }
+
+    /**
+     * Must be overridden by concrete subclasses
+     * 
+     * @param observedBase
+     * @param chromBase
+     * @param read
+     * @param offset
+     * @return
+     */
+    protected abstract double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset);
+
+    //
+    // Constant static data
+    //
+    private final static double[] zeros = new double[DiploidGenotype.values().length];
+
+    static {
+        for ( DiploidGenotype g : DiploidGenotype.values() ) {
+            zeros[g.ordinal()] = 0.0;
+        }
+    }
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsFactory.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsFactory.java
@ -0,0 +1,34 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import static org.broadinstitute.sting.gatk.walkers.genotyper.BaseMismatchModel.*;
+
+public class GenotypeLikelihoodsFactory {
+    //private GenotypeLikelihoodsFactory() {} // cannot be instantiated
+
+    public static BaseMismatchModel getBaseMismatchModel(final String name) {
+        BaseMismatchModel m = valueOf(name);
+        if ( m == null )
+            throw new RuntimeException("Unexpected BaseMismatchModel " + name);
+        else
+            return m;
+    }
+
+    /**
+     * General and correct way to create GenotypeLikelihood objects for arbitrary base mismatching models
+     *
+     * @param m
+     * @param priors
+     * @param pl
+     * @return
+     */
+    public static GenotypeLikelihoods makeGenotypeLikelihoods(BaseMismatchModel m,
+                                                              DiploidGenotypePriors priors,
+                                                              EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform pl ) {
+        switch ( m ) {
+            case ONE_STATE: return new OneStateErrorGenotypeLikelihoods(priors);
+            case THREE_STATE: return new ThreeStateErrorGenotypeLikelihoods(priors);
+            case EMPIRICAL: return new EmpiricalSubstitutionGenotypeLikelihoods(priors, pl);
+            default: throw new RuntimeException("Unexpected BaseMismatchModel " + m);
+        }
+    }
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OldAndBustedGenotypeLikelihoods.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OldAndBustedGenotypeLikelihoods.java
@ -4,7 +4,6 @@ import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.utils.ReadBackedPileup;
 import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.genotype.BasicGenotype;
 import org.broadinstitute.sting.utils.genotype.Genotype;
 import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore;
@ -15,12 +14,13 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;

-public class OldAndBustedGenotypeLikelihoods extends GenotypeLikelihoods {
+public class OldAndBustedGenotypeLikelihoods {
    protected static final double[] oneMinusData = new double[Byte.MAX_VALUE];
    protected static final double[] oneHalfMinusDataArachne = new double[Byte.MAX_VALUE];
    protected static final double[] oneHalfMinusData3Base = new double[Byte.MAX_VALUE];
    //protected static final double[] oneHalfMinusData = new double[Byte.MAX_VALUE];
    protected static final double log10Of1_3 = log10(1.0 / 3.0);
+    private boolean filterQ0Bases = true;

    static {
        for (int qual = 0; qual < Byte.MAX_VALUE; qual++) {
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OneStateErrorGenotypeLikelihoods.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OneStateErrorGenotypeLikelihoods.java
@ -0,0 +1,43 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.utils.BaseUtils;
+
+import static java.lang.Math.log10;
+import java.util.TreeMap;
+import java.util.EnumMap;
+
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMReadGroupRecord;
+
+/**
+ * This implements the old style CRD calculation of the chance that a base being a true chromBase given
+ * an miscalled base, in which the p is e, grabbing all of the probability.  It shouldn't be used
+ */
+public class OneStateErrorGenotypeLikelihoods extends GenotypeLikelihoods {
+    //
+    // forwarding constructors -- don't do anything at all
+    //
+    public OneStateErrorGenotypeLikelihoods() { super(); }
+    public OneStateErrorGenotypeLikelihoods(DiploidGenotypePriors priors) { super(priors); }
+
+    /**
+     * Cloning of the object
+     * @return
+     * @throws CloneNotSupportedException
+     */
+    protected Object clone() throws CloneNotSupportedException {
+        return super.clone();
+    }
+
+    /**
+     *
+     * @param observedBase
+     * @param chromBase
+     * @param read
+     * @param offset
+     * @return
+     */
+    protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset) {
+        return 0; // equivalent to e model
+    }
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java
@ -40,8 +40,8 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
    @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
    public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY;

-    @Argument(fullName = "useEmpiricalTransitions", shortName = "useEmpiricalTransitions", doc = "EXPERIMENTAL", required = false)
-    public boolean useEmpiricalTransitions = false;
+    @Argument(fullName = "baseModel", shortName = "m", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
+    public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;

    @Argument(fullName = "verbose", shortName = "v", doc = "EXPERIMENTAL", required = false)
    public boolean VERBOSE = false;
@ -79,25 +79,21 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
        DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, heterozygosity, DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE);

        // setup GenotypeLike object
-        NewHotnessGenotypeLikelihoods G;
-        if ( useEmpiricalTransitions )
-            G = new EmpiricalSubstitutionGenotypeLikelihoods(priors, defaultPlatform);
-        else
-            G = new NewHotnessGenotypeLikelihoods(priors);
+        GenotypeLikelihoods gl = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);

-        G.setVerbose(VERBOSE);
+        gl.setVerbose(VERBOSE);
        ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
-        G.add(pileup, true);
-        G.validate();
+        gl.add(pileup, true);
+        gl.validate();

        // lets setup the locus
        List<Genotype> lst = new ArrayList<Genotype>();
        for ( DiploidGenotype g : DiploidGenotype.values() ) {
-            lst.add(new BasicGenotype(pileup.getLocation(), g.toString(),new BayesianConfidenceScore(G.getLikelihood(g))));
+            lst.add(new BasicGenotype(pileup.getLocation(), g.toString(),new BayesianConfidenceScore(gl.getLikelihood(g))));
        }

-        //System.out.printf("At %s%n", new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, G.getPosteriors(), pileup));
-        return new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, G.getPosteriors(), pileup);
+        //System.out.printf("At %s%n", new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, g.getPosteriors(), pileup));
+        return new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, gl.getPosteriors(), pileup);
    }

    /**
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ThreeStateErrorGenotypeLikelihoods.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ThreeStateErrorGenotypeLikelihoods.java
@ -0,0 +1,36 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.utils.BaseUtils;
+
+import static java.lang.Math.log10;
+import java.util.TreeMap;
+import java.util.EnumMap;
+
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMReadGroupRecord;
+
+public class ThreeStateErrorGenotypeLikelihoods extends GenotypeLikelihoods {
+    //
+    // forwarding constructors -- don't do anything at all
+    //
+    public ThreeStateErrorGenotypeLikelihoods() { super(); }
+    public ThreeStateErrorGenotypeLikelihoods(DiploidGenotypePriors priors) { super(priors); }
+
+    /**
+     * Cloning of the object
+     * @return
+     * @throws CloneNotSupportedException
+     */
+    protected Object clone() throws CloneNotSupportedException {
+        return super.clone();
+    }
+
+    /**
+     * Simple log10(3) cached value
+     */
+    protected static final double log103 = log10(3.0);
+
+    protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset) {
+        return -log103; // equivalent to e / 3 model
+    }
+}
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java
@ -189,11 +189,11 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
            }

            //Calculate posterior probabilities!
-            NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods();
+            GenotypeLikelihoods gl = new ThreeStateErrorGenotypeLikelihoods();

-            //I've tried simply adding the entire pileup to G, but quality scores are not checked, and 'N' bases throw an error
-            //(NewHotnessGenotypeLikelihoods potentially has bug on line 57)
-            //G.add(pileup);
+            //I've tried simply adding the entire pileup to g, but quality scores are not checked, and 'N' bases throw an error
+            //(GenotypeLikelihoods potentially has bug on line 57)
+            //g.add(pileup);

            //Check for bad bases and ensure mapping quality myself. This works.
            for (int i = 0; i < context.getReads().size(); i++) {
@ -206,20 +206,20 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
                    if (base == 'A'){numAs++;}
                    if (base == 'C'){numCs++;}
                    if (base == 'T'){numTs++;}
-                    if (base == 'G'){numGs++;}
+                    if (base == 'g'){numGs++;}
                    //consider base in likelihood calculations if it looks good and has high mapping score
-                    G.add(base, qual, read, offset);
+                    gl.add(base, qual, read, offset);
                }
            }

            //Debugging purposes
-            if (DEBUG) {out.printf("A[%s]\tC[%s]\tT[%s]\tG[%s]\t",numAs,numCs,numTs,numGs);}
+            if (DEBUG) {out.printf("A[%s]\tC[%s]\tT[%s]\tg[%s]\t",numAs,numCs,numTs,numGs);}


            //Store confidence scores - this is a local hash that we use to get likelihood given a particular genotype
            Scores = new Hashtable();
            for ( DiploidGenotype g : DiploidGenotype.values() ) {
-                Scores.put(g.toString(), G.getLikelihood(g));
+                Scores.put(g.toString(), gl.getLikelihood(g));
            }

            //Get likelihood score for homozygous ref: used to normalize likelihoood scores at 0.
--- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java
@ -1,15 +1,7 @@
 package org.broadinstitute.sting.gatk.walkers.genotyper;

-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.ReadBackedPileup;
-import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.BaseUtils;
-import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.BaseTest;
 import org.junit.Ignore;
-import edu.mit.broad.picard.util.MathUtil;
-
-import java.util.Arrays;

 /**
 * Stable, error checking version of the Bayesian genotyper.  Useful for calculating the likelihoods, priors,
@ -50,10 +42,10 @@ public class NewHotnessGenotypeLikelihoodsTest extends BaseTest {
    private double[] priors = null;
    private double[] posteriors = null;

-    NewHotnessGenotypeLikelihoods();
-    NewHotnessGenotypeLikelihoods(char ref, double heterozygosity)
-    NewHotnessGenotypeLikelihoods(char ref, double priorHomRef, double priorHet, double priorHomVar);
-    NewHotnessGenotypeLikelihoods(double[] log10Priors)
+    GenotypeLikelihoods();
+    GenotypeLikelihoods(char ref, double heterozygosity)
+    GenotypeLikelihoods(char ref, double priorHomRef, double priorHet, double priorHomVar);
+    GenotypeLikelihoods(double[] log10Priors)
    double[] getLikelihoods()
    double getLikelihood(DiploidGenotype g);
    double[] getPosteriors();