-Refactoring: make GenotypeCalculationModel constructors empty so that they don't have to be updated every time we add a new parameter; instead put that logic in the super class's initialize method (making everything protected so that only the factory can access them)

-Adding initial version of Multi-sample calculation model. This still needs much work: it needs to be cleaned up and finished. Right now, it (purposely) throws a RuntimeException after completing the EM loop. Also: -Fix logic in GenotypeLikelihoods.setPriors -Add logger to the models for output git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1764 348d0f76-0448-11de-a6fe-93d51630548a
2009-10-05 18:10:36 +00:00 · 2009-10-05 18:10:36 +00:00 · fb619bd593
parent 98076db6b4
commit fb619bd593
7 changed files with 309 additions and 56 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
+import org.apache.log4j.Logger;

 import java.util.*;

@ -22,35 +23,45 @@ public abstract class GenotypeCalculationModel implements Cloneable {
    protected Set<String> samples;
    protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
    protected GenotypeWriter out;
+    protected Logger logger;
    protected boolean GENOTYPE_MODE;
    protected double LOD_THRESHOLD;
    protected int maxDeletionsInPileup;
    protected boolean VERBOSE;

    /**
-     * Create a new GenotypeCalculationModel object with given debugging level
+     * Create a new GenotypeCalculationModel object
+     */
+    protected GenotypeCalculationModel() {}
+
+
+    /**
+     * Initialize the GenotypeCalculationModel object
     * Assumes that out is not null
     * @param baseModel     base model to use
     * @param samples       samples in input bam
     * @param platform      default platform
     * @param out           output writer
+     * @param out           logger
     * @param genotypeMode  genotyping
     * @param lod           lod threshold
     * @param maxDeletions  max deletions to tolerate
     * @param verbose       verbose flag
     */
-    public GenotypeCalculationModel(BaseMismatchModel baseModel,
-                                    Set<String> samples,
-                                    EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform,
-                                    GenotypeWriter out,
-                                    boolean genotypeMode,
-                                    double lod,
-                                    int maxDeletions,
-                                    boolean verbose) {
+    protected void initialize(BaseMismatchModel baseModel,
+                           Set<String> samples,
+                           EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform,
+                           GenotypeWriter out,
+                           Logger logger,
+                           boolean genotypeMode,
+                           double lod,
+                           int maxDeletions,
+                           boolean verbose) {
        this.baseModel = baseModel;
        this.samples = samples;
        defaultPlatform = platform;
        this.out = out;
+        this.logger = logger;
        GENOTYPE_MODE = genotypeMode;
        LOD_THRESHOLD = lod;
        maxDeletionsInPileup = maxDeletions;
@ -68,6 +79,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
        gcm.samples = new HashSet<String>(samples);
        gcm.defaultPlatform = defaultPlatform;
        gcm.out = out;
+        gcm.logger = logger;
        gcm.GENOTYPE_MODE = GENOTYPE_MODE;
        gcm.LOD_THRESHOLD = LOD_THRESHOLD;
        gcm.maxDeletionsInPileup = maxDeletionsInPileup;
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;

 import static org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model.*;
 import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
+import org.apache.log4j.Logger;

 import java.util.Set;

@ -47,17 +48,29 @@ public class GenotypeCalculationModelFactory {
     *
     * @param UAC           The unified argument  collection
     * @param samples       samples in bam
-     * @param out           output
+     * @param out           output writer
+     * @param logger        logger
     * @return model
     */
    public static GenotypeCalculationModel makeGenotypeCalculation(UnifiedArgumentCollection UAC,
                                                                   Set<String> samples,
-                                                                   GenotypeWriter out) {
+                                                                   GenotypeWriter out,
+                                                                   Logger logger) {
+        GenotypeCalculationModel gcm;
        switch ( UAC.genotypeModel ) {
-            case SINGLE_SAMPLE: return new SingleSampleGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE);
-            //case MULTI_SAMPLE_EM: return new MultiSampleEMGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE);
-            case MULTI_SAMPLE_ALL_MAFS: return new MultiSampleAllMAFsGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE);
+            case SINGLE_SAMPLE:
+                gcm = new SingleSampleGenotypeCalculationModel();
+                break;
+            case MULTI_SAMPLE_EM:
+                gcm = new MultiSampleEMGenotypeCalculationModel();
+                break;
+            case MULTI_SAMPLE_ALL_MAFS:
+                gcm = new MultiSampleAllMAFsGenotypeCalculationModel();
+                break;
            default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
        }
+
+        gcm.initialize(UAC.baseModel, samples, UAC.defaultPlatform, out, logger, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE);
+        return gcm;
    }
 }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java
@ -192,6 +192,7 @@ public abstract class GenotypeLikelihoods implements Cloneable {
     */
    public void setPriors(DiploidGenotypePriors priors) {
        this.priors = priors;
+        posteriors = zeros.clone();
        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            int i = g.ordinal();
            posteriors[i] = priors.getPriors()[i] + likelihoods[i];
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleAllMAFsGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleAllMAFsGenotypeCalculationModel.java
@ -3,31 +3,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
 import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.apache.log4j.Logger;

 import java.util.Set;

 public class MultiSampleAllMAFsGenotypeCalculationModel extends GenotypeCalculationModel {

-    public MultiSampleAllMAFsGenotypeCalculationModel(BaseMismatchModel baseModel,
-                                                      Set<String> samples,
-                                                      EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform,
-                                                      GenotypeWriter out,
-                                                      boolean genotypeMode,
-                                                      double lod,
-                                                      int maxDeletions,
-                                                      boolean verbose) {
-        super(baseModel, samples, platform, out, genotypeMode, lod, maxDeletions, verbose);
-    }
-
-
-    /**
-     * Cloning of the object
-     * @return clone
-     * @throws CloneNotSupportedException
-     */
-    protected Object clone() throws CloneNotSupportedException {
-        return super.clone();
-    }
+    protected MultiSampleAllMAFsGenotypeCalculationModel() {}

    public boolean calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {

--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleEMGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleEMGenotypeCalculationModel.java
@ -0,0 +1,252 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
+import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+
+import java.util.*;
+
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMReadGroupRecord;
+
+public class MultiSampleEMGenotypeCalculationModel extends GenotypeCalculationModel {
+
+    // We need to set a limit on the EM iterations in case something flukey goes on
+    private static final int MAX_EM_ITERATIONS = 6;
+
+    // We consider the EM stable when the MAF doesn't change more than 1/10N
+    private static final double EM_STABILITY_METRIC = 0.1;
+
+
+    protected MultiSampleEMGenotypeCalculationModel() {}
+
+    public boolean calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
+
+        // keep track of the GenotpeLikelihoods for each sample, separated by strand
+        HashMap<String, PerSampleGenotypeLikelihoods> GLs = new HashMap<String, PerSampleGenotypeLikelihoods>();
+
+        // keep track of the total counts of each base in the pileup
+        int[] baseCounts = new int[4];
+        int totalObservedBases = 0;
+
+        // First order of business: create the initial GenotypeLikelihoods objects.
+        // Also, confirm that there aren't too many deletions in the pileup.
+        int deletionsInPile = 0;
+        List<SAMRecord> reads = context.getReads();
+        List<Integer> offsets = context.getOffsets();
+
+        for (int i = 0; i < reads.size(); i++) {
+
+            // set up the base data
+            SAMRecord read = reads.get(i);
+            int offset = offsets.get(i);
+            char base = read.getReadString().charAt(offset);
+
+            // don't use bad bases
+            int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
+            if ( baseIndex == -1 ) {
+                // are there too many deletions in the pileup?
+                if ( ++deletionsInPile > maxDeletionsInPileup )
+                    return false;
+                continue;
+            }
+
+            // create the GL holder object if this is the first time we're seeing a base for this sample
+            String readGroup = read.getAttribute("RG").toString(); // can't be null because those are filtered out
+            String sample = read.getHeader().getReadGroup(readGroup).getSample();
+            PerSampleGenotypeLikelihoods myGLs = GLs.get(sample);
+            if ( myGLs == null ) {
+                myGLs = new PerSampleGenotypeLikelihoods();
+                GLs.put(sample, myGLs);
+            }
+
+            // assign the base to the appropriate strand
+            myGLs.addObservedBase(base, read, offset);
+
+            // update the base counts
+            baseCounts[baseIndex]++;
+            totalObservedBases++;
+        }
+
+        // optimization: if all bases are ref, return
+        if ( baseCounts[BaseUtils.simpleBaseToBaseIndex(ref)] == totalObservedBases )
+            return true;
+
+        // TODO -- Do we quit if there isn't X% of total samples represented in pileup?
+
+        // Next, we need to create initial allele frequencies.
+        // An intelligent guess would be the observed base ratios (plus some small number to account for sampling issues).
+
+        // TODO --- This needs to be broken out into a separate model as we might want to split up the calculation for each minor allele
+
+        double[] alleleFrequencies = new double[4];
+        for (int i = 0; i < 4; i++)
+            alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases;
+        DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies);
+
+        for ( PerSampleGenotypeLikelihoods SGL : GLs.values() )
+            SGL.createGLs(AFPriors);
+
+        // debugging output
+        for (int i = 0; i < 4; i++)
+            logger.debug("Base count and initial allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i] + ", " + alleleFrequencies[i]);
+
+        
+        // The EM loop:
+        //   we want to continue until the calculation is stable, but we need some max on the number of iterations
+        int iterations = 0;
+        // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N
+        double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)GLs.size());
+        boolean isStable = false;
+
+        while ( !isStable && iterations < MAX_EM_ITERATIONS ) {
+
+            // calculate the posterior-weighted allele frequencies and modify the priors accordingly
+            double[] newAlleleFrequencies = getPosteriorWeightedFrequencies(GLs.values());
+            AFPriors = calculateAlleleFreqBasedPriors(newAlleleFrequencies);
+            for ( PerSampleGenotypeLikelihoods SGL : GLs.values() )
+                SGL.setPriors(AFPriors);
+
+            // determine whether we're stable
+            double AF_delta = 0.0;
+            for (int i = 0; i < 4; i++) {
+                AF_delta += Math.abs(alleleFrequencies[i] - newAlleleFrequencies[i]);
+                logger.debug("Previous allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i] + ", vs. new frequency: " + newAlleleFrequencies[i]);
+            }
+
+            isStable = AF_delta < EM_STABILITY_VALUE;
+            iterations++;
+
+            alleleFrequencies = newAlleleFrequencies;
+        }
+
+        logger.debug("EM loop took " + iterations + " iterations");
+
+        if (true)
+            throw new RuntimeException("DEBUGGING");
+
+/*
+        ClassicGenotypeLikelihoods[] G = em_result.genotype_likelihoods;
+        pD = Compute_pD(G, em_result.sample_weights);
+        pNull = Compute_pNull(contexts, em_result.sample_weights);
+
+            // Apply p(f).
+            double pVar = 0.0;
+            for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; }
+
+            double p0 = Math.log10(1 - pVar);
+            double pF;
+
+            double MAF = Compute_alt_freq(ref, em_result.allele_likelihoods);
+
+            if (MAF < 1/(2.0*em_result.EM_N)) { pF = p0; }
+            else { pF = Math.log10(THETA/(2.0*em_result.EM_N * MAF)); }
+
+            pD = pD + pF;
+            pNull = pNull + p0;
+
+        lod = pD - pNull;
+        return lod;
+
+*/
+        
+        // apply prior of theta / MAF
+        // It turns out that the appropriate prior is theta, for any sample size from 1 to ~1,000
+        //return new SSGenotypeCall(context.getLocation(), ref,gl, pileup);
+
+        return true;
+    }
+
+    double[] getPosteriorWeightedFrequencies(Collection<PerSampleGenotypeLikelihoods> SGLs) {
+        double[] newAlleleLikelihoods = new double[4];
+
+        for ( PerSampleGenotypeLikelihoods SGL : SGLs ) {
+
+            // calculate the posterior weighted frequencies for this sample
+            double[] personalAllelePosteriors = new double[4];
+            double sum = 0;
+            for ( DiploidGenotype g : DiploidGenotype.values() ) {
+                double posterior = Math.pow(10, SGL.totalGL.getPosterior(g));
+                personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base1)] += posterior;
+                personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base2)] += posterior;
+                sum += 2.0 * posterior;
+            }
+
+            // normalize
+            for (int i = 0; i < 4; i++)
+                personalAllelePosteriors[i] /= sum;
+            for (int i = 0; i < 4; i++)
+                newAlleleLikelihoods[i] += personalAllelePosteriors[i];
+        }
+
+        // normalize
+        double sum = 0;
+        for (int i = 0; i < 4; i++)
+            sum += newAlleleLikelihoods[i];
+        for (int i = 0; i < 4; i++)
+            newAlleleLikelihoods[i] /= sum;
+
+        return newAlleleLikelihoods;
+    }
+
+    private DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) {
+        // convert to log-space
+        double[] log10Freqs = new double[4];
+        for (int i = 0; i < 4; i++)
+            log10Freqs[i] = Math.log10(alleleFrequencies[i]);
+
+        double[] alleleFreqPriors = new double[10];
+
+        // this is the Hardy-Weinberg based allele frequency (p^2, q^2, 2pq)
+        for ( DiploidGenotype g : DiploidGenotype.values() ) {
+            alleleFreqPriors[g.ordinal()] = log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base1)] + log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base2)];
+            // add a factor of 2 for the 2pq case
+            if ( g.isHet() )
+                alleleFreqPriors[g.ordinal()] += Math.log10(2);
+        }
+
+        return new DiploidGenotypePriors(alleleFreqPriors);
+    }
+
+    private class PerSampleGenotypeLikelihoods {
+
+        GenotypeLikelihoods forwardStrandGL;
+        GenotypeLikelihoods reverseStrandGL;
+        GenotypeLikelihoods totalGL;
+
+        public PerSampleGenotypeLikelihoods() {
+            forwardStrandGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform);
+            forwardStrandGL.setVerbose(VERBOSE);
+            reverseStrandGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform);
+            reverseStrandGL.setVerbose(VERBOSE);
+        }
+
+        public void addObservedBase(char base, SAMRecord read, int offset) {
+            if ( !read.getReadNegativeStrandFlag() )
+                forwardStrandGL.add(base, read.getBaseQualities()[offset], read, offset);
+            else
+                reverseStrandGL.add(base, read.getBaseQualities()[offset], read, offset);
+        }
+
+        public void createGLs(DiploidGenotypePriors priors) {
+            setPriors(priors);
+
+            // GL_i = GL+_i + GL-_i
+            totalGL = GenotypeLikelihoods.combineLikelihoods(forwardStrandGL, reverseStrandGL);
+            totalGL.setVerbose(VERBOSE);
+            if ( VERBOSE )
+                totalGL.validate();
+        }
+
+        public void setPriors(DiploidGenotypePriors priors) {
+            if ( forwardStrandGL != null )
+                forwardStrandGL.setPriors(priors);
+            if ( reverseStrandGL != null )
+                reverseStrandGL.setPriors(priors);
+            if ( totalGL != null )
+                totalGL.setPriors(priors);
+        }
+    }
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotypeCalculationModel.java
@ -4,38 +4,31 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.utils.*;
 import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
+import org.apache.log4j.Logger;

 import java.util.Set;


 public class SingleSampleGenotypeCalculationModel extends GenotypeCalculationModel {

-    private CallResult callsMetrics;
+    private CallResult callsMetrics = new CallResult();

-    public SingleSampleGenotypeCalculationModel(BaseMismatchModel baseModel,
-                                                Set<String> samples,
-                                                EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform,
-                                                GenotypeWriter out,
-                                                boolean genotypeMode,
-                                                double lod,
-                                                int maxDeletions,
-                                                boolean verbose) {
-        super(baseModel, samples, platform, out, genotypeMode, lod, maxDeletions, verbose);
+    protected SingleSampleGenotypeCalculationModel() {}
+
+    public void initialize(BaseMismatchModel baseModel,
+                           Set<String> samples,
+                           EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform,
+                           GenotypeWriter out,
+                           Logger logger,
+                           boolean genotypeMode,
+                           double lod,
+                           int maxDeletions,
+                           boolean verbose) {
+        super.initialize(baseModel, samples, platform, out, logger, genotypeMode, lod, maxDeletions, verbose);

        // check that this truly is a single-sample bam
        if ( samples.size() > 1 )
            throw new StingException("Single-sample genotype calculation model used on multi-sample bam... aborting");
-
-        callsMetrics = new CallResult();
-    }
-
-    /**
-     * Cloning of the object
-     * @return clone
-     * @throws CloneNotSupportedException
-     */
-    protected Object clone() throws CloneNotSupportedException {
-        return super.clone();
    }

    public boolean calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@ -95,7 +95,7 @@ public class UnifiedGenotyper extends LocusWalker<Integer, Integer> {
        else
            writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out);

-        gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(UAC, samples, writer);
+        gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(UAC, samples, writer, logger);
    }

    /**