-Cleanup of the Joint Estimation code

-Don't print verbose/debugging output to the logger; instead, specify an output file in the argument collection (then we only need to print conditionally)


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1899 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-10-22 15:25:29 +00:00
parent 2cab4c68d4
commit 54c61c663c
5 changed files with 76 additions and 84 deletions

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.apache.log4j.Logger;
import java.io.*;
import java.util.*;
import net.sf.samtools.SAMRecord;
@ -36,7 +37,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
protected double MINIMUM_ALLELE_FREQUENCY;
protected int maxDeletionsInPileup;
protected String assumedSingleSample;
protected boolean VERBOSE;
protected PrintWriter verboseWriter;
/**
* Create a new GenotypeCalculationModel object
@ -66,36 +67,26 @@ public abstract class GenotypeCalculationModel implements Cloneable {
MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
maxDeletionsInPileup = UAC.MAX_DELETIONS;
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
VERBOSE = UAC.VERBOSE;
}
/**
* Cloning of the object
* @return clone
* @throws CloneNotSupportedException
*/
protected Object clone() throws CloneNotSupportedException {
GenotypeCalculationModel gcm = (GenotypeCalculationModel)super.clone();
gcm.samples = new HashSet<String>(samples);
gcm.logger = logger;
gcm.baseModel = baseModel;
gcm.heterozygosity = heterozygosity;
gcm.defaultPlatform = defaultPlatform;
gcm.GENOTYPE_MODE = GENOTYPE_MODE;
gcm.POOLED_INPUT = POOLED_INPUT;
gcm.LOD_THRESHOLD = LOD_THRESHOLD;
gcm.MINIMUM_ALLELE_FREQUENCY = MINIMUM_ALLELE_FREQUENCY;
gcm.maxDeletionsInPileup = maxDeletionsInPileup;
gcm.assumedSingleSample = assumedSingleSample;
gcm.VERBOSE = VERBOSE;
return gcm;
if ( UAC.VERBOSE != null ) {
try {
verboseWriter = new PrintWriter(UAC.VERBOSE);
} catch (FileNotFoundException e) {
throw new StingException("Could not open file " + UAC.VERBOSE + " for writing");
}
}
}
/**
 * Re-initializes this model with a new argument collection.
 * The verbose writer (if any) is closed first so the file handle from the
 * previous configuration is not leaked before initialize() reopens one.
 *
 * @param UAC the new set of unified argument collection values
 */
public void setUnifiedArgumentCollection(UnifiedArgumentCollection UAC) {
    // just close and re-initialize
    close();
    initialize(this.samples, this.logger, UAC);
}
/**
 * Closes the verbose-output writer if one was opened, flushing any
 * buffered output to disk.  Safe to call when no writer exists
 * (i.e. when no verbose file was specified in the arguments).
 */
public void close() {
    if ( verboseWriter != null )
        verboseWriter.close();
}
/**
* Must be overridden by concrete subclasses
* @param tracker rod data

View File

@ -17,7 +17,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
// because the Hardy-Weinberg values for a given frequency are constant,
// we cache the results to avoid having to recompute everything
private HashMap<Double, Pair<double[], double[]>> hardyWeinbergValueCache = new HashMap<Double, Pair<double[], double[]>>();
private HashMap<Double, double[]> hardyWeinbergValueCache = new HashMap<Double, double[]>();
// the allele frequency priors
private double[] log10AlleleFrequencyPriors;
@ -44,7 +44,11 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
if ( contexts == null )
return null;
calculate(ref, contexts, StratifiedContext.OVERALL);
initializeAlleleFrequencies(contexts.size());
// run joint estimation for the full GL contexts
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
calculateAlleleFrequencyPosteriors(ref, context.getLocation());
//double lod = overall.getPofD() - overall.getPofNull();
@ -66,15 +70,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
return null;
}
private void calculate(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
initializeAlleleFrequencies(contexts.size());
initializeGenotypeLikelihoods(ref, contexts, contextType);
calculateAlleleFrequencyPosteriors(ref);
}
private void initializeAlleleFrequencies(int numSamples) {
// calculate the number of estimation points to use:
@ -84,9 +79,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
// set up the allele frequency priors
log10AlleleFrequencyPriors = getNullAlleleFrequencyPriors(frequencyEstimationPoints);
for (int i = 0; i < frequencyEstimationPoints; i++)
logger.debug("Null allele frequency for MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + log10AlleleFrequencyPriors[i]);
}
private double[] getNullAlleleFrequencyPriors(int N) {
@ -124,20 +116,22 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
private void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
GLs.clear();
// use flat priors for GLs
DiploidGenotypePriors priors = new DiploidGenotypePriors();
for ( String sample : contexts.keySet() ) {
AlignmentContextBySample context = contexts.get(sample);
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
// create the GenotypeLikelihoods object
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform);
GL.setVerbose(VERBOSE);
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
GL.add(pileup, true);
GLs.put(sample, GL);
}
}
private void calculateAlleleFrequencyPosteriors(char ref) {
private void calculateAlleleFrequencyPosteriors(char ref, GenomeLoc verboseLocation) {
// initialization
for ( char altAllele : BaseUtils.BASES ) {
@ -151,14 +145,9 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
for (int i = 0; i < frequencyEstimationPoints; i++) {
double f = (double)i / (double)(frequencyEstimationPoints-1);
DiploidGenotypePriors hardyWeinberg = getHardyWeinbergValues(f, ref);
// for each sample
for ( GenotypeLikelihoods GL : GLs.values() ) {
// set the GL prior to be the AF priors and get the corresponding posteriors
//GL.setPriors(hardyWeinberg);
GL.setPriors(new DiploidGenotypePriors());
double[] posteriors = GL.getPosteriors();
// get the ref data
@ -180,7 +169,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
//logger.debug("Normalized posteriors for " + altAllele + ": " + allelePosteriors[0] + " " + allelePosteriors[1] + " " + allelePosteriors[2]);
// calculate the posterior weighted frequencies
double[] HWvalues = hardyWeinbergValueCache.get(f).first;
double[] HWvalues = getHardyWeinbergValues(f);
double PofDgivenAFi = 0.0;
PofDgivenAFi += HWvalues[GenotypeType.REF.ordinal()] * allelePosteriors[GenotypeType.REF.ordinal()];
PofDgivenAFi += HWvalues[GenotypeType.HET.ordinal()] * allelePosteriors[GenotypeType.HET.ordinal()];
@ -197,28 +186,25 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
for (int i = 0; i < frequencyEstimationPoints; i++)
logger.debug("P(D|AF=" + i + ") for alt allele " + altAllele + ": " + log10PofDgivenAFi[baseIndex][i]);
// multiply by null allele frequency priors to get AF posteriors, then normalize
for (int i = 0; i < frequencyEstimationPoints; i++)
alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i];
normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]);
for (int i = 0; i < frequencyEstimationPoints; i++)
logger.debug("Allele frequency posterior estimate for alt allele " + altAllele + " MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + alleleFrequencyPosteriors[baseIndex][i]);
// calculate p(f>0)
double sum = 0.0;
for (int i = 1; i < frequencyEstimationPoints; i++)
sum += alleleFrequencyPosteriors[baseIndex][i];
PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors
logger.info("P(f>0), LOD for alt allele " + altAllele + ": " + Math.log10(PofFs[baseIndex]) + ", " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
}
// print out stats if we have a position and a writer
if ( verboseLocation != null && verboseWriter != null )
printAlleleFrequencyData(ref, verboseLocation);
}
private DiploidGenotypePriors getHardyWeinbergValues(double f, char ref) {
Pair<double[], double[]> HWvalues = hardyWeinbergValueCache.get(f);
private double[] getHardyWeinbergValues(double f) {
double[] HWvalues = hardyWeinbergValueCache.get(f);
// if it hasn't been calculated yet, do so now
if ( HWvalues == null ) {
@ -236,26 +222,11 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
q -= MINIMUM_ALLELE_FREQUENCY;
}
double[] freqs = new double[] { Math.pow(p, 2), 2.0 * p * q, Math.pow(q, 2) };
double[] log10Freqs = new double[] { Math.log10(freqs[0]), Math.log10(freqs[1]), Math.log10(freqs[2]) };
HWvalues = new Pair<double[], double[]>(freqs, log10Freqs);
HWvalues = new double[] { Math.pow(p, 2), 2.0 * p * q, Math.pow(q, 2) };
hardyWeinbergValueCache.put(f, HWvalues);
}
// create the priors based on the HW frequencies
double[] alleleFreqPriors = new double[10];
for ( DiploidGenotype g : DiploidGenotype.values() ) {
if ( g.isHomRef(ref) )
alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.REF.ordinal()];
else if ( g.isHomVar(ref) )
alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HOM.ordinal()];
else if ( g.isHetRef(ref) )
alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HET.ordinal()];
else // g is het non-ref
alleleFreqPriors[g.ordinal()] = 0.0;
}
return new DiploidGenotypePriors(alleleFreqPriors);
return HWvalues;
}
private void normalizeFromLog10(double[] array) {
@ -281,4 +252,39 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
}
return max;
}
/**
 * Writes a tab-delimited dump of the allele-frequency estimation data for a
 * single locus to the verbose writer: a header row, one row per frequency
 * estimation point (null prior, then P(D|AF) and posterior per alt allele),
 * and finally the P(f>0) and LOD summary lines for each alt allele.
 *
 * NOTE(review): assumes verboseWriter is non-null — callers must check
 * before invoking (as calculateAlleleFrequencyPosteriors does).
 *
 * @param ref the reference base at this locus
 * @param loc the genomic location being reported
 */
private void printAlleleFrequencyData(char ref, GenomeLoc loc) {
    verboseWriter.println("Location=" + loc + ", ref=" + ref);

    // header row: MAF, the null AF priors, then a P(D|AF) and posterior
    // column pair for every non-reference allele
    StringBuilder headerLine = new StringBuilder("MAF\tNullAFpriors\t");
    for ( char alt : BaseUtils.BASES ) {
        if ( alt == ref )
            continue;
        char lower = Character.toLowerCase(alt);
        headerLine.append("P(D|AF)_").append(lower).append("\t");
        headerLine.append("PosteriorAF_").append(lower).append("\t");
    }
    verboseWriter.println(headerLine);

    // one data row per frequency estimation point
    for (int i = 0; i < frequencyEstimationPoints; i++) {
        StringBuilder row = new StringBuilder();
        row.append(i).append("/").append(frequencyEstimationPoints - 1).append("\t");
        row.append(String.format("%.8f", log10AlleleFrequencyPriors[i])).append("\t");
        for ( char alt : BaseUtils.BASES ) {
            if ( alt == ref )
                continue;
            int idx = BaseUtils.simpleBaseToBaseIndex(alt);
            row.append(String.format("%.8f\t%.8f\t", log10PofDgivenAFi[idx][i], alleleFrequencyPosteriors[idx][i]));
        }
        verboseWriter.println(row);
    }

    // summary: log10 P(f>0) and the LOD vs. the f=0 posterior per alt allele
    for ( char alt : BaseUtils.BASES ) {
        if ( alt == ref )
            continue;
        char lower = Character.toLowerCase(alt);
        int idx = BaseUtils.simpleBaseToBaseIndex(alt);
        double log10PofF = Math.log10(PofFs[idx]);
        verboseWriter.println("P(f>0)_" + lower + " = " + log10PofF);
        verboseWriter.println("LOD_" + lower + " = " + (log10PofF - Math.log10(alleleFrequencyPosteriors[idx][0])));
    }
    verboseWriter.println();
}
}

View File

@ -60,7 +60,6 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
// create the GenotypeLikelihoods object
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
GL.setVerbose(VERBOSE);
GL.add(pileup, true);
return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL);
}
@ -85,7 +84,6 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
// create the GenotypeLikelihoods object
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, AFPriors, defaultPlatform);
GL.setVerbose(VERBOSE);
GL.add(pileup, true);
GLs.put(sample, GL);

View File

@ -51,8 +51,8 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
public boolean GENOTYPE = false;
@Argument(fullName = "verbose", shortName = "v", doc = "EXPERIMENTAL", required = false)
public boolean VERBOSE = false;
@Argument(fullName = "verboseMode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
public String VERBOSE = null;
// control the error modes
@ -75,8 +75,4 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "min_allele_frequency", shortName = "minFreq", doc = "The minimum possible allele frequency in a population (advanced)", required = false)
public double MINIMUM_ALLELE_FREQUENCY = 1e-8;
//@Argument(fullName = "disableCache", doc = "[ADVANCED] If true, we won't use the caching system. This argument is for testing purposes only", required = false)
//public boolean disableCache = false;
}

View File

@ -254,10 +254,11 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
return sum + 1;
}
/** Close the variant file. */
// Close any file writers
public void onTraversalDone(Integer sum) {
logger.info("Processed " + sum + " loci that are callable for SNPs");
writer.close();
gcm.close();
logger.info("Processed " + sum + " loci that are callable for SNPs");
}
/**