From 54c61c663c3de17d06b31ba9a556b4437670b5f3 Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 22 Oct 2009 15:25:29 +0000 Subject: [PATCH] -Cleanup of the Joint Estimation code -Don't print verbose/debugging output to logger, but instead specify a file in the argument collection (and then we only need to print conditionally) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1899 348d0f76-0448-11de-a6fe-93d51630548a --- .../genotyper/GenotypeCalculationModel.java | 41 +++---- ...JointEstimateGenotypeCalculationModel.java | 104 +++++++++--------- ...PointEstimateGenotypeCalculationModel.java | 2 - .../genotyper/UnifiedArgumentCollection.java | 8 +- .../walkers/genotyper/UnifiedGenotyper.java | 5 +- 5 files changed, 76 insertions(+), 84 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java index 456ee1fb3..f62d181ce 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.apache.log4j.Logger; +import java.io.*; import java.util.*; import net.sf.samtools.SAMRecord; @@ -36,7 +37,7 @@ public abstract class GenotypeCalculationModel implements Cloneable { protected double MINIMUM_ALLELE_FREQUENCY; protected int maxDeletionsInPileup; protected String assumedSingleSample; - protected boolean VERBOSE; + protected PrintWriter verboseWriter; /** * Create a new GenotypeCalculationModel object @@ -66,36 +67,26 @@ public abstract class GenotypeCalculationModel implements Cloneable { MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY; maxDeletionsInPileup = UAC.MAX_DELETIONS; assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE; - VERBOSE = UAC.VERBOSE; - } - - /** - * Cloning of the object - * @return clone - * @throws CloneNotSupportedException - */ - protected Object clone() throws CloneNotSupportedException { - GenotypeCalculationModel gcm = (GenotypeCalculationModel)super.clone(); - gcm.samples = new HashSet(samples); - gcm.logger = logger; - gcm.baseModel = baseModel; - gcm.heterozygosity = heterozygosity; - gcm.defaultPlatform = defaultPlatform; - gcm.GENOTYPE_MODE = GENOTYPE_MODE; - gcm.POOLED_INPUT = POOLED_INPUT; - gcm.LOD_THRESHOLD = LOD_THRESHOLD; - gcm.MINIMUM_ALLELE_FREQUENCY = MINIMUM_ALLELE_FREQUENCY; - gcm.maxDeletionsInPileup = maxDeletionsInPileup; - gcm.assumedSingleSample = assumedSingleSample; - gcm.VERBOSE = VERBOSE; - return gcm; + if ( UAC.VERBOSE != null ) { + try { + verboseWriter = new PrintWriter(UAC.VERBOSE); + } catch (FileNotFoundException e) { + throw new StingException("Could not open file " + UAC.VERBOSE + " for writing"); + } + } } public void setUnifiedArgumentCollection(UnifiedArgumentCollection UAC) { - // just re-initialize + // just close and re-initialize + close(); initialize(this.samples, this.logger, UAC); } + public void close() { + if ( verboseWriter != null ) + verboseWriter.close(); + } + /** * Must be overridden by concrete subclasses * @param tracker rod data diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java index 57a94f5af..ec341ece2 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java @@ -17,7 +17,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo // because the Hardy-Weinberg values for a given frequency are constant, // we cache the results to avoid having to recompute everything - private HashMap> hardyWeinbergValueCache = new HashMap>(); + private HashMap hardyWeinbergValueCache = new HashMap(); // the allele frequency priors private double[] log10AlleleFrequencyPriors; @@ -44,7 +44,11 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo if ( contexts == null ) return null; - calculate(ref, contexts, StratifiedContext.OVERALL); + initializeAlleleFrequencies(contexts.size()); + + // run joint estimation for the full GL contexts + initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL); + calculateAlleleFrequencyPosteriors(ref, context.getLocation()); //double lod = overall.getPofD() - overall.getPofNull(); @@ -66,15 +70,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo return null; } - private void calculate(char ref, HashMap contexts, StratifiedContext contextType) { - - initializeAlleleFrequencies(contexts.size()); - - initializeGenotypeLikelihoods(ref, contexts, contextType); - - calculateAlleleFrequencyPosteriors(ref); - } - private void initializeAlleleFrequencies(int numSamples) { // calculate the number of estimation points to use: @@ -84,9 +79,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo // set up the allele frequency priors log10AlleleFrequencyPriors = getNullAlleleFrequencyPriors(frequencyEstimationPoints); - - for (int i = 0; i < frequencyEstimationPoints; i++) - logger.debug("Null allele frequency for MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + log10AlleleFrequencyPriors[i]); } private double[] getNullAlleleFrequencyPriors(int N) { @@ -124,20 +116,22 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo private void initializeGenotypeLikelihoods(char ref, HashMap contexts, StratifiedContext contextType) { GLs.clear(); + // use flat priors for GLs + DiploidGenotypePriors priors = new DiploidGenotypePriors(); + for ( String sample : contexts.keySet() ) { AlignmentContextBySample context = contexts.get(sample); ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType)); // create the GenotypeLikelihoods object - GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform); - GL.setVerbose(VERBOSE); + GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform); GL.add(pileup, true); GLs.put(sample, GL); } } - private void calculateAlleleFrequencyPosteriors(char ref) { + private void calculateAlleleFrequencyPosteriors(char ref, GenomeLoc verboseLocation) { // initialization for ( char altAllele : BaseUtils.BASES ) { @@ -151,14 +145,9 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo for (int i = 0; i < frequencyEstimationPoints; i++) { double f = (double)i / (double)(frequencyEstimationPoints-1); - DiploidGenotypePriors hardyWeinberg = getHardyWeinbergValues(f, ref); - // for each sample for ( GenotypeLikelihoods GL : GLs.values() ) { - // set the GL prior to be the AF priors and get the corresponding posteriors - //GL.setPriors(hardyWeinberg); - GL.setPriors(new DiploidGenotypePriors()); double[] posteriors = GL.getPosteriors(); // get the ref data @@ -180,7 +169,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo //logger.debug("Normalized posteriors for " + altAllele + ": " + allelePosteriors[0] + " " + allelePosteriors[1] + " " + allelePosteriors[2]); // calculate the posterior weighted frequencies - double[] HWvalues = hardyWeinbergValueCache.get(f).first; + double[] HWvalues = getHardyWeinbergValues(f); double PofDgivenAFi = 0.0; PofDgivenAFi += HWvalues[GenotypeType.REF.ordinal()] * allelePosteriors[GenotypeType.REF.ordinal()]; PofDgivenAFi += HWvalues[GenotypeType.HET.ordinal()] * allelePosteriors[GenotypeType.HET.ordinal()]; @@ -197,28 +186,25 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); - for (int i = 0; i < frequencyEstimationPoints; i++) - logger.debug("P(D|AF=" + i + ") for alt allele " + altAllele + ": " + log10PofDgivenAFi[baseIndex][i]); - // multiply by null allele frequency priors to get AF posteriors, then normalize for (int i = 0; i < frequencyEstimationPoints; i++) alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i]; normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]); - for (int i = 0; i < frequencyEstimationPoints; i++) - logger.debug("Allele frequency posterior estimate for alt allele " + altAllele + " MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + alleleFrequencyPosteriors[baseIndex][i]); - // calculate p(f>0) double sum = 0.0; for (int i = 1; i < frequencyEstimationPoints; i++) sum += alleleFrequencyPosteriors[baseIndex][i]; PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors - logger.info("P(f>0), LOD for alt allele " + altAllele + ": " + Math.log10(PofFs[baseIndex]) + ", " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0]))); } + + // print out stats if we have a position and a writer + if ( verboseLocation != null && verboseWriter != null ) + printAlleleFrequencyData(ref, verboseLocation); } - private DiploidGenotypePriors getHardyWeinbergValues(double f, char ref) { - Pair HWvalues = hardyWeinbergValueCache.get(f); + private double[] getHardyWeinbergValues(double f) { + double[] HWvalues = hardyWeinbergValueCache.get(f); // if it hasn't been calculated yet, do so now if ( HWvalues == null ) { @@ -236,26 +222,11 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo q -= MINIMUM_ALLELE_FREQUENCY; } - double[] freqs = new double[] { Math.pow(p, 2), 2.0 * p * q, Math.pow(q, 2) }; - double[] log10Freqs = new double[] { Math.log10(freqs[0]), Math.log10(freqs[1]), Math.log10(freqs[2]) }; - HWvalues = new Pair(freqs, log10Freqs); + HWvalues = new double[] { Math.pow(p, 2), 2.0 * p * q, Math.pow(q, 2) }; hardyWeinbergValueCache.put(f, HWvalues); } - // create the priors based on the HW frequencies - double[] alleleFreqPriors = new double[10]; - for ( DiploidGenotype g : DiploidGenotype.values() ) { - if ( g.isHomRef(ref) ) - alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.REF.ordinal()]; - else if ( g.isHomVar(ref) ) - alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HOM.ordinal()]; - else if ( g.isHetRef(ref) ) - alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HET.ordinal()]; - else // g is het non-ref - alleleFreqPriors[g.ordinal()] = 0.0; - } - - return new DiploidGenotypePriors(alleleFreqPriors); + return HWvalues; } private void normalizeFromLog10(double[] array) { @@ -281,4 +252,39 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo } return max; } + + private void printAlleleFrequencyData(char ref, GenomeLoc loc) { + + verboseWriter.println("Location=" + loc + ", ref=" + ref); + StringBuilder header = new StringBuilder("MAF\tNullAFpriors\t"); + for ( char altAllele : BaseUtils.BASES ) { + if ( altAllele != ref ) { + char base = Character.toLowerCase(altAllele); + header.append("P(D|AF)_" + base + "\t"); + header.append("PosteriorAF_" + base + "\t"); + } + } + verboseWriter.println(header); + + for (int i = 0; i < frequencyEstimationPoints; i++) { + StringBuilder AFline = new StringBuilder(i + "/" + (frequencyEstimationPoints-1) + "\t" + String.format("%.8f", log10AlleleFrequencyPriors[i]) + "\t"); + for ( char altAllele : BaseUtils.BASES ) { + if ( altAllele != ref ) { + int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); + AFline.append(String.format("%.8f\t%.8f\t", log10PofDgivenAFi[baseIndex][i], alleleFrequencyPosteriors[baseIndex][i])); + } + } + verboseWriter.println(AFline); + } + + for ( char altAllele : BaseUtils.BASES ) { + if ( altAllele != ref ) { + char base = Character.toLowerCase(altAllele); + int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); + verboseWriter.println("P(f>0)_" + base + " = " + Math.log10(PofFs[baseIndex])); + verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0]))); + } + } + verboseWriter.println(); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java index 73f0ce3e2..b7c645d52 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -60,7 +60,6 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation // create the GenotypeLikelihoods object GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform); - GL.setVerbose(VERBOSE); GL.add(pileup, true); return new Pair(pileup, GL); } @@ -85,7 +84,6 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation // create the GenotypeLikelihoods object GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, AFPriors, defaultPlatform); - GL.setVerbose(VERBOSE); GL.add(pileup, true); GLs.put(sample, GL); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 45ea803bb..1609e0de6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -51,8 +51,8 @@ public class UnifiedArgumentCollection { @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false) public boolean GENOTYPE = false; - @Argument(fullName = "verbose", shortName = "v", doc = "EXPERIMENTAL", required = false) - public boolean VERBOSE = false; + @Argument(fullName = "verboseMode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false) + public String VERBOSE = null; // control the error modes @@ -75,8 +75,4 @@ public class UnifiedArgumentCollection { @Argument(fullName = "min_allele_frequency", shortName = "minFreq", doc = "The minimum possible allele frequency in a population (advanced)", required = false) public double MINIMUM_ALLELE_FREQUENCY = 1e-8; - - //@Argument(fullName = "disableCache", doc = "[ADVANCED] If true, we won't use the caching system. This argument is for testing purposes only", required = false) - //public boolean disableCache = false; - } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 66a47b4d1..3de0064ad 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -254,10 +254,11 @@ public class UnifiedGenotyper extends LocusWalker, Genot return sum + 1; } - /** Close the variant file. */ + // Close any file writers public void onTraversalDone(Integer sum) { - logger.info("Processed " + sum + " loci that are callable for SNPs"); writer.close(); + gcm.close(); + logger.info("Processed " + sum + " loci that are callable for SNPs"); } /**