-Cleanup of the Joint Estimation code

-Don't print verbose/debugging output to logger, but instead specify a file in the argument collection (and then we only need to print conditionally)


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1899 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-10-22 15:25:29 +00:00
parent 2cab4c68d4
commit 54c61c663c
5 changed files with 76 additions and 84 deletions

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import java.io.*;
import java.util.*; import java.util.*;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
@ -36,7 +37,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
protected double MINIMUM_ALLELE_FREQUENCY; protected double MINIMUM_ALLELE_FREQUENCY;
protected int maxDeletionsInPileup; protected int maxDeletionsInPileup;
protected String assumedSingleSample; protected String assumedSingleSample;
protected boolean VERBOSE; protected PrintWriter verboseWriter;
/** /**
* Create a new GenotypeCalculationModel object * Create a new GenotypeCalculationModel object
@ -66,36 +67,26 @@ public abstract class GenotypeCalculationModel implements Cloneable {
MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY; MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
maxDeletionsInPileup = UAC.MAX_DELETIONS; maxDeletionsInPileup = UAC.MAX_DELETIONS;
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE; assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
VERBOSE = UAC.VERBOSE; if ( UAC.VERBOSE != null ) {
try {
verboseWriter = new PrintWriter(UAC.VERBOSE);
} catch (FileNotFoundException e) {
throw new StingException("Could not open file " + UAC.VERBOSE + " for writing");
}
} }
/**
* Cloning of the object
* @return clone
* @throws CloneNotSupportedException
*/
protected Object clone() throws CloneNotSupportedException {
GenotypeCalculationModel gcm = (GenotypeCalculationModel)super.clone();
gcm.samples = new HashSet<String>(samples);
gcm.logger = logger;
gcm.baseModel = baseModel;
gcm.heterozygosity = heterozygosity;
gcm.defaultPlatform = defaultPlatform;
gcm.GENOTYPE_MODE = GENOTYPE_MODE;
gcm.POOLED_INPUT = POOLED_INPUT;
gcm.LOD_THRESHOLD = LOD_THRESHOLD;
gcm.MINIMUM_ALLELE_FREQUENCY = MINIMUM_ALLELE_FREQUENCY;
gcm.maxDeletionsInPileup = maxDeletionsInPileup;
gcm.assumedSingleSample = assumedSingleSample;
gcm.VERBOSE = VERBOSE;
return gcm;
} }
public void setUnifiedArgumentCollection(UnifiedArgumentCollection UAC) { public void setUnifiedArgumentCollection(UnifiedArgumentCollection UAC) {
// just re-initialize // just close and re-initialize
close();
initialize(this.samples, this.logger, UAC); initialize(this.samples, this.logger, UAC);
} }
public void close() {
if ( verboseWriter != null )
verboseWriter.close();
}
/** /**
* Must be overridden by concrete subclasses * Must be overridden by concrete subclasses
* @param tracker rod data * @param tracker rod data

View File

@ -17,7 +17,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
// because the Hardy-Weinberg values for a given frequency are constant, // because the Hardy-Weinberg values for a given frequency are constant,
// we cache the results to avoid having to recompute everything // we cache the results to avoid having to recompute everything
private HashMap<Double, Pair<double[], double[]>> hardyWeinbergValueCache = new HashMap<Double, Pair<double[], double[]>>(); private HashMap<Double, double[]> hardyWeinbergValueCache = new HashMap<Double, double[]>();
// the allele frequency priors // the allele frequency priors
private double[] log10AlleleFrequencyPriors; private double[] log10AlleleFrequencyPriors;
@ -44,7 +44,11 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
if ( contexts == null ) if ( contexts == null )
return null; return null;
calculate(ref, contexts, StratifiedContext.OVERALL); initializeAlleleFrequencies(contexts.size());
// run joint estimation for the full GL contexts
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
calculateAlleleFrequencyPosteriors(ref, context.getLocation());
//double lod = overall.getPofD() - overall.getPofNull(); //double lod = overall.getPofD() - overall.getPofNull();
@ -66,15 +70,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
return null; return null;
} }
private void calculate(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
initializeAlleleFrequencies(contexts.size());
initializeGenotypeLikelihoods(ref, contexts, contextType);
calculateAlleleFrequencyPosteriors(ref);
}
private void initializeAlleleFrequencies(int numSamples) { private void initializeAlleleFrequencies(int numSamples) {
// calculate the number of estimation points to use: // calculate the number of estimation points to use:
@ -84,9 +79,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
// set up the allele frequency priors // set up the allele frequency priors
log10AlleleFrequencyPriors = getNullAlleleFrequencyPriors(frequencyEstimationPoints); log10AlleleFrequencyPriors = getNullAlleleFrequencyPriors(frequencyEstimationPoints);
for (int i = 0; i < frequencyEstimationPoints; i++)
logger.debug("Null allele frequency for MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + log10AlleleFrequencyPriors[i]);
} }
private double[] getNullAlleleFrequencyPriors(int N) { private double[] getNullAlleleFrequencyPriors(int N) {
@ -124,20 +116,22 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
private void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) { private void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
GLs.clear(); GLs.clear();
// use flat priors for GLs
DiploidGenotypePriors priors = new DiploidGenotypePriors();
for ( String sample : contexts.keySet() ) { for ( String sample : contexts.keySet() ) {
AlignmentContextBySample context = contexts.get(sample); AlignmentContextBySample context = contexts.get(sample);
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType)); ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
// create the GenotypeLikelihoods object // create the GenotypeLikelihoods object
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform); GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
GL.setVerbose(VERBOSE);
GL.add(pileup, true); GL.add(pileup, true);
GLs.put(sample, GL); GLs.put(sample, GL);
} }
} }
private void calculateAlleleFrequencyPosteriors(char ref) { private void calculateAlleleFrequencyPosteriors(char ref, GenomeLoc verboseLocation) {
// initialization // initialization
for ( char altAllele : BaseUtils.BASES ) { for ( char altAllele : BaseUtils.BASES ) {
@ -151,14 +145,9 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
for (int i = 0; i < frequencyEstimationPoints; i++) { for (int i = 0; i < frequencyEstimationPoints; i++) {
double f = (double)i / (double)(frequencyEstimationPoints-1); double f = (double)i / (double)(frequencyEstimationPoints-1);
DiploidGenotypePriors hardyWeinberg = getHardyWeinbergValues(f, ref);
// for each sample // for each sample
for ( GenotypeLikelihoods GL : GLs.values() ) { for ( GenotypeLikelihoods GL : GLs.values() ) {
// set the GL prior to be the AF priors and get the corresponding posteriors
//GL.setPriors(hardyWeinberg);
GL.setPriors(new DiploidGenotypePriors());
double[] posteriors = GL.getPosteriors(); double[] posteriors = GL.getPosteriors();
// get the ref data // get the ref data
@ -180,7 +169,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
//logger.debug("Normalized posteriors for " + altAllele + ": " + allelePosteriors[0] + " " + allelePosteriors[1] + " " + allelePosteriors[2]); //logger.debug("Normalized posteriors for " + altAllele + ": " + allelePosteriors[0] + " " + allelePosteriors[1] + " " + allelePosteriors[2]);
// calculate the posterior weighted frequencies // calculate the posterior weighted frequencies
double[] HWvalues = hardyWeinbergValueCache.get(f).first; double[] HWvalues = getHardyWeinbergValues(f);
double PofDgivenAFi = 0.0; double PofDgivenAFi = 0.0;
PofDgivenAFi += HWvalues[GenotypeType.REF.ordinal()] * allelePosteriors[GenotypeType.REF.ordinal()]; PofDgivenAFi += HWvalues[GenotypeType.REF.ordinal()] * allelePosteriors[GenotypeType.REF.ordinal()];
PofDgivenAFi += HWvalues[GenotypeType.HET.ordinal()] * allelePosteriors[GenotypeType.HET.ordinal()]; PofDgivenAFi += HWvalues[GenotypeType.HET.ordinal()] * allelePosteriors[GenotypeType.HET.ordinal()];
@ -197,28 +186,25 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
for (int i = 0; i < frequencyEstimationPoints; i++)
logger.debug("P(D|AF=" + i + ") for alt allele " + altAllele + ": " + log10PofDgivenAFi[baseIndex][i]);
// multiply by null allele frequency priors to get AF posteriors, then normalize // multiply by null allele frequency priors to get AF posteriors, then normalize
for (int i = 0; i < frequencyEstimationPoints; i++) for (int i = 0; i < frequencyEstimationPoints; i++)
alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i]; alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i];
normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]); normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]);
for (int i = 0; i < frequencyEstimationPoints; i++)
logger.debug("Allele frequency posterior estimate for alt allele " + altAllele + " MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + alleleFrequencyPosteriors[baseIndex][i]);
// calculate p(f>0) // calculate p(f>0)
double sum = 0.0; double sum = 0.0;
for (int i = 1; i < frequencyEstimationPoints; i++) for (int i = 1; i < frequencyEstimationPoints; i++)
sum += alleleFrequencyPosteriors[baseIndex][i]; sum += alleleFrequencyPosteriors[baseIndex][i];
PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors
logger.info("P(f>0), LOD for alt allele " + altAllele + ": " + Math.log10(PofFs[baseIndex]) + ", " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
}
} }
private DiploidGenotypePriors getHardyWeinbergValues(double f, char ref) { // print out stats if we have a position and a writer
Pair<double[], double[]> HWvalues = hardyWeinbergValueCache.get(f); if ( verboseLocation != null && verboseWriter != null )
printAlleleFrequencyData(ref, verboseLocation);
}
private double[] getHardyWeinbergValues(double f) {
double[] HWvalues = hardyWeinbergValueCache.get(f);
// if it hasn't been calculated yet, do so now // if it hasn't been calculated yet, do so now
if ( HWvalues == null ) { if ( HWvalues == null ) {
@ -236,26 +222,11 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
q -= MINIMUM_ALLELE_FREQUENCY; q -= MINIMUM_ALLELE_FREQUENCY;
} }
double[] freqs = new double[] { Math.pow(p, 2), 2.0 * p * q, Math.pow(q, 2) }; HWvalues = new double[] { Math.pow(p, 2), 2.0 * p * q, Math.pow(q, 2) };
double[] log10Freqs = new double[] { Math.log10(freqs[0]), Math.log10(freqs[1]), Math.log10(freqs[2]) };
HWvalues = new Pair<double[], double[]>(freqs, log10Freqs);
hardyWeinbergValueCache.put(f, HWvalues); hardyWeinbergValueCache.put(f, HWvalues);
} }
// create the priors based on the HW frequencies return HWvalues;
double[] alleleFreqPriors = new double[10];
for ( DiploidGenotype g : DiploidGenotype.values() ) {
if ( g.isHomRef(ref) )
alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.REF.ordinal()];
else if ( g.isHomVar(ref) )
alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HOM.ordinal()];
else if ( g.isHetRef(ref) )
alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HET.ordinal()];
else // g is het non-ref
alleleFreqPriors[g.ordinal()] = 0.0;
}
return new DiploidGenotypePriors(alleleFreqPriors);
} }
private void normalizeFromLog10(double[] array) { private void normalizeFromLog10(double[] array) {
@ -281,4 +252,39 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
} }
return max; return max;
} }
private void printAlleleFrequencyData(char ref, GenomeLoc loc) {
verboseWriter.println("Location=" + loc + ", ref=" + ref);
StringBuilder header = new StringBuilder("MAF\tNullAFpriors\t");
for ( char altAllele : BaseUtils.BASES ) {
if ( altAllele != ref ) {
char base = Character.toLowerCase(altAllele);
header.append("P(D|AF)_" + base + "\t");
header.append("PosteriorAF_" + base + "\t");
}
}
verboseWriter.println(header);
for (int i = 0; i < frequencyEstimationPoints; i++) {
StringBuilder AFline = new StringBuilder(i + "/" + (frequencyEstimationPoints-1) + "\t" + String.format("%.8f", log10AlleleFrequencyPriors[i]) + "\t");
for ( char altAllele : BaseUtils.BASES ) {
if ( altAllele != ref ) {
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
AFline.append(String.format("%.8f\t%.8f\t", log10PofDgivenAFi[baseIndex][i], alleleFrequencyPosteriors[baseIndex][i]));
}
}
verboseWriter.println(AFline);
}
for ( char altAllele : BaseUtils.BASES ) {
if ( altAllele != ref ) {
char base = Character.toLowerCase(altAllele);
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
verboseWriter.println("P(f>0)_" + base + " = " + Math.log10(PofFs[baseIndex]));
verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
}
}
verboseWriter.println();
}
} }

View File

@ -60,7 +60,6 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
// create the GenotypeLikelihoods object // create the GenotypeLikelihoods object
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform); GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
GL.setVerbose(VERBOSE);
GL.add(pileup, true); GL.add(pileup, true);
return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL); return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL);
} }
@ -85,7 +84,6 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
// create the GenotypeLikelihoods object // create the GenotypeLikelihoods object
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, AFPriors, defaultPlatform); GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, AFPriors, defaultPlatform);
GL.setVerbose(VERBOSE);
GL.add(pileup, true); GL.add(pileup, true);
GLs.put(sample, GL); GLs.put(sample, GL);

View File

@ -51,8 +51,8 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false) @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
public boolean GENOTYPE = false; public boolean GENOTYPE = false;
@Argument(fullName = "verbose", shortName = "v", doc = "EXPERIMENTAL", required = false) @Argument(fullName = "verboseMode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
public boolean VERBOSE = false; public String VERBOSE = null;
// control the error modes // control the error modes
@ -75,8 +75,4 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "min_allele_frequency", shortName = "minFreq", doc = "The minimum possible allele frequency in a population (advanced)", required = false) @Argument(fullName = "min_allele_frequency", shortName = "minFreq", doc = "The minimum possible allele frequency in a population (advanced)", required = false)
public double MINIMUM_ALLELE_FREQUENCY = 1e-8; public double MINIMUM_ALLELE_FREQUENCY = 1e-8;
//@Argument(fullName = "disableCache", doc = "[ADVANCED] If true, we won't use the caching system. This argument is for testing purposes only", required = false)
//public boolean disableCache = false;
} }

View File

@ -254,10 +254,11 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
return sum + 1; return sum + 1;
} }
/** Close the variant file. */ // Close any file writers
public void onTraversalDone(Integer sum) { public void onTraversalDone(Integer sum) {
logger.info("Processed " + sum + " loci that are callable for SNPs");
writer.close(); writer.close();
gcm.close();
logger.info("Processed " + sum + " loci that are callable for SNPs");
} }
/** /**