Cleanup of the Joint Estimation code

- Don't print verbose/debugging output to the logger; instead, specify an output file in the argument collection (then we only need to print conditionally)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1899 348d0f76-0448-11de-a6fe-93d51630548a
parent 2cab4c68d4
commit 54c61c663c
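The commit replaces a boolean VERBOSE flag with a PrintWriter that is opened only when the user names an output file. Below is a minimal sketch of the lifecycle the diff implements; the emitDebugData helper is hypothetical, while the other names appear in the hunks that follow (requires java.io.PrintWriter and java.io.FileNotFoundException):

    // Field replaces `protected boolean VERBOSE`
    private PrintWriter verboseWriter;            // stays null unless a file was requested

    void initialize(String verboseFile) {         // verboseFile plays the role of UAC.VERBOSE
        if ( verboseFile != null ) {
            try {
                verboseWriter = new PrintWriter(verboseFile);
            } catch (FileNotFoundException e) {
                throw new StingException("Could not open file " + verboseFile + " for writing");
            }
        }
    }

    void emitDebugData(String line) {             // hypothetical call site
        if ( verboseWriter != null )              // print conditionally, as the commit message says
            verboseWriter.println(line);
    }

    public void close() {                         // reached from onTraversalDone() via gcm.close()
        if ( verboseWriter != null )
            verboseWriter.close();
    }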
@@ -8,6 +8,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.StingException;
 import org.apache.log4j.Logger;
 
+import java.io.*;
 import java.util.*;
 
 import net.sf.samtools.SAMRecord;
@@ -36,7 +37,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
     protected double MINIMUM_ALLELE_FREQUENCY;
     protected int maxDeletionsInPileup;
     protected String assumedSingleSample;
-    protected boolean VERBOSE;
+    protected PrintWriter verboseWriter;
 
     /**
      * Create a new GenotypeCalculationModel object
@@ -66,36 +67,26 @@ public abstract class GenotypeCalculationModel implements Cloneable {
         MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
         maxDeletionsInPileup = UAC.MAX_DELETIONS;
         assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
-        VERBOSE = UAC.VERBOSE;
-    }
-
-    /**
-     * Cloning of the object
-     * @return clone
-     * @throws CloneNotSupportedException
-     */
-    protected Object clone() throws CloneNotSupportedException {
-        GenotypeCalculationModel gcm = (GenotypeCalculationModel)super.clone();
-        gcm.samples = new HashSet<String>(samples);
-        gcm.logger = logger;
-        gcm.baseModel = baseModel;
-        gcm.heterozygosity = heterozygosity;
-        gcm.defaultPlatform = defaultPlatform;
-        gcm.GENOTYPE_MODE = GENOTYPE_MODE;
-        gcm.POOLED_INPUT = POOLED_INPUT;
-        gcm.LOD_THRESHOLD = LOD_THRESHOLD;
-        gcm.MINIMUM_ALLELE_FREQUENCY = MINIMUM_ALLELE_FREQUENCY;
-        gcm.maxDeletionsInPileup = maxDeletionsInPileup;
-        gcm.assumedSingleSample = assumedSingleSample;
-        gcm.VERBOSE = VERBOSE;
-        return gcm;
+        if ( UAC.VERBOSE != null ) {
+            try {
+                verboseWriter = new PrintWriter(UAC.VERBOSE);
+            } catch (FileNotFoundException e) {
+                throw new StingException("Could not open file " + UAC.VERBOSE + " for writing");
+            }
+        }
     }
 
     public void setUnifiedArgumentCollection(UnifiedArgumentCollection UAC) {
-        // just re-initialize
+        // just close and re-initialize
+        close();
         initialize(this.samples, this.logger, UAC);
     }
 
+    public void close() {
+        if ( verboseWriter != null )
+            verboseWriter.close();
+    }
+
     /**
      * Must be overridden by concrete subclasses
      * @param tracker rod data
@@ -17,7 +17,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
 
     // because the Hardy-Weinberg values for a given frequency are constant,
     // we cache the results to avoid having to recompute everything
-    private HashMap<Double, Pair<double[], double[]>> hardyWeinbergValueCache = new HashMap<Double, Pair<double[], double[]>>();
+    private HashMap<Double, double[]> hardyWeinbergValueCache = new HashMap<Double, double[]>();
 
     // the allele frequency priors
     private double[] log10AlleleFrequencyPriors;
@@ -44,7 +44,11 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
         if ( contexts == null )
             return null;
 
-        calculate(ref, contexts, StratifiedContext.OVERALL);
+        initializeAlleleFrequencies(contexts.size());
+
+        // run joint estimation for the full GL contexts
+        initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
+        calculateAlleleFrequencyPosteriors(ref, context.getLocation());
 
 
         //double lod = overall.getPofD() - overall.getPofNull();
@@ -66,15 +70,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
         return null;
     }
 
-    private void calculate(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
-
-        initializeAlleleFrequencies(contexts.size());
-
-        initializeGenotypeLikelihoods(ref, contexts, contextType);
-
-        calculateAlleleFrequencyPosteriors(ref);
-    }
-
     private void initializeAlleleFrequencies(int numSamples) {
 
         // calculate the number of estimation points to use:
@@ -84,9 +79,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
 
         // set up the allele frequency priors
         log10AlleleFrequencyPriors = getNullAlleleFrequencyPriors(frequencyEstimationPoints);
-
-        for (int i = 0; i < frequencyEstimationPoints; i++)
-            logger.debug("Null allele frequency for MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + log10AlleleFrequencyPriors[i]);
     }
 
     private double[] getNullAlleleFrequencyPriors(int N) {
@@ -124,20 +116,22 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
     private void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
         GLs.clear();
 
+        // use flat priors for GLs
+        DiploidGenotypePriors priors = new DiploidGenotypePriors();
+
         for ( String sample : contexts.keySet() ) {
             AlignmentContextBySample context = contexts.get(sample);
             ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
 
             // create the GenotypeLikelihoods object
-            GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform);
-            GL.setVerbose(VERBOSE);
+            GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
             GL.add(pileup, true);
 
             GLs.put(sample, GL);
         }
     }
 
-    private void calculateAlleleFrequencyPosteriors(char ref) {
+    private void calculateAlleleFrequencyPosteriors(char ref, GenomeLoc verboseLocation) {
 
         // initialization
         for ( char altAllele : BaseUtils.BASES ) {
@@ -151,14 +145,9 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
         for (int i = 0; i < frequencyEstimationPoints; i++) {
             double f = (double)i / (double)(frequencyEstimationPoints-1);
 
-            DiploidGenotypePriors hardyWeinberg = getHardyWeinbergValues(f, ref);
-
             // for each sample
             for ( GenotypeLikelihoods GL : GLs.values() ) {
 
-                // set the GL prior to be the AF priors and get the corresponding posteriors
-                //GL.setPriors(hardyWeinberg);
-                GL.setPriors(new DiploidGenotypePriors());
                 double[] posteriors = GL.getPosteriors();
 
                 // get the ref data
@@ -180,7 +169,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
                 //logger.debug("Normalized posteriors for " + altAllele + ": " + allelePosteriors[0] + " " + allelePosteriors[1] + " " + allelePosteriors[2]);
 
                 // calculate the posterior weighted frequencies
-                double[] HWvalues = hardyWeinbergValueCache.get(f).first;
+                double[] HWvalues = getHardyWeinbergValues(f);
                 double PofDgivenAFi = 0.0;
                 PofDgivenAFi += HWvalues[GenotypeType.REF.ordinal()] * allelePosteriors[GenotypeType.REF.ordinal()];
                 PofDgivenAFi += HWvalues[GenotypeType.HET.ordinal()] * allelePosteriors[GenotypeType.HET.ordinal()];
@@ -197,28 +186,25 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
 
             int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
 
-            for (int i = 0; i < frequencyEstimationPoints; i++)
-                logger.debug("P(D|AF=" + i + ") for alt allele " + altAllele + ": " + log10PofDgivenAFi[baseIndex][i]);
-
             // multiply by null allele frequency priors to get AF posteriors, then normalize
             for (int i = 0; i < frequencyEstimationPoints; i++)
                 alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i];
             normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]);
 
-            for (int i = 0; i < frequencyEstimationPoints; i++)
-                logger.debug("Allele frequency posterior estimate for alt allele " + altAllele + " MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + alleleFrequencyPosteriors[baseIndex][i]);
-
             // calculate p(f>0)
             double sum = 0.0;
             for (int i = 1; i < frequencyEstimationPoints; i++)
                 sum += alleleFrequencyPosteriors[baseIndex][i];
             PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors
-            logger.info("P(f>0), LOD for alt allele " + altAllele + ": " + Math.log10(PofFs[baseIndex]) + ", " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
         }
 
+        // print out stats if we have a position and a writer
+        if ( verboseLocation != null && verboseWriter != null )
+            printAlleleFrequencyData(ref, verboseLocation);
     }
 
-    private DiploidGenotypePriors getHardyWeinbergValues(double f, char ref) {
-        Pair<double[], double[]> HWvalues = hardyWeinbergValueCache.get(f);
+    private double[] getHardyWeinbergValues(double f) {
+        double[] HWvalues = hardyWeinbergValueCache.get(f);
 
         // if it hasn't been calculated yet, do so now
         if ( HWvalues == null ) {
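The per-allele summary dropped from logger.info above reduces to two quantities that can be read straight off the normalized posterior vector. A minimal sketch, with posteriors standing in for alleleFrequencyPosteriors[baseIndex] after normalizeFromLog10():

    // posteriors[i] ~ P(AF = i/(N-1) | D), normalized to sum to 1
    double pFgt0 = 0.0;
    for (int i = 1; i < posteriors.length; i++)    // i = 0 is the f = 0 (no-variant) point
        pFgt0 += posteriors[i];
    pFgt0 = Math.min(pFgt0, 1.0);                  // clamp away floating-point overshoot

    // LOD: log-odds of a variant allele being present vs. absent
    double lod = Math.log10(pFgt0) - Math.log10(posteriors[0]);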
@@ -236,26 +222,11 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
                 q -= MINIMUM_ALLELE_FREQUENCY;
             }
 
-            double[] freqs = new double[] { Math.pow(p, 2), 2.0 * p * q, Math.pow(q, 2) };
-            double[] log10Freqs = new double[] { Math.log10(freqs[0]), Math.log10(freqs[1]), Math.log10(freqs[2]) };
-            HWvalues = new Pair<double[], double[]>(freqs, log10Freqs);
+            HWvalues = new double[] { Math.pow(p, 2), 2.0 * p * q, Math.pow(q, 2) };
             hardyWeinbergValueCache.put(f, HWvalues);
         }
 
-        // create the priors based on the HW frequencies
-        double[] alleleFreqPriors = new double[10];
-        for ( DiploidGenotype g : DiploidGenotype.values() ) {
-            if ( g.isHomRef(ref) )
-                alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.REF.ordinal()];
-            else if ( g.isHomVar(ref) )
-                alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HOM.ordinal()];
-            else if ( g.isHetRef(ref) )
-                alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HET.ordinal()];
-            else // g is het non-ref
-                alleleFreqPriors[g.ordinal()] = 0.0;
-        }
-
-        return new DiploidGenotypePriors(alleleFreqPriors);
+        return HWvalues;
     }
 
     private void normalizeFromLog10(double[] array) {
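getHardyWeinbergValues() now returns the bare genotype-frequency triple {p^2, 2pq, q^2}, cached per allele frequency since those values never change for a given f. Ignoring the tiny MINIMUM_ALLELE_FREQUENCY adjustment shown above, and assuming GenotypeType orders REF, HET, HOM, a quick worked check of the cached triple:

    // For alt-allele frequency q = 0.1 (so p = 0.9), the cached triple is:
    double[] hw = getHardyWeinbergValues(0.1);
    // hw[GenotypeType.REF.ordinal()] = 0.9 * 0.9     = 0.81  (hom-ref)
    // hw[GenotypeType.HET.ordinal()] = 2 * 0.9 * 0.1 = 0.18  (het)
    // hw[GenotypeType.HOM.ordinal()] = 0.1 * 0.1     = 0.01  (hom-var)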
@@ -281,4 +252,39 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
         }
         return max;
     }
+
+    private void printAlleleFrequencyData(char ref, GenomeLoc loc) {
+
+        verboseWriter.println("Location=" + loc + ", ref=" + ref);
+        StringBuilder header = new StringBuilder("MAF\tNullAFpriors\t");
+        for ( char altAllele : BaseUtils.BASES ) {
+            if ( altAllele != ref ) {
+                char base = Character.toLowerCase(altAllele);
+                header.append("P(D|AF)_" + base + "\t");
+                header.append("PosteriorAF_" + base + "\t");
+            }
+        }
+        verboseWriter.println(header);
+
+        for (int i = 0; i < frequencyEstimationPoints; i++) {
+            StringBuilder AFline = new StringBuilder(i + "/" + (frequencyEstimationPoints-1) + "\t" + String.format("%.8f", log10AlleleFrequencyPriors[i]) + "\t");
+            for ( char altAllele : BaseUtils.BASES ) {
+                if ( altAllele != ref ) {
+                    int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
+                    AFline.append(String.format("%.8f\t%.8f\t", log10PofDgivenAFi[baseIndex][i], alleleFrequencyPosteriors[baseIndex][i]));
+                }
+            }
+            verboseWriter.println(AFline);
+        }
+
+        for ( char altAllele : BaseUtils.BASES ) {
+            if ( altAllele != ref ) {
+                char base = Character.toLowerCase(altAllele);
+                int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
+                verboseWriter.println("P(f>0)_" + base + " = " + Math.log10(PofFs[baseIndex]));
+                verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
+            }
+        }
+        verboseWriter.println();
+    }
+}
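For reference, printAlleleFrequencyData() above emits one tab-separated block per locus. From the code, the shape of the output for a hypothetical site with ref=G (alt alleles a, c, t, lower-cased in the headers; location and point count hypothetical, values elided) would be:

    Location=chr1:10000, ref=G
    MAF	NullAFpriors	P(D|AF)_a	PosteriorAF_a	P(D|AF)_c	PosteriorAF_c	P(D|AF)_t	PosteriorAF_t
    0/200	...	...	...	...	...	...	...
    ...
    P(f>0)_a = ...
    LOD_a = ...
    (and likewise for _c and _t)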
@@ -60,7 +60,6 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
 
         // create the GenotypeLikelihoods object
         GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
-        GL.setVerbose(VERBOSE);
         GL.add(pileup, true);
         return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL);
     }
@@ -85,7 +84,6 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
 
         // create the GenotypeLikelihoods object
         GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, AFPriors, defaultPlatform);
-        GL.setVerbose(VERBOSE);
         GL.add(pileup, true);
 
         GLs.put(sample, GL);
@@ -51,8 +51,8 @@ public class UnifiedArgumentCollection {
     @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
     public boolean GENOTYPE = false;
 
-    @Argument(fullName = "verbose", shortName = "v", doc = "EXPERIMENTAL", required = false)
-    public boolean VERBOSE = false;
+    @Argument(fullName = "verboseMode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
+    public String VERBOSE = null;
 
 
     // control the error modes
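Since VERBOSE is now a String holding a file name, enabling verbose mode from the command line means passing a path rather than a bare switch. Given the fullName/shortName pair above, and assuming the usual Sting @Argument parsing, an invocation would include something like (file name hypothetical):

    -verbose verbose_debug.txt        // short form; long form: --verboseMode verbose_debug.txt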
@@ -75,8 +75,4 @@ public class UnifiedArgumentCollection {
 
     @Argument(fullName = "min_allele_frequency", shortName = "minFreq", doc = "The minimum possible allele frequency in a population (advanced)", required = false)
     public double MINIMUM_ALLELE_FREQUENCY = 1e-8;
-
-    //@Argument(fullName = "disableCache", doc = "[ADVANCED] If true, we won't use the caching system. This argument is for testing purposes only", required = false)
-    //public boolean disableCache = false;
-
 }
@@ -254,10 +254,11 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
         return sum + 1;
     }
 
-    /** Close the variant file. */
+    // Close any file writers
     public void onTraversalDone(Integer sum) {
-        logger.info("Processed " + sum + " loci that are callable for SNPs");
         writer.close();
+        gcm.close();
+        logger.info("Processed " + sum + " loci that are callable for SNPs");
     }
 
     /**