I had to remove some of my more agressive optimizations, as they were causing us to get slightly different results as MSG. Results in only small cost to running time.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1856 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-10-16 00:59:32 +00:00
parent 7d7ff09f54
commit 993c567bd8
3 changed files with 23 additions and 46 deletions

View File

@ -11,7 +11,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
protected AllMAFsGenotypeCalculationModel() {} protected AllMAFsGenotypeCalculationModel() {}
protected double[] initializeAlleleFrequencies(int numSamples, int[] baseCounts) { protected double[] initializeAlleleFrequencies(int numSamples, char ref) {
double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles

View File

@ -11,10 +11,10 @@ import java.util.*;
public abstract class EMGenotypeCalculationModel extends GenotypeCalculationModel { public abstract class EMGenotypeCalculationModel extends GenotypeCalculationModel {
// We need to set a limit on the EM iterations in case something flukey goes on // We need to set a limit on the EM iterations in case something flukey goes on
protected static final int MAX_EM_ITERATIONS = 6; protected static final int MAX_EM_ITERATIONS = 8;
// We consider the EM stable when the MAF doesn't change more than 1/10N // We consider the EM stable when the MAF doesn't change more than 1/1000
protected static final double EM_STABILITY_METRIC = 0.1; protected static final double EM_STABILITY_METRIC = 1e-3;
protected EMGenotypeCalculationModel() {} protected EMGenotypeCalculationModel() {}
@ -22,23 +22,18 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
public Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { public Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
// keep track of the context for each sample, overall and separated by strand // keep track of the context for each sample, overall and separated by strand
int[] baseCounts = new int[4]; HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context);
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, baseCounts);
if ( contexts == null ) if ( contexts == null )
return null; return null;
// debugging output
for (int i = 0; i < 4; i++)
logger.debug("Base count for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i]);
// run the EM calculation // run the EM calculation
EMOutput overall = runEM(ref, contexts, priors, baseCounts, StratifiedContext.OVERALL); EMOutput overall = runEM(ref, contexts, priors, StratifiedContext.OVERALL);
double lod = overall.getPofD() - overall.getPofNull(); double lod = overall.getPofD() - overall.getPofNull();
logger.debug("lod=" + lod); logger.debug("lod=" + lod);
// calculate strand score // calculate strand score
EMOutput forward = runEM(ref, contexts, priors, baseCounts, StratifiedContext.FORWARD); EMOutput forward = runEM(ref, contexts, priors, StratifiedContext.FORWARD);
EMOutput reverse = runEM(ref, contexts, priors, baseCounts, StratifiedContext.REVERSE); EMOutput reverse = runEM(ref, contexts, priors, StratifiedContext.REVERSE);
double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull(); double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull();
double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull(); double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull();
logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
@ -54,11 +49,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
protected List<GenotypeCall> genotypeCallsFromGenotypeLikelihoods(EMOutput results, char ref, HashMap<String, AlignmentContextBySample> contexts) { protected List<GenotypeCall> genotypeCallsFromGenotypeLikelihoods(EMOutput results, char ref, HashMap<String, AlignmentContextBySample> contexts) {
HashMap<String, GenotypeLikelihoods> GLs = results.getGenotypeLikelihoods(); HashMap<String, GenotypeLikelihoods> GLs = results.getGenotypeLikelihoods();
// an optimization
double expectedChromosomes = 2.0 * (double)GLs.size() * results.getMAF();
if ( expectedChromosomes < 1.0 )
return new ArrayList<GenotypeCall>();
ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>(); ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>();
int variantCalls = 0; int variantCalls = 0;
@ -67,7 +57,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL); AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL);
ReadBackedPileup pileup = new ReadBackedPileup(ref, context); ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
pileup.setIncludeDeletionsInPileupString(true); pileup.setIncludeDeletionsInPileupString(true);
// create the call // create the call
GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup); GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup);
calls.add(call); calls.add(call);
@ -79,15 +68,15 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
// if everyone is ref, don't emit any calls // if everyone is ref, don't emit any calls
if ( variantCalls == 0 ) if ( variantCalls == 0 )
calls = null; calls.clear();
return calls; return calls;
} }
public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, int[] baseCounts, StratifiedContext contextType) { public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
// get initial allele frequencies // get initial allele frequencies
double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), baseCounts); double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), ref);
for (int i = 0; i < alleleFrequencies.length; i++) for (int i = 0; i < alleleFrequencies.length; i++)
logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]); logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]);
@ -119,7 +108,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
return computePofF(ref, GLs, alleleFrequencies, contexts.size()); return computePofF(ref, GLs, alleleFrequencies, contexts.size());
} }
protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts); protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, char ref);
protected abstract HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType); protected abstract HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType);
protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs); protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs);
protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies); protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies);
@ -127,15 +116,12 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) { protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) {
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC
double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)numSamplesInContext);
// determine whether we're stable
double AF_delta = 0.0; double AF_delta = 0.0;
for (int i = 0; i < oldAlleleFrequencies.length; i++) for (int i = 0; i < oldAlleleFrequencies.length; i++)
AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]); AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]);
return (AF_delta < EM_STABILITY_VALUE); return (AF_delta < EM_STABILITY_METRIC);
} }
protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) { protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) {
@ -161,11 +147,9 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
* Create the mapping from sample to alignment context; also, fill in the base counts along the way. * Create the mapping from sample to alignment context; also, fill in the base counts along the way.
* Returns null iff there are too many deletions at this position. * Returns null iff there are too many deletions at this position.
*/ */
protected HashMap<String, AlignmentContextBySample> splitContextBySample(AlignmentContext context, int[] baseCounts) { protected HashMap<String, AlignmentContextBySample> splitContextBySample(AlignmentContext context) {
HashMap<String, AlignmentContextBySample> contexts = new HashMap<String, AlignmentContextBySample>(); HashMap<String, AlignmentContextBySample> contexts = new HashMap<String, AlignmentContextBySample>();
for (int i = 0; i < 4; i++)
baseCounts[i] = 0;
int deletionsInPile = 0; int deletionsInPile = 0;
List<SAMRecord> reads = context.getReads(); List<SAMRecord> reads = context.getReads();
@ -197,14 +181,10 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
// are there too many deletions in the pileup? // are there too many deletions in the pileup?
if ( ++deletionsInPile > maxDeletionsInPileup ) if ( ++deletionsInPile > maxDeletionsInPileup )
return null; return null;
} else {
// add bad bases to the context (for DoC calculations), but don't count them
int baseIndex = BaseUtils.simpleBaseToBaseIndex(read.getReadString().charAt(offset));
if ( baseIndex != -1 )
baseCounts[baseIndex]++;
} }
// add the read to this sample's context // add the read to this sample's context
// note that bad bases are added to the context (for DoC calculations later)
myContext.add(read, offset); myContext.add(read, offset);
} }

View File

@ -19,7 +19,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
if ( samples.size() == 1 ) { if ( samples.size() == 1 ) {
// split the context (so we can get forward/reverse contexts for free) // split the context (so we can get forward/reverse contexts for free)
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, new int[4]); HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context);
if ( contexts == null ) if ( contexts == null )
return null; return null;
@ -53,16 +53,13 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL); return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL);
} }
protected double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts) { protected double[] initializeAlleleFrequencies(int numSamplesInContext, char ref) {
// An intelligent guess would be the observed base ratios // Initialization values from Jared's original MSG code
// (plus some small number to account for sampling issues). double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1-heterozygosity)
int totalObservedBases = 0; double REF = 0.9994999; //sqrt(1-heterozygosity)
for (int i = 0; i < 4; i++)
totalObservedBases += baseCounts[i];
double[] alleleFrequencies = new double[4]; double[] alleleFrequencies = {NON_REF, NON_REF, NON_REF, NON_REF};
for (int i = 0; i < 4; i++) alleleFrequencies[BaseUtils.simpleBaseToBaseIndex(ref)] = REF;
alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases;
return alleleFrequencies; return alleleFrequencies;
} }