I had to remove some of my more aggressive optimizations, as they were causing us to get slightly different results than MSG. This results in only a small cost to running time.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1856 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-10-16 00:59:32 +00:00
parent 7d7ff09f54
commit 993c567bd8
3 changed files with 23 additions and 46 deletions

View File

@ -11,7 +11,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
protected AllMAFsGenotypeCalculationModel() {}
protected double[] initializeAlleleFrequencies(int numSamples, int[] baseCounts) {
protected double[] initializeAlleleFrequencies(int numSamples, char ref) {
double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles

View File

@ -11,10 +11,10 @@ import java.util.*;
public abstract class EMGenotypeCalculationModel extends GenotypeCalculationModel {
// We need to set a limit on the EM iterations in case something flukey goes on
protected static final int MAX_EM_ITERATIONS = 6;
protected static final int MAX_EM_ITERATIONS = 8;
// We consider the EM stable when the MAF doesn't change more than 1/10N
protected static final double EM_STABILITY_METRIC = 0.1;
// We consider the EM stable when the MAF doesn't change more than 1/1000
protected static final double EM_STABILITY_METRIC = 1e-3;
protected EMGenotypeCalculationModel() {}
@ -22,23 +22,18 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
public Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
// keep track of the context for each sample, overall and separated by strand
int[] baseCounts = new int[4];
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, baseCounts);
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context);
if ( contexts == null )
return null;
// debugging output
for (int i = 0; i < 4; i++)
logger.debug("Base count for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i]);
// run the EM calculation
EMOutput overall = runEM(ref, contexts, priors, baseCounts, StratifiedContext.OVERALL);
EMOutput overall = runEM(ref, contexts, priors, StratifiedContext.OVERALL);
double lod = overall.getPofD() - overall.getPofNull();
logger.debug("lod=" + lod);
// calculate strand score
EMOutput forward = runEM(ref, contexts, priors, baseCounts, StratifiedContext.FORWARD);
EMOutput reverse = runEM(ref, contexts, priors, baseCounts, StratifiedContext.REVERSE);
EMOutput forward = runEM(ref, contexts, priors, StratifiedContext.FORWARD);
EMOutput reverse = runEM(ref, contexts, priors, StratifiedContext.REVERSE);
double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull();
double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull();
logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
@ -54,11 +49,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
protected List<GenotypeCall> genotypeCallsFromGenotypeLikelihoods(EMOutput results, char ref, HashMap<String, AlignmentContextBySample> contexts) {
HashMap<String, GenotypeLikelihoods> GLs = results.getGenotypeLikelihoods();
// an optimization
double expectedChromosomes = 2.0 * (double)GLs.size() * results.getMAF();
if ( expectedChromosomes < 1.0 )
return new ArrayList<GenotypeCall>();
ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>();
int variantCalls = 0;
@ -67,7 +57,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL);
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
pileup.setIncludeDeletionsInPileupString(true);
// create the call
GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup);
calls.add(call);
@ -79,15 +68,15 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
// if everyone is ref, don't emit any calls
if ( variantCalls == 0 )
calls = null;
calls.clear();
return calls;
}
public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, int[] baseCounts, StratifiedContext contextType) {
public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
// get initial allele frequencies
double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), baseCounts);
double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), ref);
for (int i = 0; i < alleleFrequencies.length; i++)
logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]);
@ -119,7 +108,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
return computePofF(ref, GLs, alleleFrequencies, contexts.size());
}
protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts);
protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, char ref);
protected abstract HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType);
protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs);
protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies);
@ -127,15 +116,12 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) {
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N
double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)numSamplesInContext);
// determine whether we're stable
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC
double AF_delta = 0.0;
for (int i = 0; i < oldAlleleFrequencies.length; i++)
AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]);
return (AF_delta < EM_STABILITY_VALUE);
return (AF_delta < EM_STABILITY_METRIC);
}
protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) {
@ -161,11 +147,9 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
* Create the mapping from sample to alignment context; also, fill in the base counts along the way.
* Returns null iff there are too many deletions at this position.
*/
protected HashMap<String, AlignmentContextBySample> splitContextBySample(AlignmentContext context, int[] baseCounts) {
protected HashMap<String, AlignmentContextBySample> splitContextBySample(AlignmentContext context) {
HashMap<String, AlignmentContextBySample> contexts = new HashMap<String, AlignmentContextBySample>();
for (int i = 0; i < 4; i++)
baseCounts[i] = 0;
int deletionsInPile = 0;
List<SAMRecord> reads = context.getReads();
@ -197,14 +181,10 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
// are there too many deletions in the pileup?
if ( ++deletionsInPile > maxDeletionsInPileup )
return null;
} else {
// add bad bases to the context (for DoC calculations), but don't count them
int baseIndex = BaseUtils.simpleBaseToBaseIndex(read.getReadString().charAt(offset));
if ( baseIndex != -1 )
baseCounts[baseIndex]++;
}
// add the read to this sample's context
// note that bad bases are added to the context (for DoC calculations later)
myContext.add(read, offset);
}

View File

@ -19,7 +19,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
if ( samples.size() == 1 ) {
// split the context (so we can get forward/reverse contexts for free)
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, new int[4]);
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context);
if ( contexts == null )
return null;
@ -53,16 +53,13 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL);
}
protected double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts) {
// An intelligent guess would be the observed base ratios
// (plus some small number to account for sampling issues).
int totalObservedBases = 0;
for (int i = 0; i < 4; i++)
totalObservedBases += baseCounts[i];
protected double[] initializeAlleleFrequencies(int numSamplesInContext, char ref) {
// Initialization values from Jared's original MSG code
double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1-heterozygosity))
double REF = 0.9994999; //sqrt(1-heterozygosity)
double[] alleleFrequencies = new double[4];
for (int i = 0; i < 4; i++)
alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases;
double[] alleleFrequencies = {NON_REF, NON_REF, NON_REF, NON_REF};
alleleFrequencies[BaseUtils.simpleBaseToBaseIndex(ref)] = REF;
return alleleFrequencies;
}