I had to remove some of my more aggressive optimizations, as they were causing us to get slightly different results than MSG. This results in only a small cost to running time.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1856 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7d7ff09f54
commit
993c567bd8
|
|
@ -11,7 +11,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
|
|||
|
||||
protected AllMAFsGenotypeCalculationModel() {}
|
||||
|
||||
protected double[] initializeAlleleFrequencies(int numSamples, int[] baseCounts) {
|
||||
protected double[] initializeAlleleFrequencies(int numSamples, char ref) {
|
||||
double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -11,10 +11,10 @@ import java.util.*;
|
|||
public abstract class EMGenotypeCalculationModel extends GenotypeCalculationModel {
|
||||
|
||||
// We need to set a limit on the EM iterations in case something flukey goes on
|
||||
protected static final int MAX_EM_ITERATIONS = 6;
|
||||
protected static final int MAX_EM_ITERATIONS = 8;
|
||||
|
||||
// We consider the EM stable when the MAF doesn't change more than 1/10N
|
||||
protected static final double EM_STABILITY_METRIC = 0.1;
|
||||
// We consider the EM stable when the MAF doesn't change more than 1/1000
|
||||
protected static final double EM_STABILITY_METRIC = 1e-3;
|
||||
|
||||
|
||||
protected EMGenotypeCalculationModel() {}
|
||||
|
|
@ -22,23 +22,18 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
|||
public Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
|
||||
|
||||
// keep track of the context for each sample, overall and separated by strand
|
||||
int[] baseCounts = new int[4];
|
||||
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, baseCounts);
|
||||
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context);
|
||||
if ( contexts == null )
|
||||
return null;
|
||||
|
||||
// debugging output
|
||||
for (int i = 0; i < 4; i++)
|
||||
logger.debug("Base count for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i]);
|
||||
|
||||
// run the EM calculation
|
||||
EMOutput overall = runEM(ref, contexts, priors, baseCounts, StratifiedContext.OVERALL);
|
||||
EMOutput overall = runEM(ref, contexts, priors, StratifiedContext.OVERALL);
|
||||
double lod = overall.getPofD() - overall.getPofNull();
|
||||
logger.debug("lod=" + lod);
|
||||
|
||||
// calculate strand score
|
||||
EMOutput forward = runEM(ref, contexts, priors, baseCounts, StratifiedContext.FORWARD);
|
||||
EMOutput reverse = runEM(ref, contexts, priors, baseCounts, StratifiedContext.REVERSE);
|
||||
EMOutput forward = runEM(ref, contexts, priors, StratifiedContext.FORWARD);
|
||||
EMOutput reverse = runEM(ref, contexts, priors, StratifiedContext.REVERSE);
|
||||
double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull();
|
||||
double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull();
|
||||
logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
|
||||
|
|
@ -54,11 +49,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
|||
protected List<GenotypeCall> genotypeCallsFromGenotypeLikelihoods(EMOutput results, char ref, HashMap<String, AlignmentContextBySample> contexts) {
|
||||
HashMap<String, GenotypeLikelihoods> GLs = results.getGenotypeLikelihoods();
|
||||
|
||||
// an optimization
|
||||
double expectedChromosomes = 2.0 * (double)GLs.size() * results.getMAF();
|
||||
if ( expectedChromosomes < 1.0 )
|
||||
return new ArrayList<GenotypeCall>();
|
||||
|
||||
ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>();
|
||||
int variantCalls = 0;
|
||||
|
||||
|
|
@ -67,7 +57,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
|||
AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL);
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
pileup.setIncludeDeletionsInPileupString(true);
|
||||
|
||||
// create the call
|
||||
GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup);
|
||||
calls.add(call);
|
||||
|
|
@ -79,15 +68,15 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
|||
|
||||
// if everyone is ref, don't emit any calls
|
||||
if ( variantCalls == 0 )
|
||||
calls = null;
|
||||
calls.clear();
|
||||
|
||||
return calls;
|
||||
}
|
||||
|
||||
public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, int[] baseCounts, StratifiedContext contextType) {
|
||||
public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
|
||||
|
||||
// get initial allele frequencies
|
||||
double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), baseCounts);
|
||||
double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), ref);
|
||||
for (int i = 0; i < alleleFrequencies.length; i++)
|
||||
logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]);
|
||||
|
||||
|
|
@ -119,7 +108,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
|||
return computePofF(ref, GLs, alleleFrequencies, contexts.size());
|
||||
}
|
||||
|
||||
protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts);
|
||||
protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, char ref);
|
||||
protected abstract HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType);
|
||||
protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs);
|
||||
protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies);
|
||||
|
|
@ -127,15 +116,12 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
|||
|
||||
|
||||
protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) {
|
||||
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N
|
||||
double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)numSamplesInContext);
|
||||
|
||||
// determine whether we're stable
|
||||
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC
|
||||
double AF_delta = 0.0;
|
||||
for (int i = 0; i < oldAlleleFrequencies.length; i++)
|
||||
AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]);
|
||||
|
||||
return (AF_delta < EM_STABILITY_VALUE);
|
||||
return (AF_delta < EM_STABILITY_METRIC);
|
||||
}
|
||||
|
||||
protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) {
|
||||
|
|
@ -161,11 +147,9 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
|||
* Create the mapping from sample to alignment context; also, fill in the base counts along the way.
|
||||
* Returns null iff there are too many deletions at this position.
|
||||
*/
|
||||
protected HashMap<String, AlignmentContextBySample> splitContextBySample(AlignmentContext context, int[] baseCounts) {
|
||||
protected HashMap<String, AlignmentContextBySample> splitContextBySample(AlignmentContext context) {
|
||||
|
||||
HashMap<String, AlignmentContextBySample> contexts = new HashMap<String, AlignmentContextBySample>();
|
||||
for (int i = 0; i < 4; i++)
|
||||
baseCounts[i] = 0;
|
||||
|
||||
int deletionsInPile = 0;
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
|
|
@ -197,14 +181,10 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
|||
// are there too many deletions in the pileup?
|
||||
if ( ++deletionsInPile > maxDeletionsInPileup )
|
||||
return null;
|
||||
} else {
|
||||
// add bad bases to the context (for DoC calculations), but don't count them
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(read.getReadString().charAt(offset));
|
||||
if ( baseIndex != -1 )
|
||||
baseCounts[baseIndex]++;
|
||||
}
|
||||
|
||||
// add the read to this sample's context
|
||||
// note that bad bases are added to the context (for DoC calculations later)
|
||||
myContext.add(read, offset);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
|
|||
if ( samples.size() == 1 ) {
|
||||
|
||||
// split the context (so we can get forward/reverse contexts for free)
|
||||
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, new int[4]);
|
||||
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context);
|
||||
if ( contexts == null )
|
||||
return null;
|
||||
|
||||
|
|
@ -53,16 +53,13 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
|
|||
return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL);
|
||||
}
|
||||
|
||||
protected double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts) {
|
||||
// An intelligent guess would be the observed base ratios
|
||||
// (plus some small number to account for sampling issues).
|
||||
int totalObservedBases = 0;
|
||||
for (int i = 0; i < 4; i++)
|
||||
totalObservedBases += baseCounts[i];
|
||||
protected double[] initializeAlleleFrequencies(int numSamplesInContext, char ref) {
|
||||
// Initialization values from Jared's original MSG code
|
||||
double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1-heterozygosity)
|
||||
double REF = 0.9994999; //sqrt(1-heterozygosity)
|
||||
|
||||
double[] alleleFrequencies = new double[4];
|
||||
for (int i = 0; i < 4; i++)
|
||||
alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases;
|
||||
double[] alleleFrequencies = {NON_REF, NON_REF, NON_REF, NON_REF};
|
||||
alleleFrequencies[BaseUtils.simpleBaseToBaseIndex(ref)] = REF;
|
||||
return alleleFrequencies;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue