From 993c567bd88cfe8bce589a8462fc349fa99c1938 Mon Sep 17 00:00:00 2001 From: ebanks Date: Fri, 16 Oct 2009 00:59:32 +0000 Subject: [PATCH] I had to remove some of my more aggressive optimizations, as they were causing us to get slightly different results than MSG. Results in only a small cost to running time. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1856 348d0f76-0448-11de-a6fe-93d51630548a --- .../AllMAFsGenotypeCalculationModel.java | 2 +- .../genotyper/EMGenotypeCalculationModel.java | 50 ++++++------------- ...PointEstimateGenotypeCalculationModel.java | 17 +++---- 3 files changed, 23 insertions(+), 46 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java index 3a9f943e2..5ee1962e1 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java @@ -11,7 +11,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel protected AllMAFsGenotypeCalculationModel() {} - protected double[] initializeAlleleFrequencies(int numSamples, int[] baseCounts) { + protected double[] initializeAlleleFrequencies(int numSamples, char ref) { double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index 9d22fc3e7..d036425ef 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -11,10 +11,10 @@ import java.util.*; public abstract class EMGenotypeCalculationModel 
extends GenotypeCalculationModel { // We need to set a limit on the EM iterations in case something flukey goes on - protected static final int MAX_EM_ITERATIONS = 6; + protected static final int MAX_EM_ITERATIONS = 8; - // We consider the EM stable when the MAF doesn't change more than 1/10N - protected static final double EM_STABILITY_METRIC = 0.1; + // We consider the EM stable when the MAF doesn't change more than 1/1000 + protected static final double EM_STABILITY_METRIC = 1e-3; protected EMGenotypeCalculationModel() {} @@ -22,23 +22,18 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode public Pair, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { // keep track of the context for each sample, overall and separated by strand - int[] baseCounts = new int[4]; - HashMap contexts = splitContextBySample(context, baseCounts); + HashMap contexts = splitContextBySample(context); if ( contexts == null ) return null; - // debugging output - for (int i = 0; i < 4; i++) - logger.debug("Base count for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i]); - // run the EM calculation - EMOutput overall = runEM(ref, contexts, priors, baseCounts, StratifiedContext.OVERALL); + EMOutput overall = runEM(ref, contexts, priors, StratifiedContext.OVERALL); double lod = overall.getPofD() - overall.getPofNull(); logger.debug("lod=" + lod); // calculate strand score - EMOutput forward = runEM(ref, contexts, priors, baseCounts, StratifiedContext.FORWARD); - EMOutput reverse = runEM(ref, contexts, priors, baseCounts, StratifiedContext.REVERSE); + EMOutput forward = runEM(ref, contexts, priors, StratifiedContext.FORWARD); + EMOutput reverse = runEM(ref, contexts, priors, StratifiedContext.REVERSE); double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull(); double reverseLod = (reverse.getPofD() + forward.getPofNull()) - 
overall.getPofNull(); logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); @@ -54,11 +49,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode protected List genotypeCallsFromGenotypeLikelihoods(EMOutput results, char ref, HashMap contexts) { HashMap GLs = results.getGenotypeLikelihoods(); - // an optimization - double expectedChromosomes = 2.0 * (double)GLs.size() * results.getMAF(); - if ( expectedChromosomes < 1.0 ) - return new ArrayList(); - ArrayList calls = new ArrayList(); int variantCalls = 0; @@ -67,7 +57,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL); ReadBackedPileup pileup = new ReadBackedPileup(ref, context); pileup.setIncludeDeletionsInPileupString(true); - // create the call GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup); calls.add(call); @@ -79,15 +68,15 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode // if everyone is ref, don't emit any calls if ( variantCalls == 0 ) - calls = null; + calls.clear(); return calls; } - public EMOutput runEM(char ref, HashMap contexts, DiploidGenotypePriors priors, int[] baseCounts, StratifiedContext contextType) { + public EMOutput runEM(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType) { // get initial allele frequencies - double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), baseCounts); + double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), ref); for (int i = 0; i < alleleFrequencies.length; i++) logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]); @@ -119,7 +108,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode return computePofF(ref, GLs, alleleFrequencies, contexts.size()); } - protected abstract double[] 
initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts); + protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, char ref); protected abstract HashMap initializeGenotypeLikelihoods(char ref, HashMap contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType); protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap GLs); protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap GLs, double[] alleleFrequencies); @@ -127,15 +116,12 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) { - // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N - double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)numSamplesInContext); - - // determine whether we're stable + // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC double AF_delta = 0.0; for (int i = 0; i < oldAlleleFrequencies.length; i++) AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]); - return (AF_delta < EM_STABILITY_VALUE); + return (AF_delta < EM_STABILITY_METRIC); } protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) { @@ -161,11 +147,9 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode * Create the mapping from sample to alignment context; also, fill in the base counts along the way. * Returns null iff there are too many deletions at this position. 
*/ - protected HashMap splitContextBySample(AlignmentContext context, int[] baseCounts) { + protected HashMap splitContextBySample(AlignmentContext context) { HashMap contexts = new HashMap(); - for (int i = 0; i < 4; i++) - baseCounts[i] = 0; int deletionsInPile = 0; List reads = context.getReads(); @@ -197,14 +181,10 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode // are there too many deletions in the pileup? if ( ++deletionsInPile > maxDeletionsInPileup ) return null; - } else { - // add bad bases to the context (for DoC calculations), but don't count them - int baseIndex = BaseUtils.simpleBaseToBaseIndex(read.getReadString().charAt(offset)); - if ( baseIndex != -1 ) - baseCounts[baseIndex]++; } // add the read to this sample's context + // note that bad bases are added to the context (for DoC calculations later) myContext.add(read, offset); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java index fe3dee8c2..f92ef57e2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -19,7 +19,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation if ( samples.size() == 1 ) { // split the context (so we can get forward/reverse contexts for free) - HashMap contexts = splitContextBySample(context, new int[4]); + HashMap contexts = splitContextBySample(context); if ( contexts == null ) return null; @@ -53,16 +53,13 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation return new Pair(pileup, GL); } - protected double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts) { - // An intelligent guess would be the observed base ratios - // 
(plus some small number to account for sampling issues). - int totalObservedBases = 0; - for (int i = 0; i < 4; i++) - totalObservedBases += baseCounts[i]; + protected double[] initializeAlleleFrequencies(int numSamplesInContext, char ref) { + // Initialization values from Jared's original MSG code + double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1-heterozygosity)) + double REF = 0.9994999; // sqrt(1-heterozygosity) - double[] alleleFrequencies = new double[4]; - for (int i = 0; i < 4; i++) - alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases; + double[] alleleFrequencies = {NON_REF, NON_REF, NON_REF, NON_REF}; + alleleFrequencies[BaseUtils.simpleBaseToBaseIndex(ref)] = REF; return alleleFrequencies; }