diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java index 5ee1962e1..784b49dea 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java @@ -8,30 +8,62 @@ import java.util.*; public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel { - protected AllMAFsGenotypeCalculationModel() {} - protected double[] initializeAlleleFrequencies(int numSamples, char ref) { - double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles + private double[] alleleFrequencies; - return alleleFrequencies; + protected void initializeAlleleFrequencies(int numSamples, char ref) { + // we have 2N possible allele frequencies in pileup + int possibleMAFs = 2 * numSamples; + alleleFrequencies = new double[possibleMAFs]; + + // calculate sum(1/i) for i from 1 to 2N + double denominator = 0.0; + for (int i = 1; i <= possibleMAFs; i++) + denominator += 1.0 / (double)i; + + // set up delta + double delta = 1.0 / denominator; + + // calculate the null allele frequencies + for (int i = 1; i <= possibleMAFs; i++) + alleleFrequencies[i-1] = Math.log10(delta / (double)i); + + for (int i = 0; i < possibleMAFs; i++) + logger.debug("Initial allele frequency for MAF=" + (i+1) + ": " + alleleFrequencies[i]); } - protected HashMap initializeGenotypeLikelihoods(char ref, HashMap contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) { - return null; + protected void initializeGenotypeLikelihoods(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType) { + HashMap GLs = new HashMap(); + + for ( String sample : contexts.keySet() ) { + AlignmentContextBySample context = contexts.get(sample); + ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType)); + + // create the GenotypeLikelihoods object + GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform); + GL.setVerbose(VERBOSE); + GL.add(pileup, true); + + GLs.put(sample, GL); + } } - protected double[] calculateAlleleFrequencyPosteriors(HashMap GLs) { - return null; - } - - protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap GLs, double[] alleleFrequencies) { + protected void calculateAlleleFrequencyPosteriors() { } - protected EMOutput computePofF(char ref, HashMap GLs, double[] alleleFrequencies, int numSamplesInContext) { + protected void applyAlleleFrequencyToGenotypeLikelihoods() { + + } + + protected boolean isStable() { + return true; + } + + protected EMOutput computePofF(char ref) { return null; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index d036425ef..b65636b02 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -13,8 +13,8 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode // We need to set a limit on the EM iterations in case something flukey goes on protected static final int MAX_EM_ITERATIONS = 8; - // We consider the EM stable when the MAF doesn't change more than 1/1000 - protected static final double EM_STABILITY_METRIC = 1e-3; + // We consider the EM stable when the MAF doesn't change more than 1/10,000 + protected static final double EM_STABILITY_METRIC = 1e-4; protected EMGenotypeCalculationModel() {} @@ -75,13 +75,11 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode public EMOutput runEM(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType) { - // get initial allele frequencies - double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), ref); - for (int i = 0; i < alleleFrequencies.length; i++) - logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]); + // initialize the allele frequencies + initializeAlleleFrequencies(contexts.size(), ref); - // get the initial genotype likelihoods - HashMap GLs = initializeGenotypeLikelihoods(ref, contexts, alleleFrequencies, priors, contextType); + // initialize the genotype likelihoods + initializeGenotypeLikelihoods(ref, contexts, priors, contextType); // The EM loop: // we want to continue until the calculation is stable, but we need some max on the number of iterations @@ -89,59 +87,25 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode boolean EM_IS_STABLE; do { - double[] newAlleleFrequencies = calculateAlleleFrequencyPosteriors(GLs); - for (int i = 0; i < alleleFrequencies.length; i++) - logger.debug("New allele frequency for i=" + i + ": " + newAlleleFrequencies[i]); + calculateAlleleFrequencyPosteriors(); - applyAlleleFrequencyToGenotypeLikelihoods(GLs, newAlleleFrequencies); + applyAlleleFrequencyToGenotypeLikelihoods(); - EM_IS_STABLE = isStable(alleleFrequencies, newAlleleFrequencies, contexts.size()); - - alleleFrequencies = newAlleleFrequencies; + EM_IS_STABLE = isStable(); } while ( ++iterations < MAX_EM_ITERATIONS && !EM_IS_STABLE ); logger.debug("EM loop took " + iterations + " iterations"); - for ( String sample : GLs.keySet() ) - logger.debug("GenotypeLikelihoods for sample " + sample + ": " + GLs.get(sample).toString()); - return computePofF(ref, GLs, alleleFrequencies, contexts.size()); + return computePofF(ref); } - protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, char ref); - protected abstract HashMap initializeGenotypeLikelihoods(char ref, HashMap contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType); - protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap GLs); - protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap GLs, double[] alleleFrequencies); - protected abstract EMOutput computePofF(char ref, HashMap GLs, double[] alleleFrequencies, int numSamplesInContext); - - - protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) { - // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC - double AF_delta = 0.0; - for (int i = 0; i < oldAlleleFrequencies.length; i++) - AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]); - - return (AF_delta < EM_STABILITY_METRIC); - } - - protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) { - // convert to log-space - double[] log10Freqs = new double[4]; - for (int i = 0; i < 4; i++) - log10Freqs[i] = Math.log10(alleleFrequencies[i]); - - double[] alleleFreqPriors = new double[10]; - - // this is the Hardy-Weinberg based allele frequency (p^2, q^2, 2pq) - for ( DiploidGenotype g : DiploidGenotype.values() ) { - alleleFreqPriors[g.ordinal()] = log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base1)] + log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base2)]; - // add a factor of 2 for the 2pq case - if ( g.isHet() ) - alleleFreqPriors[g.ordinal()] += Math.log10(2); - } - - return new DiploidGenotypePriors(alleleFreqPriors); - } + protected abstract void initializeAlleleFrequencies(int numSamplesInContext, char ref); + protected abstract void initializeGenotypeLikelihoods(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType); + protected abstract void calculateAlleleFrequencyPosteriors(); + protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(); + protected abstract boolean isStable(); + protected abstract EMOutput computePofF(char ref); /** * Create the mapping from sample to alignment context; also, fill in the base counts along the way. diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java index f92ef57e2..ece842be8 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -12,6 +12,18 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation protected PointEstimateGenotypeCalculationModel() {} + // the allele frequencies + private double[] alleleFrequencies = new double[4]; + private double[] oldAlleleFrequencies; + + // the GenotypeLikelihoods map + private HashMap GLs = new HashMap(); + + // Allele frequency initialization values from the original MSG code (so we can be consistent) + private static final double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1-heterozygosity) + private static final double REF = 0.9994999; //sqrt(1-heterozygosity) + + // overload this method so we can special-case the single sample public Pair, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { @@ -53,18 +65,17 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation return new Pair(pileup, GL); } - protected double[] initializeAlleleFrequencies(int numSamplesInContext, char ref) { - // Initialization values from Jared's original MSG code - double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1-heterozygosity) - double REF = 0.9994999; //sqrt(1-heterozygosity) - - double[] alleleFrequencies = {NON_REF, NON_REF, NON_REF, NON_REF}; + protected void initializeAlleleFrequencies(int numSamplesInContext, char ref) { + for (int i = 0; i < 4; i++) + alleleFrequencies[i] = NON_REF; alleleFrequencies[BaseUtils.simpleBaseToBaseIndex(ref)] = REF; - return alleleFrequencies; + + for (int i = 0; i < 4; i++) + logger.debug("Initial allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i]); } - protected HashMap initializeGenotypeLikelihoods(char ref, HashMap contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) { - HashMap GLs = new HashMap(); + protected void initializeGenotypeLikelihoods(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType) { + GLs.clear(); DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies); @@ -79,12 +90,32 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation GLs.put(sample, GL); } - - return GLs; } - protected double[] calculateAlleleFrequencyPosteriors(HashMap GLs) { - double[] newAlleleLikelihoods = new double[4]; + private static DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFreqs) { + // convert to log-space + double[] log10Freqs = new double[4]; + for (int i = 0; i < 4; i++) + log10Freqs[i] = Math.log10(alleleFreqs[i]); + + double[] alleleFreqPriors = new double[10]; + + // this is the Hardy-Weinberg based allele frequency (p^2, q^2, 2pq) + for ( DiploidGenotype g : DiploidGenotype.values() ) { + alleleFreqPriors[g.ordinal()] = log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base1)] + log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base2)]; + // add a factor of 2 for the 2pq case + if ( g.isHet() ) + alleleFreqPriors[g.ordinal()] += Math.log10(2); + } + + return new DiploidGenotypePriors(alleleFreqPriors); + } + + protected void calculateAlleleFrequencyPosteriors() { + // initialization + oldAlleleFrequencies = alleleFrequencies.clone(); + for (int i = 0; i < 4; i++) + alleleFrequencies[i] = 0.0; for ( GenotypeLikelihoods GL : GLs.values() ) { double[] normalizedPosteriors = GL.getNormalizedPosteriors(); @@ -98,26 +129,39 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation } for (int i = 0; i < 4; i++) - newAlleleLikelihoods[i] += personalAllelePosteriors[i]; + alleleFrequencies[i] += personalAllelePosteriors[i]; } // normalize double sum = 0; for (int i = 0; i < 4; i++) - sum += newAlleleLikelihoods[i]; + sum += alleleFrequencies[i]; for (int i = 0; i < 4; i++) - newAlleleLikelihoods[i] /= sum; + alleleFrequencies[i] /= sum; - return newAlleleLikelihoods; + for (int i = 0; i < 4; i++) + logger.debug("New allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i]); } - protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap GLs, double[] alleleFrequencies) { + protected void applyAlleleFrequencyToGenotypeLikelihoods() { DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies); for ( GenotypeLikelihoods GL : GLs.values() ) GL.setPriors(AFPriors); } - protected EMOutput computePofF(char ref, HashMap GLs, double[] alleleFrequencies, int numSamplesInContext) { + protected boolean isStable() { + // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC + double AF_delta = 0.0; + for (int i = 0; i < 4; i++) + AF_delta += Math.abs(oldAlleleFrequencies[i] - alleleFrequencies[i]); + + return (AF_delta < EM_STABILITY_METRIC); + } + + protected EMOutput computePofF(char ref) { + // some debugging output + for ( String sample : GLs.keySet() ) + logger.debug("GenotypeLikelihoods for sample " + sample + ": " + GLs.get(sample).toString()); // compute pD and pNull without allele frequencies double pD = compute_pD(GLs); @@ -126,7 +170,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation // compute p0 double pVar = 0.0; - for (int i = 1; i < numSamplesInContext; i++) + for (int i = 1; i < GLs.size(); i++) pVar += heterozygosity/(double)i; double p0 = Math.log10(1.0 - pVar); @@ -140,7 +184,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation // compute pF double pF; - double expectedChromosomes = 2.0 * (double)numSamplesInContext * MAF; + double expectedChromosomes = 2.0 * (double)GLs.size() * MAF; if ( expectedChromosomes < 1.0 ) pF = p0; else @@ -154,7 +198,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation return new EMOutput(pD, pNull, pF, MAF, GLs); } - private double compute_pD(HashMap GLs) { + private static double compute_pD(HashMap GLs) { double pD = 0.0; for ( GenotypeLikelihoods GL : GLs.values() ) { double sum = 0.0; @@ -166,7 +210,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation return pD; } - private double compute_pNull(char ref, HashMap GLs) { + private static double compute_pNull(char ref, HashMap GLs) { // compute null likelihoods double[] alleleLikelihoods = new double[4]; for (int i = 0; i < 4; i++) diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index f6263b688..bdfbbebee 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -43,15 +43,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,000,000-10,001,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1, - Arrays.asList("4b8d13283157735957d60cdfa492ab90")); + Arrays.asList("2da83b62c35cf32211ef475fd89c252e")); executeTest("testMultiSamplePilot1", spec); } @Test public void testMultiSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1, - Arrays.asList("d41d8cd98f00b204e9800998ecf8427e")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1, + Arrays.asList("8cde0c2aa4d9ad51c603db4b64f3933f")); executeTest("testMultiSamplePilot2", spec); }