-push allele frequency and genotype likelihood variable definitions down into the subclasses so that they can use different data structures

-use slightly more stringent stability metric
-better integration test



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1869 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-10-18 04:22:17 +00:00
parent d6385e0d88
commit b9e8867287
4 changed files with 130 additions and 90 deletions

View File

@@ -8,30 +8,62 @@ import java.util.*;
public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel {
protected AllMAFsGenotypeCalculationModel() {}
protected double[] initializeAlleleFrequencies(int numSamples, char ref) {
double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles
private double[] alleleFrequencies;
return alleleFrequencies;
protected void initializeAlleleFrequencies(int numSamples, char ref) {
// we have 2N possible allele frequencies in pileup
int possibleMAFs = 2 * numSamples;
alleleFrequencies = new double[possibleMAFs];
// calculate sum(1/i) for i from 1 to 2N
double denominator = 0.0;
for (int i = 1; i <= possibleMAFs; i++)
denominator += 1.0 / (double)i;
// set up delta
double delta = 1.0 / denominator;
// calculate the null allele frequencies
for (int i = 1; i <= possibleMAFs; i++)
alleleFrequencies[i-1] = Math.log10(delta / (double)i);
for (int i = 0; i < possibleMAFs; i++)
logger.debug("Initial allele frequency for MAF=" + (i+1) + ": " + alleleFrequencies[i]);
}
protected HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) {
return null;
protected void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
for ( String sample : contexts.keySet() ) {
AlignmentContextBySample context = contexts.get(sample);
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
// create the GenotypeLikelihoods object
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
GL.setVerbose(VERBOSE);
GL.add(pileup, true);
GLs.put(sample, GL);
}
}
protected double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs) {
return null;
}
protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies) {
protected void calculateAlleleFrequencyPosteriors() {
}
protected EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext) {
protected void applyAlleleFrequencyToGenotypeLikelihoods() {
}
protected boolean isStable() {
return true;
}
protected EMOutput computePofF(char ref) {
return null;
}
}

View File

@@ -13,8 +13,8 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
// We need to set a limit on the EM iterations in case something flukey goes on
protected static final int MAX_EM_ITERATIONS = 8;
// We consider the EM stable when the MAF doesn't change more than 1/1000
protected static final double EM_STABILITY_METRIC = 1e-3;
// We consider the EM stable when the MAF doesn't change more than 1/10,000
protected static final double EM_STABILITY_METRIC = 1e-4;
protected EMGenotypeCalculationModel() {}
@@ -75,13 +75,11 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
// get initial allele frequencies
double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), ref);
for (int i = 0; i < alleleFrequencies.length; i++)
logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]);
// initialize the allele frequencies
initializeAlleleFrequencies(contexts.size(), ref);
// get the initial genotype likelihoods
HashMap<String, GenotypeLikelihoods> GLs = initializeGenotypeLikelihoods(ref, contexts, alleleFrequencies, priors, contextType);
// initialize the genotype likelihoods
initializeGenotypeLikelihoods(ref, contexts, priors, contextType);
// The EM loop:
// we want to continue until the calculation is stable, but we need some max on the number of iterations
@@ -89,59 +87,25 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
boolean EM_IS_STABLE;
do {
double[] newAlleleFrequencies = calculateAlleleFrequencyPosteriors(GLs);
for (int i = 0; i < alleleFrequencies.length; i++)
logger.debug("New allele frequency for i=" + i + ": " + newAlleleFrequencies[i]);
calculateAlleleFrequencyPosteriors();
applyAlleleFrequencyToGenotypeLikelihoods(GLs, newAlleleFrequencies);
applyAlleleFrequencyToGenotypeLikelihoods();
EM_IS_STABLE = isStable(alleleFrequencies, newAlleleFrequencies, contexts.size());
alleleFrequencies = newAlleleFrequencies;
EM_IS_STABLE = isStable();
} while ( ++iterations < MAX_EM_ITERATIONS && !EM_IS_STABLE );
logger.debug("EM loop took " + iterations + " iterations");
for ( String sample : GLs.keySet() )
logger.debug("GenotypeLikelihoods for sample " + sample + ": " + GLs.get(sample).toString());
return computePofF(ref, GLs, alleleFrequencies, contexts.size());
return computePofF(ref);
}
protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, char ref);
protected abstract HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType);
protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs);
protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies);
protected abstract EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext);
protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) {
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC
double AF_delta = 0.0;
for (int i = 0; i < oldAlleleFrequencies.length; i++)
AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]);
return (AF_delta < EM_STABILITY_METRIC);
}
protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) {
// convert to log-space
double[] log10Freqs = new double[4];
for (int i = 0; i < 4; i++)
log10Freqs[i] = Math.log10(alleleFrequencies[i]);
double[] alleleFreqPriors = new double[10];
// this is the Hardy-Weinberg based allele frequency (p^2, q^2, 2pq)
for ( DiploidGenotype g : DiploidGenotype.values() ) {
alleleFreqPriors[g.ordinal()] = log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base1)] + log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base2)];
// add a factor of 2 for the 2pq case
if ( g.isHet() )
alleleFreqPriors[g.ordinal()] += Math.log10(2);
}
return new DiploidGenotypePriors(alleleFreqPriors);
}
protected abstract void initializeAlleleFrequencies(int numSamplesInContext, char ref);
protected abstract void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType);
protected abstract void calculateAlleleFrequencyPosteriors();
protected abstract void applyAlleleFrequencyToGenotypeLikelihoods();
protected abstract boolean isStable();
protected abstract EMOutput computePofF(char ref);
/**
* Create the mapping from sample to alignment context; also, fill in the base counts along the way.

View File

@@ -12,6 +12,18 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
protected PointEstimateGenotypeCalculationModel() {}
// the allele frequencies
private double[] alleleFrequencies = new double[4];
private double[] oldAlleleFrequencies;
// the GenotypeLikelihoods map
private HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
// Allele frequency initialization values from the original MSG code (so we can be consistent)
private static final double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1-heterozygosity))
private static final double REF = 0.9994999; //sqrt(1-heterozygosity)
// overload this method so we can special-case the single sample
public Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
@@ -53,18 +65,17 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL);
}
protected double[] initializeAlleleFrequencies(int numSamplesInContext, char ref) {
// Initialization values from Jared's original MSG code
double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1-heterozygosity))
double REF = 0.9994999; //sqrt(1-heterozygosity)
double[] alleleFrequencies = {NON_REF, NON_REF, NON_REF, NON_REF};
protected void initializeAlleleFrequencies(int numSamplesInContext, char ref) {
for (int i = 0; i < 4; i++)
alleleFrequencies[i] = NON_REF;
alleleFrequencies[BaseUtils.simpleBaseToBaseIndex(ref)] = REF;
return alleleFrequencies;
for (int i = 0; i < 4; i++)
logger.debug("Initial allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i]);
}
protected HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) {
HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
protected void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
GLs.clear();
DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies);
@@ -79,12 +90,32 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
GLs.put(sample, GL);
}
return GLs;
}
protected double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs) {
double[] newAlleleLikelihoods = new double[4];
private static DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFreqs) {
// convert to log-space
double[] log10Freqs = new double[4];
for (int i = 0; i < 4; i++)
log10Freqs[i] = Math.log10(alleleFreqs[i]);
double[] alleleFreqPriors = new double[10];
// this is the Hardy-Weinberg based allele frequency (p^2, q^2, 2pq)
for ( DiploidGenotype g : DiploidGenotype.values() ) {
alleleFreqPriors[g.ordinal()] = log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base1)] + log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base2)];
// add a factor of 2 for the 2pq case
if ( g.isHet() )
alleleFreqPriors[g.ordinal()] += Math.log10(2);
}
return new DiploidGenotypePriors(alleleFreqPriors);
}
protected void calculateAlleleFrequencyPosteriors() {
// initialization
oldAlleleFrequencies = alleleFrequencies.clone();
for (int i = 0; i < 4; i++)
alleleFrequencies[i] = 0.0;
for ( GenotypeLikelihoods GL : GLs.values() ) {
double[] normalizedPosteriors = GL.getNormalizedPosteriors();
@@ -98,26 +129,39 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
}
for (int i = 0; i < 4; i++)
newAlleleLikelihoods[i] += personalAllelePosteriors[i];
alleleFrequencies[i] += personalAllelePosteriors[i];
}
// normalize
double sum = 0;
for (int i = 0; i < 4; i++)
sum += newAlleleLikelihoods[i];
sum += alleleFrequencies[i];
for (int i = 0; i < 4; i++)
newAlleleLikelihoods[i] /= sum;
alleleFrequencies[i] /= sum;
return newAlleleLikelihoods;
for (int i = 0; i < 4; i++)
logger.debug("New allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i]);
}
protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies) {
protected void applyAlleleFrequencyToGenotypeLikelihoods() {
DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies);
for ( GenotypeLikelihoods GL : GLs.values() )
GL.setPriors(AFPriors);
}
protected EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext) {
protected boolean isStable() {
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC
double AF_delta = 0.0;
for (int i = 0; i < 4; i++)
AF_delta += Math.abs(oldAlleleFrequencies[i] - alleleFrequencies[i]);
return (AF_delta < EM_STABILITY_METRIC);
}
protected EMOutput computePofF(char ref) {
// some debugging output
for ( String sample : GLs.keySet() )
logger.debug("GenotypeLikelihoods for sample " + sample + ": " + GLs.get(sample).toString());
// compute pD and pNull without allele frequencies
double pD = compute_pD(GLs);
@@ -126,7 +170,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
// compute p0
double pVar = 0.0;
for (int i = 1; i < numSamplesInContext; i++)
for (int i = 1; i < GLs.size(); i++)
pVar += heterozygosity/(double)i;
double p0 = Math.log10(1.0 - pVar);
@@ -140,7 +184,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
// compute pF
double pF;
double expectedChromosomes = 2.0 * (double)numSamplesInContext * MAF;
double expectedChromosomes = 2.0 * (double)GLs.size() * MAF;
if ( expectedChromosomes < 1.0 )
pF = p0;
else
@@ -154,7 +198,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
return new EMOutput(pD, pNull, pF, MAF, GLs);
}
private double compute_pD(HashMap<String, GenotypeLikelihoods> GLs) {
private static double compute_pD(HashMap<String, GenotypeLikelihoods> GLs) {
double pD = 0.0;
for ( GenotypeLikelihoods GL : GLs.values() ) {
double sum = 0.0;
@@ -166,7 +210,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
return pD;
}
private static double compute_pNull(char ref, HashMap<String, GenotypeLikelihoods> GLs) {
// compute null likelihoods
double[] alleleLikelihoods = new double[4];
for (int i = 0; i < 4; i++)

View File

@@ -43,15 +43,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,000,000-10,001,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1,
Arrays.asList("4b8d13283157735957d60cdfa492ab90"));
Arrays.asList("2da83b62c35cf32211ef475fd89c252e"));
executeTest("testMultiSamplePilot1", spec);
}
@Test
public void testMultiSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1,
Arrays.asList("d41d8cd98f00b204e9800998ecf8427e"));
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1,
Arrays.asList("8cde0c2aa4d9ad51c603db4b64f3933f"));
executeTest("testMultiSamplePilot2", spec);
}