- push allele frequency and genotype likelihood variable definitions down into the subclasses so that they can use different data structures
- use slightly more stringent stability metric
- better integration test

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1869 348d0f76-0448-11de-a6fe-93d51630548a
parent d6385e0d88
commit b9e8867287
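The refactoring below follows a template-method shape: the abstract base class drives the EM loop through void hooks, and each subclass keeps its own allele-frequency state in whatever structure suits it (a 4-entry per-base array in the point-estimate model, a 2N-entry MAF array in AllMAFs). A minimal standalone sketch of that pattern, using hypothetical names rather than the actual GATK classes:

// Sketch only: the base class owns the loop, subclasses own the data.
abstract class EmModelSketch {
    protected static final int MAX_EM_ITERATIONS = 8;

    public void runEM(int numSamples) {
        initializeAlleleFrequencies(numSamples);   // subclass fills its own field
        int iterations = 0;
        boolean stable;
        do {
            calculateAlleleFrequencyPosteriors();  // E-step against subclass state
            stable = isStable();                   // subclass compares old vs. new state
        } while ( ++iterations < MAX_EM_ITERATIONS && !stable );
    }

    protected abstract void initializeAlleleFrequencies(int numSamples);
    protected abstract void calculateAlleleFrequencyPosteriors();
    protected abstract boolean isStable();
}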
@@ -8,30 +8,62 @@ import java.util.*;

 public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel {

+    // the allele frequencies
+    private double[] alleleFrequencies;
+
     protected AllMAFsGenotypeCalculationModel() {}

-    protected double[] initializeAlleleFrequencies(int numSamples, char ref) {
-        double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles
-        return alleleFrequencies;
+    protected void initializeAlleleFrequencies(int numSamples, char ref) {
+        // we have 2N possible allele frequencies in pileup
+        int possibleMAFs = 2 * numSamples;
+        alleleFrequencies = new double[possibleMAFs];
+
+        // calculate sum(1/i) for i from 1 to 2N
+        double denominator = 0.0;
+        for (int i = 1; i <= possibleMAFs; i++)
+            denominator += 1.0 / (double)i;
+
+        // set up delta
+        double delta = 1.0 / denominator;
+
+        // calculate the null allele frequencies
+        for (int i = 1; i <= possibleMAFs; i++)
+            alleleFrequencies[i-1] = Math.log10(delta / (double)i);
+
+        for (int i = 0; i < possibleMAFs; i++)
+            logger.debug("Initial allele frequency for MAF=" + (i+1) + ": " + alleleFrequencies[i]);
     }

-    protected HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) {
-        return null;
+    protected void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
+        HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
+
+        for ( String sample : contexts.keySet() ) {
+            AlignmentContextBySample context = contexts.get(sample);
+            ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
+
+            // create the GenotypeLikelihoods object
+            GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
+            GL.setVerbose(VERBOSE);
+            GL.add(pileup, true);
+
+            GLs.put(sample, GL);
+        }
     }

-    protected double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs) {
-        return null;
+    protected void calculateAlleleFrequencyPosteriors() {
     }

-    protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies) {
+    protected void applyAlleleFrequencyToGenotypeLikelihoods() {
     }

-    protected EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext) {
+    protected boolean isStable() {
+        return true;
+    }
+
+    protected EMOutput computePofF(char ref) {
         return null;
     }
 }
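The new initializeAlleleFrequencies above encodes the standard neutral 1/i frequency-spectrum prior, P(MAF = i) = delta / i over the 2N possible minor-allele counts, with delta = 1 / sum(1/i) so the probabilities sum to 1. A self-contained sketch working out the numbers for two samples (2N = 4):

// Standalone demo of the MAF prior; values are worked out for 2N = 4.
public class MafPriorDemo {
    public static void main(String[] args) {
        int possibleMAFs = 4;                    // 2N chromosomes for 2 samples

        double denominator = 0.0;                // 1 + 1/2 + 1/3 + 1/4 = 25/12
        for (int i = 1; i <= possibleMAFs; i++)
            denominator += 1.0 / (double)i;

        double delta = 1.0 / denominator;        // 12/25 = 0.48

        double sum = 0.0;
        for (int i = 1; i <= possibleMAFs; i++) {
            double p = delta / (double)i;        // 0.48, 0.24, 0.16, 0.12
            sum += p;
            System.out.printf("P(MAF=%d) = %.2f, log10 = %.4f%n", i, p, Math.log10(p));
        }
        System.out.println("total = " + sum);    // 1.0, so the prior is properly normalized
    }
}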
@@ -13,8 +13,8 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
     // We need to set a limit on the EM iterations in case something flukey goes on
     protected static final int MAX_EM_ITERATIONS = 8;

-    // We consider the EM stable when the MAF doesn't change more than 1/1000
-    protected static final double EM_STABILITY_METRIC = 1e-3;
+    // We consider the EM stable when the MAF doesn't change more than 1/10,000
+    protected static final double EM_STABILITY_METRIC = 1e-4;


     protected EMGenotypeCalculationModel() {}
@@ -75,13 +75,11 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode

     public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {

-        // get initial allele frequencies
-        double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), ref);
-        for (int i = 0; i < alleleFrequencies.length; i++)
-            logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]);
+        // initialize the allele frequencies
+        initializeAlleleFrequencies(contexts.size(), ref);

-        // get the initial genotype likelihoods
-        HashMap<String, GenotypeLikelihoods> GLs = initializeGenotypeLikelihoods(ref, contexts, alleleFrequencies, priors, contextType);
+        // initialize the genotype likelihoods
+        initializeGenotypeLikelihoods(ref, contexts, priors, contextType);

         // The EM loop:
         // we want to continue until the calculation is stable, but we need some max on the number of iterations
@@ -89,59 +87,25 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
         boolean EM_IS_STABLE;

         do {
-            double[] newAlleleFrequencies = calculateAlleleFrequencyPosteriors(GLs);
-            for (int i = 0; i < alleleFrequencies.length; i++)
-                logger.debug("New allele frequency for i=" + i + ": " + newAlleleFrequencies[i]);
+            calculateAlleleFrequencyPosteriors();

-            applyAlleleFrequencyToGenotypeLikelihoods(GLs, newAlleleFrequencies);
+            applyAlleleFrequencyToGenotypeLikelihoods();

-            EM_IS_STABLE = isStable(alleleFrequencies, newAlleleFrequencies, contexts.size());
-
-            alleleFrequencies = newAlleleFrequencies;
+            EM_IS_STABLE = isStable();

         } while ( ++iterations < MAX_EM_ITERATIONS && !EM_IS_STABLE );

         logger.debug("EM loop took " + iterations + " iterations");
-        for ( String sample : GLs.keySet() )
-            logger.debug("GenotypeLikelihoods for sample " + sample + ": " + GLs.get(sample).toString());

-        return computePofF(ref, GLs, alleleFrequencies, contexts.size());
+        return computePofF(ref);
     }

-    protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, char ref);
-    protected abstract HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType);
-    protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs);
-    protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies);
-    protected abstract EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext);
-
-
-    protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) {
-        // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC
-        double AF_delta = 0.0;
-        for (int i = 0; i < oldAlleleFrequencies.length; i++)
-            AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]);
-
-        return (AF_delta < EM_STABILITY_METRIC);
-    }
-
-    protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) {
-        // convert to log-space
-        double[] log10Freqs = new double[4];
-        for (int i = 0; i < 4; i++)
-            log10Freqs[i] = Math.log10(alleleFrequencies[i]);
-
-        double[] alleleFreqPriors = new double[10];
-
-        // this is the Hardy-Weinberg based allele frequency (p^2, q^2, 2pq)
-        for ( DiploidGenotype g : DiploidGenotype.values() ) {
-            alleleFreqPriors[g.ordinal()] = log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base1)] + log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base2)];
-            // add a factor of 2 for the 2pq case
-            if ( g.isHet() )
-                alleleFreqPriors[g.ordinal()] += Math.log10(2);
-        }
-
-        return new DiploidGenotypePriors(alleleFreqPriors);
-    }
+    protected abstract void initializeAlleleFrequencies(int numSamplesInContext, char ref);
+    protected abstract void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType);
+    protected abstract void calculateAlleleFrequencyPosteriors();
+    protected abstract void applyAlleleFrequencyToGenotypeLikelihoods();
+    protected abstract boolean isStable();
+    protected abstract EMOutput computePofF(char ref);

     /**
      * Create the mapping from sample to alignment context; also, fill in the base counts along the way.
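isStable() is now abstract here and implemented per subclass, but the criterion keeps the same form: the L1 distance between successive allele-frequency vectors must fall below EM_STABILITY_METRIC, tightened in this commit from 1e-3 to 1e-4. A small sketch with made-up frequencies showing the effect of the stricter threshold:

// Sketch of the convergence test; the frequency vectors here are invented.
public class StabilityDemo {
    static final double EM_STABILITY_METRIC = 1e-4;

    static boolean isStable(double[] oldAF, double[] newAF) {
        double delta = 0.0;
        for (int i = 0; i < oldAF.length; i++)
            delta += Math.abs(oldAF[i] - newAF[i]);
        return delta < EM_STABILITY_METRIC;
    }

    public static void main(String[] args) {
        double[] prev = {0.97,    0.01,    0.01,    0.01};
        double[] next = {0.96994, 0.01002, 0.01002, 0.01002};
        // total change = 6e-5 + 3 * 2e-5 = 1.2e-4; under the old 1e-3 threshold
        // this would already count as stable, under 1e-4 it iterates once more
        System.out.println(isStable(prev, next)); // false
    }
}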
@@ -12,6 +12,18 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation

     protected PointEstimateGenotypeCalculationModel() {}

+    // the allele frequencies
+    private double[] alleleFrequencies = new double[4];
+    private double[] oldAlleleFrequencies;
+
+    // the GenotypeLikelihoods map
+    private HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
+
+    // Allele frequency initialization values from the original MSG code (so we can be consistent)
+    private static final double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1 - heterozygosity))
+    private static final double REF = 0.9994999; // sqrt(1 - heterozygosity)
+
+
     // overload this method so we can special-case the single sample
     public Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
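The REF and NON_REF constants are consistent with a heterozygosity of 1e-3 (our inference from the values; the code only attributes them to the original MSG code). A quick check, which also shows that the Hardy-Weinberg het term 2 * REF * NON_REF recovers the heterozygosity exactly:

// Verifies the constants above under the assumed heterozygosity of 1e-3.
public class MsgConstantsDemo {
    public static void main(String[] args) {
        double heterozygosity = 1e-3;                 // assumption, see lead-in
        double ref = Math.sqrt(1.0 - heterozygosity); // 0.9994999...
        double nonRef = heterozygosity / (2.0 * ref); // 0.0005002502...

        System.out.println(ref);                      // matches REF
        System.out.println(nonRef);                   // matches NON_REF
        System.out.println(2.0 * ref * nonRef);       // 0.001, the heterozygosity
    }
}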
@@ -53,18 +65,17 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
         return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL);
     }

-    protected double[] initializeAlleleFrequencies(int numSamplesInContext, char ref) {
-        // Initialization values from Jared's original MSG code
-        double NON_REF = 0.0005002502; // heterozygosity / (2 * sqrt(1 - heterozygosity))
-        double REF = 0.9994999; // sqrt(1 - heterozygosity)
-
-        double[] alleleFrequencies = {NON_REF, NON_REF, NON_REF, NON_REF};
+    protected void initializeAlleleFrequencies(int numSamplesInContext, char ref) {
+        for (int i = 0; i < 4; i++)
+            alleleFrequencies[i] = NON_REF;
         alleleFrequencies[BaseUtils.simpleBaseToBaseIndex(ref)] = REF;
-        return alleleFrequencies;
+
+        for (int i = 0; i < 4; i++)
+            logger.debug("Initial allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i]);
     }

-    protected HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) {
-        HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
+    protected void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
+        GLs.clear();

         DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies);

@@ -79,12 +90,32 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation

             GLs.put(sample, GL);
         }
-
-        return GLs;
     }

-    protected double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs) {
-        double[] newAlleleLikelihoods = new double[4];
+    private static DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFreqs) {
+        // convert to log-space
+        double[] log10Freqs = new double[4];
+        for (int i = 0; i < 4; i++)
+            log10Freqs[i] = Math.log10(alleleFreqs[i]);
+
+        double[] alleleFreqPriors = new double[10];
+
+        // this is the Hardy-Weinberg based allele frequency (p^2, q^2, 2pq)
+        for ( DiploidGenotype g : DiploidGenotype.values() ) {
+            alleleFreqPriors[g.ordinal()] = log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base1)] + log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base2)];
+            // add a factor of 2 for the 2pq case
+            if ( g.isHet() )
+                alleleFreqPriors[g.ordinal()] += Math.log10(2);
+        }
+
+        return new DiploidGenotypePriors(alleleFreqPriors);
+    }
+
+    protected void calculateAlleleFrequencyPosteriors() {
+        // initialization
+        oldAlleleFrequencies = alleleFrequencies.clone();
+        for (int i = 0; i < 4; i++)
+            alleleFrequencies[i] = 0.0;

         for ( GenotypeLikelihoods GL : GLs.values() ) {
             double[] normalizedPosteriors = GL.getNormalizedPosteriors();
@@ -98,26 +129,39 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
             }

             for (int i = 0; i < 4; i++)
-                newAlleleLikelihoods[i] += personalAllelePosteriors[i];
+                alleleFrequencies[i] += personalAllelePosteriors[i];
         }

         // normalize
         double sum = 0;
         for (int i = 0; i < 4; i++)
-            sum += newAlleleLikelihoods[i];
+            sum += alleleFrequencies[i];
         for (int i = 0; i < 4; i++)
-            newAlleleLikelihoods[i] /= sum;
+            alleleFrequencies[i] /= sum;

-        return newAlleleLikelihoods;
+        for (int i = 0; i < 4; i++)
+            logger.debug("New allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i]);
     }

-    protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies) {
+    protected void applyAlleleFrequencyToGenotypeLikelihoods() {
         DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies);
         for ( GenotypeLikelihoods GL : GLs.values() )
             GL.setPriors(AFPriors);
     }

-    protected EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext) {
+    protected boolean isStable() {
+        // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC
+        double AF_delta = 0.0;
+        for (int i = 0; i < 4; i++)
+            AF_delta += Math.abs(oldAlleleFrequencies[i] - alleleFrequencies[i]);
+
+        return (AF_delta < EM_STABILITY_METRIC);
+    }
+
+    protected EMOutput computePofF(char ref) {
+        // some debugging output
+        for ( String sample : GLs.keySet() )
+            logger.debug("GenotypeLikelihoods for sample " + sample + ": " + GLs.get(sample).toString());
+
         // compute pD and pNull without allele frequencies
         double pD = compute_pD(GLs);
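calculateAlleleFreqBasedPriors, moved into this class above, builds log10 Hardy-Weinberg genotype priors: log10(f_a) + log10(f_b) for genotype ab, plus log10(2) when a != b (the 2pq term). A standalone sketch of the three distinct values it produces under the initial REF/NON_REF frequencies:

// Demo of the Hardy-Weinberg prior arithmetic, using the constants from above.
public class HwPriorDemo {
    public static void main(String[] args) {
        double ref = 0.9994999;       // REF
        double nonRef = 0.0005002502; // NON_REF

        double homRef = 2 * Math.log10(ref);                               // p^2:  ~ -0.0004
        double het    = Math.log10(2) + Math.log10(ref) + Math.log10(nonRef); // 2pq: -3.0 exactly, i.e. the heterozygosity
        double homVar = 2 * Math.log10(nonRef);                            // q^2:  ~ -6.6

        System.out.println("log10 P(hom-ref) = " + homRef);
        System.out.println("log10 P(het)     = " + het);
        System.out.println("log10 P(hom-var) = " + homVar);
    }
}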
@@ -126,7 +170,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation

         // compute p0
         double pVar = 0.0;
-        for (int i = 1; i < numSamplesInContext; i++)
+        for (int i = 1; i < GLs.size(); i++)
             pVar += heterozygosity/(double)i;
         double p0 = Math.log10(1.0 - pVar);

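The p0 term above is the log10 prior that the site is non-variant: under the 1/i spectrum the probability of seeing a variant is approximated by summing heterozygosity/i over the sample count, and p0 = log10(1 - pVar). A short sketch of the numbers, again assuming heterozygosity = 1e-3 and mirroring the loop bounds of the code:

// Demo of the pVar / p0 computation for a hypothetical three-sample context.
public class PVarDemo {
    public static void main(String[] args) {
        double heterozygosity = 1e-3; // assumption, consistent with the constants above
        int numSamples = 3;           // stands in for GLs.size()

        double pVar = 0.0;
        for (int i = 1; i < numSamples; i++)
            pVar += heterozygosity / (double)i;  // 0.001 + 0.0005 = 0.0015

        double p0 = Math.log10(1.0 - pVar);      // log10(0.9985) ~ -6.52e-4
        System.out.println(pVar + " " + p0);
    }
}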
@@ -140,7 +184,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation

         // compute pF
         double pF;
-        double expectedChromosomes = 2.0 * (double)numSamplesInContext * MAF;
+        double expectedChromosomes = 2.0 * (double)GLs.size() * MAF;
         if ( expectedChromosomes < 1.0 )
             pF = p0;
         else
@@ -154,7 +198,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
         return new EMOutput(pD, pNull, pF, MAF, GLs);
     }

-    private double compute_pD(HashMap<String, GenotypeLikelihoods> GLs) {
+    private static double compute_pD(HashMap<String, GenotypeLikelihoods> GLs) {
         double pD = 0.0;
         for ( GenotypeLikelihoods GL : GLs.values() ) {
             double sum = 0.0;
@@ -166,7 +210,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
         return pD;
     }

-    private double compute_pNull(char ref, HashMap<String, GenotypeLikelihoods> GLs) {
+    private static double compute_pNull(char ref, HashMap<String, GenotypeLikelihoods> GLs) {
         // compute null likelihoods
         double[] alleleLikelihoods = new double[4];
         for (int i = 0; i < 4; i++)
@@ -43,15 +43,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
     public void testMultiSamplePilot1() {
         WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                 "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,000,000-10,001,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1,
-                Arrays.asList("4b8d13283157735957d60cdfa492ab90"));
+                Arrays.asList("2da83b62c35cf32211ef475fd89c252e"));
         executeTest("testMultiSamplePilot1", spec);
     }

     @Test
     public void testMultiSamplePilot2() {
         WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
-                "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1,
-                Arrays.asList("d41d8cd98f00b204e9800998ecf8427e"));
+                "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1,
+                Arrays.asList("8cde0c2aa4d9ad51c603db4b64f3933f"));
         executeTest("testMultiSamplePilot2", spec);
     }

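One detail worth noting in the pilot2 change: the old expected checksum d41d8cd98f00b204e9800998ecf8427e is the MD5 of empty output, because the interval -L 1:10,000,000-10,010,000 never overlapped the chromosome-20 BAM (pilot2_daughters.chr20.10k-11k.bam), so the test was passing vacuously. Pointing -L at 20:10,000,000-10,010,000 makes the test actually exercise the caller, which is presumably the "better integration test" the commit message refers to.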