Refactored joint estimation model to allow subclasses to override the PofD calculation over all frequencies. Pooled model now takes only 20% of the time that it used to.

Added integration test for pooled model and updated other joint estimation tests to be more comprehensive now that they are faster.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2123 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-11-23 20:03:38 +00:00
parent 7f947f6b60
commit 70059a0fc9
4 changed files with 91 additions and 49 deletions

View File

@ -16,14 +16,6 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul
private enum GenotypeType { REF, HET, HOM }
protected HashMap<String, AlignmentContextBySample> createContexts(AlignmentContext context) {
return splitContextBySample(context);
}
protected int getNSamples(HashMap<String, AlignmentContextBySample> contexts) {
return contexts.size();
}
protected void initialize(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
// initialize the GenotypeLikelihoods
GLs.clear();
@ -63,17 +55,22 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul
}
}
protected double computeLog10PofDgivenAFi(char ref, char alt, double f, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
// *** note that this code assumes that allele frequencies are passed in IN ORDER from 0 to 2N
protected void calculatelog10PofDgivenAFforAllF(char ref, char alt, int numFrequencies, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
AlleleFrequencyMatrix matrix = AFMatrixMap.get(alt);
int baseIndex = BaseUtils.simpleBaseToBaseIndex(alt);
// for any frequency other than zero, calculate the next greedy entry
if ( MathUtils.compareDoubles(f, 0.0) != 0 )
// first, calculate for AF=0 (no change to matrix)
log10PofDgivenAFi[baseIndex][0] = matrix.getLikelihoodsOfFrequency();
// for each minor allele frequency, calculate log10PofDgivenAFi
for (int i = 1; i <= numFrequencies; i++) {
// add one more alternate allele
matrix.incrementFrequency();
return matrix.getLikelihoodsOfFrequency();
// calculate new likelihoods
log10PofDgivenAFi[baseIndex][i] = matrix.getLikelihoodsOfFrequency();
}
}
protected List<Genotype> makeGenotypeCalls(char ref, char alt, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {

View File

@ -28,15 +28,6 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
protected double[] PofFs = new double[BaseUtils.BASES.length];
// The abstract methods which need to be implemented:
// 1. How many samples are being evaluated?
protected abstract int getNSamples(HashMap<String, AlignmentContextBySample> contexts);
// 2. Create stratified contexts from the original alignment context
protected abstract HashMap<String, AlignmentContextBySample> createContexts(AlignmentContext context);
// 3. The likelihoods calculation underlying this whole model
protected abstract double computeLog10PofDgivenAFi(char ref, char alt, double f, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType);
protected JointEstimateGenotypeCalculationModel() {}
public Pair<List<Genotype>, GenotypeLocusData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
@ -62,6 +53,14 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
return createCalls(tracker, ref, contexts, context.getLocation(), frequencyEstimationPoints);
}
protected HashMap<String, AlignmentContextBySample> createContexts(AlignmentContext context) {
return splitContextBySample(context);
}
protected int getNSamples(HashMap<String, AlignmentContextBySample> contexts) {
return contexts.size();
}
protected void initializeVerboseWriter(PrintWriter verboseWriter) {
StringBuilder header = new StringBuilder("AFINFO\tLOC\tMAF\tF\tNullAFpriors\t");
for ( char altAllele : BaseUtils.BASES ) {
@ -128,16 +127,52 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
if ( altAllele == ref )
continue;
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
// for each minor allele frequency
for (int i = 0; i < frequencyEstimationPoints; i++) {
double f = (double)i / (double)(frequencyEstimationPoints-1);
log10PofDgivenAFi[baseIndex][i] += computeLog10PofDgivenAFi(ref, altAllele, f, contexts, contextType);
}
calculatelog10PofDgivenAFforAllF(ref, altAllele, frequencyEstimationPoints-1, contexts, contextType);
}
}
/********************************************************************************/
/*** One or both of the following methods should be overridden in subclasses ***/
/*** so that the correct calculation is made for PofDgivenAFi                ***/
/********************************************************************************/
/**
* @param ref the ref base
* @param alt the alt base
* @param numFrequencies total number of allele frequencies (2N)
* @param contexts stratified alignment contexts
* @param contextType which stratification to use
*/
protected void calculatelog10PofDgivenAFforAllF(char ref, char alt, int numFrequencies, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
int baseIndex = BaseUtils.simpleBaseToBaseIndex(alt);
// for each minor allele frequency, calculate log10PofDgivenAFi
for (int i = 0; i <= numFrequencies; i++) {
double f = (double)i / (double)(numFrequencies);
log10PofDgivenAFi[baseIndex][i] += calculateLog10PofDgivenAFforF(ref, alt, f, contexts, contextType);
}
}
/**
* @param ref the ref base
* @param alt the alt base
* @param f the allele frequency
* @param contexts stratified alignment contexts
* @param contextType which stratification to use
*
* @return value of PofDgivenAF for allele frequency f
*/
protected double calculateLog10PofDgivenAFforF(char ref, char alt, double f, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
return 0.0;
}
/********************************************************************************/
/**
* @param ref the ref base
* @param frequencyEstimationPoints number of allele frequencies
*/
protected void calculatePofFs(char ref, int frequencyEstimationPoints) {
// for each alternate allele
for ( char altAllele : BaseUtils.BASES ) {

View File

@ -72,17 +72,13 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode
return contexts;
}
protected double computeLog10PofDgivenAFi(char refArg, char altArg, double f, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
protected void calculatelog10PofDgivenAFforAllF(char ref, char alt, int nChromosomes, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME);
ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType));
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
double log10L = 0.0;
int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg);
int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg);
int nChromosomes = 2 * getNSamples(contexts);
int nAltAlleles = (int)(f * nChromosomes);
int refIndex = BaseUtils.simpleBaseToBaseIndex(ref);
int altIndex = BaseUtils.simpleBaseToBaseIndex(alt);
for (int i = 0; i < pileup.getReads().size(); i++) {
int offset = pileup.getOffsets().get(i);
@ -97,11 +93,12 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode
byte qual = read.getBaseQualities()[offset];
if ( qual > 0 && bIndex != -1 ) {
log10L += calcPBGivenH(refIndex, altIndex, nAltAlleles, nChromosomes, base, qual, read, offset);
// for each minor allele frequency, calculate log10PofDgivenAFi
for (int frequency = 0; frequency <= nChromosomes; frequency++)
log10PofDgivenAFi[altIndex][frequency] += calcPBGivenH(refIndex, altIndex, frequency, nChromosomes, base, qual, read, offset);
}
}
return log10L;
}
// cache = BMM x PL x REF x ALT x base x QUAL x strand x F as i

View File

@ -59,6 +59,19 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
executeTest("testMultiSamplePilot2 - Point Estimate EM", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing pooled model
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void testPooled1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
Arrays.asList("80fcd214108b4f4145ae585e5f119460"));
executeTest("testPooled1", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing joint estimation model
@ -67,24 +80,24 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testMultiSamplePilot1Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("aa4a5c8ae775f8e3484d50424900370e"));
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("e1a206a49982f7db5c3f4b65aa910b3a"));
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
}
@Test
public void testMultiSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("c262ec6b341c854a24d082cee2c8e318"));
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("cec6081d89ada9b2924fe38c7a021921"));
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
}
@Test
public void testSingleSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,067,000-10,083,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("368217e47a34bce2dac89a9786675104"));
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("0cd57b2b6272202db0eca45376fdb01d"));
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
}