Refactored joint estimation model to allow subclasses to overload PofD calculation over all frequencies. Pooled model now takes only 20% of time that it used to.
Added integration test for pooled model and updated other joint estimation tests to be more comprehensive now that they are faster. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2123 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7f947f6b60
commit
70059a0fc9
|
|
@ -16,14 +16,6 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul
|
|||
|
||||
private enum GenotypeType { REF, HET, HOM }
|
||||
|
||||
protected HashMap<String, AlignmentContextBySample> createContexts(AlignmentContext context) {
|
||||
return splitContextBySample(context);
|
||||
}
|
||||
|
||||
protected int getNSamples(HashMap<String, AlignmentContextBySample> contexts) {
|
||||
return contexts.size();
|
||||
}
|
||||
|
||||
protected void initialize(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||
// initialize the GenotypeLikelihoods
|
||||
GLs.clear();
|
||||
|
|
@ -63,17 +55,22 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul
|
|||
}
|
||||
}
|
||||
|
||||
protected double computeLog10PofDgivenAFi(char ref, char alt, double f, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||
|
||||
// *** note that this code assumes that allele frequencies are passed in IN ORDER from 0 to 2N
|
||||
protected void calculatelog10PofDgivenAFforAllF(char ref, char alt, int numFrequencies, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||
|
||||
AlleleFrequencyMatrix matrix = AFMatrixMap.get(alt);
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(alt);
|
||||
|
||||
// for any frequency other than zero, calculate the next greedy entry
|
||||
if ( MathUtils.compareDoubles(f, 0.0) != 0 )
|
||||
// first, calculate for AF=0 (no change to matrix)
|
||||
log10PofDgivenAFi[baseIndex][0] = matrix.getLikelihoodsOfFrequency();
|
||||
|
||||
// for each minor allele frequency, calculate log10PofDgivenAFi
|
||||
for (int i = 1; i <= numFrequencies; i++) {
|
||||
// add one more alternatr allele
|
||||
matrix.incrementFrequency();
|
||||
|
||||
return matrix.getLikelihoodsOfFrequency();
|
||||
|
||||
// calculate new likelihoods
|
||||
log10PofDgivenAFi[baseIndex][i] = matrix.getLikelihoodsOfFrequency();
|
||||
}
|
||||
}
|
||||
|
||||
protected List<Genotype> makeGenotypeCalls(char ref, char alt, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
|
||||
|
|
|
|||
|
|
@ -28,15 +28,6 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
|||
protected double[] PofFs = new double[BaseUtils.BASES.length];
|
||||
|
||||
|
||||
// The abstract methods which need to be implemented:
|
||||
// 1. How many samples are being evaluated?
|
||||
protected abstract int getNSamples(HashMap<String, AlignmentContextBySample> contexts);
|
||||
// 2. Create stratified contexts from the original alignment context
|
||||
protected abstract HashMap<String, AlignmentContextBySample> createContexts(AlignmentContext context);
|
||||
// 3. The likelihoods calculation underlying this whole model
|
||||
protected abstract double computeLog10PofDgivenAFi(char ref, char alt, double f, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType);
|
||||
|
||||
|
||||
protected JointEstimateGenotypeCalculationModel() {}
|
||||
|
||||
public Pair<List<Genotype>, GenotypeLocusData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
|
||||
|
|
@ -62,6 +53,14 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
|||
return createCalls(tracker, ref, contexts, context.getLocation(), frequencyEstimationPoints);
|
||||
}
|
||||
|
||||
protected HashMap<String, AlignmentContextBySample> createContexts(AlignmentContext context) {
|
||||
return splitContextBySample(context);
|
||||
}
|
||||
|
||||
protected int getNSamples(HashMap<String, AlignmentContextBySample> contexts) {
|
||||
return contexts.size();
|
||||
}
|
||||
|
||||
protected void initializeVerboseWriter(PrintWriter verboseWriter) {
|
||||
StringBuilder header = new StringBuilder("AFINFO\tLOC\tMAF\tF\tNullAFpriors\t");
|
||||
for ( char altAllele : BaseUtils.BASES ) {
|
||||
|
|
@ -128,16 +127,52 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
|||
if ( altAllele == ref )
|
||||
continue;
|
||||
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||
|
||||
// for each minor allele frequency
|
||||
for (int i = 0; i < frequencyEstimationPoints; i++) {
|
||||
double f = (double)i / (double)(frequencyEstimationPoints-1);
|
||||
log10PofDgivenAFi[baseIndex][i] += computeLog10PofDgivenAFi(ref, altAllele, f, contexts, contextType);
|
||||
}
|
||||
calculatelog10PofDgivenAFforAllF(ref, altAllele, frequencyEstimationPoints-1, contexts, contextType);
|
||||
}
|
||||
}
|
||||
|
||||
/********************************************************************************/
|
||||
/*** One or both of the following methods should be overloaded in subclasses ***/
|
||||
/*** so that the correct calculation is made for PofDgivenAFi ***/
|
||||
/********************************************************************************/
|
||||
|
||||
/**
|
||||
* @param ref the ref base
|
||||
* @param alt the alt base
|
||||
* @param numFrequencies total number of allele frequencies (2N)
|
||||
* @param contexts stratified alignment contexts
|
||||
* @param contextType which stratification to use
|
||||
*/
|
||||
protected void calculatelog10PofDgivenAFforAllF(char ref, char alt, int numFrequencies, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(alt);
|
||||
|
||||
// for each minor allele frequency, calculate log10PofDgivenAFi
|
||||
for (int i = 0; i <= numFrequencies; i++) {
|
||||
double f = (double)i / (double)(numFrequencies);
|
||||
log10PofDgivenAFi[baseIndex][i] += calculateLog10PofDgivenAFforF(ref, alt, f, contexts, contextType);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param ref the ref base
|
||||
* @param alt the alt base
|
||||
* @param f the allele frequency
|
||||
* @param contexts stratified alignment contexts
|
||||
* @param contextType which stratification to use
|
||||
*
|
||||
* @return value of PofDgivenAF for allele frequency f
|
||||
*/
|
||||
protected double calculateLog10PofDgivenAFforF(char ref, char alt, double f, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
/********************************************************************************/
|
||||
|
||||
|
||||
/**
|
||||
* @param ref the ref base
|
||||
* @param frequencyEstimationPoints number of allele frequencies
|
||||
*/
|
||||
protected void calculatePofFs(char ref, int frequencyEstimationPoints) {
|
||||
// for each alternate allele
|
||||
for ( char altAllele : BaseUtils.BASES ) {
|
||||
|
|
|
|||
|
|
@ -72,17 +72,13 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode
|
|||
return contexts;
|
||||
}
|
||||
|
||||
protected double computeLog10PofDgivenAFi(char refArg, char altArg, double f, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||
protected void calculatelog10PofDgivenAFforAllF(char ref, char alt, int nChromosomes, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||
|
||||
AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME);
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType));
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
|
||||
|
||||
double log10L = 0.0;
|
||||
|
||||
int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg);
|
||||
int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg);
|
||||
|
||||
int nChromosomes = 2 * getNSamples(contexts);
|
||||
int nAltAlleles = (int)(f * nChromosomes);
|
||||
int refIndex = BaseUtils.simpleBaseToBaseIndex(ref);
|
||||
int altIndex = BaseUtils.simpleBaseToBaseIndex(alt);
|
||||
|
||||
for (int i = 0; i < pileup.getReads().size(); i++) {
|
||||
int offset = pileup.getOffsets().get(i);
|
||||
|
|
@ -97,11 +93,12 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode
|
|||
byte qual = read.getBaseQualities()[offset];
|
||||
|
||||
if ( qual > 0 && bIndex != -1 ) {
|
||||
log10L += calcPBGivenH(refIndex, altIndex, nAltAlleles, nChromosomes, base, qual, read, offset);
|
||||
|
||||
// for each minor allele frequency, calculate log10PofDgivenAFi
|
||||
for (int frequency = 0; frequency <= nChromosomes; frequency++)
|
||||
log10PofDgivenAFi[altIndex][frequency] += calcPBGivenH(refIndex, altIndex, frequency, nChromosomes, base, qual, read, offset);
|
||||
}
|
||||
}
|
||||
|
||||
return log10L;
|
||||
}
|
||||
|
||||
// cache = BMM x PL x REF x ALT x base x QUAL x strand x F as i
|
||||
|
|
|
|||
|
|
@ -59,6 +59,19 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
executeTest("testMultiSamplePilot2 - Point Estimate EM", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing pooled model
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Test
|
||||
public void testPooled1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
|
||||
Arrays.asList("80fcd214108b4f4145ae585e5f119460"));
|
||||
executeTest("testPooled1", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing joint estimation model
|
||||
|
|
@ -67,24 +80,24 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testMultiSamplePilot1Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("aa4a5c8ae775f8e3484d50424900370e"));
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("e1a206a49982f7db5c3f4b65aa910b3a"));
|
||||
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultiSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("c262ec6b341c854a24d082cee2c8e318"));
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("cec6081d89ada9b2924fe38c7a021921"));
|
||||
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSingleSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,067,000-10,083,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("368217e47a34bce2dac89a9786675104"));
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("0cd57b2b6272202db0eca45376fdb01d"));
|
||||
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue