Optimizations:

1. Only do calculations in UG for alternate allele with highest sum of quality scores (note that this also constitutes a bug fix for a precision problem we were having).
2. Avoid using Strings in DiploidGenotype when we can (it was taking 1.5% of my compute according to JProfiler)

UG now runs in half the time for JOINT_ESTIMATE model.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2141 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-11-24 16:27:39 +00:00
parent a59e5b5e1a
commit b3f561710f
3 changed files with 78 additions and 43 deletions

View File

@ -9,6 +9,8 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import java.util.*; import java.util.*;
import java.io.PrintWriter; import java.io.PrintWriter;
import net.sf.samtools.SAMRecord;
public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel { public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel {
// because the null allele frequencies are constant for a given N, // because the null allele frequencies are constant for a given N,
@ -17,7 +19,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
// because the Hardy-Weinberg values for a given frequency are constant, // because the Hardy-Weinberg values for a given frequency are constant,
// we cache the results to avoid having to recompute everything // we cache the results to avoid having to recompute everything
private HashMap<Double, double[]> hardyWeinbergValueCache = new HashMap<Double, double[]>(); // private HashMap<Double, double[]> hardyWeinbergValueCache = new HashMap<Double, double[]>();
// the allele frequency priors // the allele frequency priors
protected double[] log10AlleleFrequencyPriors; protected double[] log10AlleleFrequencyPriors;
@ -27,6 +29,9 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
protected double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][]; protected double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][];
protected double[] PofFs = new double[BaseUtils.BASES.length]; protected double[] PofFs = new double[BaseUtils.BASES.length];
// the alternate allele with the largest sum of quality scores
protected Character bestAlternateAllele = null;
protected JointEstimateGenotypeCalculationModel() {} protected JointEstimateGenotypeCalculationModel() {}
@ -40,6 +45,12 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
int numSamples = getNSamples(contexts); int numSamples = getNSamples(contexts);
int frequencyEstimationPoints = (2 * numSamples) + 1; // (add 1 for allele frequency of zero) int frequencyEstimationPoints = (2 * numSamples) + 1; // (add 1 for allele frequency of zero)
// find the alternate allele with the largest sum of quality scores
initializeBestAlternateAllele(ref, context);
// if there are no non-ref bases and we don't want all bases, then we can just return
if ( !ALL_BASE_MODE && bestAlternateAllele == null )
return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
initializeAlleleFrequencies(frequencyEstimationPoints); initializeAlleleFrequencies(frequencyEstimationPoints);
initialize(ref, contexts, StratifiedContext.OVERALL); initialize(ref, contexts, StratifiedContext.OVERALL);
@ -61,6 +72,39 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
return contexts.size(); return contexts.size();
} }
protected void initializeBestAlternateAllele(char ref, AlignmentContext context) {
int[] qualCounts = new int[4];
// calculate the sum of quality scores for each base
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i = 0; i < reads.size(); i++) {
int offset = offsets.get(i);
// ignore deletions
if ( offset == -1 )
continue;
SAMRecord read = reads.get(i);
char base = (char)read.getReadBases()[offset];
int index = BaseUtils.simpleBaseToBaseIndex(base);
if ( index >= 0 )
qualCounts[index] += read.getBaseQualities()[offset];
}
// set the non-ref base with maximum quality score sum
int maxCount = 0;
bestAlternateAllele = null;
for ( char altAllele : BaseUtils.BASES ) {
if ( altAllele == ref )
continue;
int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
if ( qualCounts[index] > maxCount ) {
maxCount = qualCounts[index];
bestAlternateAllele = altAllele;
}
}
}
protected void initializeVerboseWriter(PrintWriter verboseWriter) { protected void initializeVerboseWriter(PrintWriter verboseWriter) {
StringBuilder header = new StringBuilder("AFINFO\tLOC\tMAF\tF\tNullAFpriors\t"); StringBuilder header = new StringBuilder("AFINFO\tLOC\tMAF\tF\tNullAFpriors\t");
for ( char altAllele : BaseUtils.BASES ) { for ( char altAllele : BaseUtils.BASES ) {
@ -122,13 +166,8 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
log10PofDgivenAFi[baseIndex] = new double[frequencyEstimationPoints]; log10PofDgivenAFi[baseIndex] = new double[frequencyEstimationPoints];
} }
// for each alternate allele // only calculate for the most likely alternate allele
for ( char altAllele : BaseUtils.BASES ) { calculatelog10PofDgivenAFforAllF(ref, bestAlternateAllele, frequencyEstimationPoints-1, contexts, contextType);
if ( altAllele == ref )
continue;
calculatelog10PofDgivenAFforAllF(ref, altAllele, frequencyEstimationPoints-1, contexts, contextType);
}
} }
/********************************************************************************/ /********************************************************************************/
@ -174,26 +213,22 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
* @param frequencyEstimationPoints number of allele frequencies * @param frequencyEstimationPoints number of allele frequencies
*/ */
protected void calculatePofFs(char ref, int frequencyEstimationPoints) { protected void calculatePofFs(char ref, int frequencyEstimationPoints) {
// for each alternate allele // only calculate for the most likely alternate allele
for ( char altAllele : BaseUtils.BASES ) { int baseIndex = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);
if ( altAllele == ref )
continue;
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); // multiply by null allele frequency priors to get AF posteriors, then normalize
for (int i = 0; i < frequencyEstimationPoints; i++)
alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i];
alleleFrequencyPosteriors[baseIndex] = MathUtils.normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]);
// multiply by null allele frequency priors to get AF posteriors, then normalize // calculate p(f>0)
for (int i = 0; i < frequencyEstimationPoints; i++) double sum = 0.0;
alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i]; for (int i = 1; i < frequencyEstimationPoints; i++)
alleleFrequencyPosteriors[baseIndex] = MathUtils.normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]); sum += alleleFrequencyPosteriors[baseIndex][i];
PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors
// calculate p(f>0)
double sum = 0.0;
for (int i = 1; i < frequencyEstimationPoints; i++)
sum += alleleFrequencyPosteriors[baseIndex][i];
PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors
}
} }
/***
protected double[] getHardyWeinbergValues(double f) { protected double[] getHardyWeinbergValues(double f) {
double[] HWvalues = hardyWeinbergValueCache.get(f); double[] HWvalues = hardyWeinbergValueCache.get(f);
@ -219,7 +254,13 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
return HWvalues; return HWvalues;
} }
***/
/**
* @param ref the ref base
* @param loc the GenomeLoc
* @param frequencyEstimationPoints number of allele frequencies
*/
protected void printAlleleFrequencyData(char ref, GenomeLoc loc, int frequencyEstimationPoints) { protected void printAlleleFrequencyData(char ref, GenomeLoc loc, int frequencyEstimationPoints) {
for (int i = 0; i < frequencyEstimationPoints; i++) { for (int i = 0; i < frequencyEstimationPoints; i++) {
StringBuilder AFline = new StringBuilder("AFINFO\t"); StringBuilder AFline = new StringBuilder("AFINFO\t");
@ -256,21 +297,8 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
} }
protected Pair<List<Genotype>, GenotypeLocusData> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc, int frequencyEstimationPoints) { protected Pair<List<Genotype>, GenotypeLocusData> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
// first, find the alt allele with maximum confidence // only need to look at the most likely alternate allele
int indexOfMax = 0; int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);
char baseOfMax = ref;
double maxConfidence = Double.MIN_VALUE;
for ( char altAllele : BaseUtils.BASES ) {
if ( altAllele != ref ) {
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
if ( PofFs[baseIndex] > maxConfidence ||
(MathUtils.compareDoubles(PofFs[baseIndex], maxConfidence) == 0 && log10PofDgivenAFi[baseIndex][0] < log10PofDgivenAFi[indexOfMax][0]) ) {
indexOfMax = baseIndex;
baseOfMax = altAllele;
maxConfidence = PofFs[baseIndex];
}
}
}
double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]); double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
if ( Double.isInfinite(phredScaledConfidence) ) if ( Double.isInfinite(phredScaledConfidence) )
@ -282,7 +310,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
return new Pair<List<Genotype>, GenotypeLocusData>(null, null); return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
// populate the sample-specific data // populate the sample-specific data
List<Genotype> calls = makeGenotypeCalls(ref, baseOfMax, contexts, loc); List<Genotype> calls = makeGenotypeCalls(ref, bestAlternateAllele, contexts, loc);
// next, the general locus data // next, the general locus data
// note that calculating strand bias involves overwriting data structures, so we do that last // note that calculating strand bias involves overwriting data structures, so we do that last
@ -292,7 +320,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence); ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
} }
if ( locusdata instanceof AlternateAlleleBacked ) { if ( locusdata instanceof AlternateAlleleBacked ) {
((AlternateAlleleBacked)locusdata).setAlternateAllele(baseOfMax); ((AlternateAlleleBacked)locusdata).setAlternateAllele(bestAlternateAllele);
} }
if ( locusdata instanceof AlleleFrequencyBacked ) { if ( locusdata instanceof AlleleFrequencyBacked ) {
((AlleleFrequencyBacked)locusdata).setAlleleFrequency((double)bestAFguess / (double)(frequencyEstimationPoints-1)); ((AlleleFrequencyBacked)locusdata).setAlleleFrequency((double)bestAFguess / (double)(frequencyEstimationPoints-1));

View File

@ -59,7 +59,14 @@ public enum DiploidGenotype {
* @return the diploid genotype * @return the diploid genotype
*/ */
public static DiploidGenotype createHomGenotype(char hom) { public static DiploidGenotype createHomGenotype(char hom) {
return DiploidGenotype.valueOf((String.valueOf(hom) + String.valueOf(hom)).toUpperCase()); hom = Character.toUpperCase(hom);
switch (hom) {
case 'A': return DiploidGenotype.AA;
case 'C': return DiploidGenotype.CC;
case 'G': return DiploidGenotype.GG;
case 'T': return DiploidGenotype.TT;
}
throw new IllegalArgumentException(hom + " is not a valid base character");
} }
/** /**

View File

@ -89,7 +89,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot2Joint() { public void testMultiSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("cec6081d89ada9b2924fe38c7a021921")); Arrays.asList("7eb10aec21497da727eeb67bb5c5a743"));
executeTest("testMultiSamplePilot2 - Joint Estimate", spec); executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
} }