Optimizations:
1. Only do calculations in UG for alternate allele with highest sum of quality scores (note that this also constitutes a bug fix for a precision problem we were having). 2. Avoid using Strings in DiploidGenotype when we can (it was taking 1.5% of my compute according to JProfiler) UG now runs in half the time for JOINT_ESTIMATE model. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2141 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a59e5b5e1a
commit
b3f561710f
|
|
@ -9,6 +9,8 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel {
|
public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel {
|
||||||
|
|
||||||
// because the null allele frequencies are constant for a given N,
|
// because the null allele frequencies are constant for a given N,
|
||||||
|
|
@ -17,7 +19,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
|
|
||||||
// because the Hardy-Weinberg values for a given frequency are constant,
|
// because the Hardy-Weinberg values for a given frequency are constant,
|
||||||
// we cache the results to avoid having to recompute everything
|
// we cache the results to avoid having to recompute everything
|
||||||
private HashMap<Double, double[]> hardyWeinbergValueCache = new HashMap<Double, double[]>();
|
// private HashMap<Double, double[]> hardyWeinbergValueCache = new HashMap<Double, double[]>();
|
||||||
|
|
||||||
// the allele frequency priors
|
// the allele frequency priors
|
||||||
protected double[] log10AlleleFrequencyPriors;
|
protected double[] log10AlleleFrequencyPriors;
|
||||||
|
|
@ -27,6 +29,9 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
protected double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][];
|
protected double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][];
|
||||||
protected double[] PofFs = new double[BaseUtils.BASES.length];
|
protected double[] PofFs = new double[BaseUtils.BASES.length];
|
||||||
|
|
||||||
|
// the alternate allele with the largest sum of quality scores
|
||||||
|
protected Character bestAlternateAllele = null;
|
||||||
|
|
||||||
|
|
||||||
protected JointEstimateGenotypeCalculationModel() {}
|
protected JointEstimateGenotypeCalculationModel() {}
|
||||||
|
|
||||||
|
|
@ -40,6 +45,12 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
int numSamples = getNSamples(contexts);
|
int numSamples = getNSamples(contexts);
|
||||||
int frequencyEstimationPoints = (2 * numSamples) + 1; // (add 1 for allele frequency of zero)
|
int frequencyEstimationPoints = (2 * numSamples) + 1; // (add 1 for allele frequency of zero)
|
||||||
|
|
||||||
|
// find the alternate allele with the largest sum of quality scores
|
||||||
|
initializeBestAlternateAllele(ref, context);
|
||||||
|
// if there are no non-ref bases and we don't want all bases, then we can just return
|
||||||
|
if ( !ALL_BASE_MODE && bestAlternateAllele == null )
|
||||||
|
return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
|
||||||
|
|
||||||
initializeAlleleFrequencies(frequencyEstimationPoints);
|
initializeAlleleFrequencies(frequencyEstimationPoints);
|
||||||
|
|
||||||
initialize(ref, contexts, StratifiedContext.OVERALL);
|
initialize(ref, contexts, StratifiedContext.OVERALL);
|
||||||
|
|
@ -61,6 +72,39 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
return contexts.size();
|
return contexts.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void initializeBestAlternateAllele(char ref, AlignmentContext context) {
|
||||||
|
int[] qualCounts = new int[4];
|
||||||
|
|
||||||
|
// calculate the sum of quality scores for each base
|
||||||
|
List<SAMRecord> reads = context.getReads();
|
||||||
|
List<Integer> offsets = context.getOffsets();
|
||||||
|
for (int i = 0; i < reads.size(); i++) {
|
||||||
|
int offset = offsets.get(i);
|
||||||
|
// ignore deletions
|
||||||
|
if ( offset == -1 )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
SAMRecord read = reads.get(i);
|
||||||
|
char base = (char)read.getReadBases()[offset];
|
||||||
|
int index = BaseUtils.simpleBaseToBaseIndex(base);
|
||||||
|
if ( index >= 0 )
|
||||||
|
qualCounts[index] += read.getBaseQualities()[offset];
|
||||||
|
}
|
||||||
|
|
||||||
|
// set the non-ref base with maximum quality score sum
|
||||||
|
int maxCount = 0;
|
||||||
|
bestAlternateAllele = null;
|
||||||
|
for ( char altAllele : BaseUtils.BASES ) {
|
||||||
|
if ( altAllele == ref )
|
||||||
|
continue;
|
||||||
|
int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||||
|
if ( qualCounts[index] > maxCount ) {
|
||||||
|
maxCount = qualCounts[index];
|
||||||
|
bestAlternateAllele = altAllele;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
protected void initializeVerboseWriter(PrintWriter verboseWriter) {
|
protected void initializeVerboseWriter(PrintWriter verboseWriter) {
|
||||||
StringBuilder header = new StringBuilder("AFINFO\tLOC\tMAF\tF\tNullAFpriors\t");
|
StringBuilder header = new StringBuilder("AFINFO\tLOC\tMAF\tF\tNullAFpriors\t");
|
||||||
for ( char altAllele : BaseUtils.BASES ) {
|
for ( char altAllele : BaseUtils.BASES ) {
|
||||||
|
|
@ -122,13 +166,8 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
log10PofDgivenAFi[baseIndex] = new double[frequencyEstimationPoints];
|
log10PofDgivenAFi[baseIndex] = new double[frequencyEstimationPoints];
|
||||||
}
|
}
|
||||||
|
|
||||||
// for each alternate allele
|
// only calculate for the most likely alternate allele
|
||||||
for ( char altAllele : BaseUtils.BASES ) {
|
calculatelog10PofDgivenAFforAllF(ref, bestAlternateAllele, frequencyEstimationPoints-1, contexts, contextType);
|
||||||
if ( altAllele == ref )
|
|
||||||
continue;
|
|
||||||
|
|
||||||
calculatelog10PofDgivenAFforAllF(ref, altAllele, frequencyEstimationPoints-1, contexts, contextType);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/********************************************************************************/
|
/********************************************************************************/
|
||||||
|
|
@ -174,26 +213,22 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
* @param frequencyEstimationPoints number of allele frequencies
|
* @param frequencyEstimationPoints number of allele frequencies
|
||||||
*/
|
*/
|
||||||
protected void calculatePofFs(char ref, int frequencyEstimationPoints) {
|
protected void calculatePofFs(char ref, int frequencyEstimationPoints) {
|
||||||
// for each alternate allele
|
// only calculate for the most likely alternate allele
|
||||||
for ( char altAllele : BaseUtils.BASES ) {
|
int baseIndex = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);
|
||||||
if ( altAllele == ref )
|
|
||||||
continue;
|
|
||||||
|
|
||||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
// multiply by null allele frequency priors to get AF posteriors, then normalize
|
||||||
|
for (int i = 0; i < frequencyEstimationPoints; i++)
|
||||||
|
alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i];
|
||||||
|
alleleFrequencyPosteriors[baseIndex] = MathUtils.normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]);
|
||||||
|
|
||||||
// multiply by null allele frequency priors to get AF posteriors, then normalize
|
// calculate p(f>0)
|
||||||
for (int i = 0; i < frequencyEstimationPoints; i++)
|
double sum = 0.0;
|
||||||
alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i];
|
for (int i = 1; i < frequencyEstimationPoints; i++)
|
||||||
alleleFrequencyPosteriors[baseIndex] = MathUtils.normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]);
|
sum += alleleFrequencyPosteriors[baseIndex][i];
|
||||||
|
PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors
|
||||||
// calculate p(f>0)
|
|
||||||
double sum = 0.0;
|
|
||||||
for (int i = 1; i < frequencyEstimationPoints; i++)
|
|
||||||
sum += alleleFrequencyPosteriors[baseIndex][i];
|
|
||||||
PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/***
|
||||||
protected double[] getHardyWeinbergValues(double f) {
|
protected double[] getHardyWeinbergValues(double f) {
|
||||||
double[] HWvalues = hardyWeinbergValueCache.get(f);
|
double[] HWvalues = hardyWeinbergValueCache.get(f);
|
||||||
|
|
||||||
|
|
@ -219,7 +254,13 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
|
|
||||||
return HWvalues;
|
return HWvalues;
|
||||||
}
|
}
|
||||||
|
***/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param ref the ref base
|
||||||
|
* @param loc the GenomeLoc
|
||||||
|
* @param frequencyEstimationPoints number of allele frequencies
|
||||||
|
*/
|
||||||
protected void printAlleleFrequencyData(char ref, GenomeLoc loc, int frequencyEstimationPoints) {
|
protected void printAlleleFrequencyData(char ref, GenomeLoc loc, int frequencyEstimationPoints) {
|
||||||
for (int i = 0; i < frequencyEstimationPoints; i++) {
|
for (int i = 0; i < frequencyEstimationPoints; i++) {
|
||||||
StringBuilder AFline = new StringBuilder("AFINFO\t");
|
StringBuilder AFline = new StringBuilder("AFINFO\t");
|
||||||
|
|
@ -256,21 +297,8 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Pair<List<Genotype>, GenotypeLocusData> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
|
protected Pair<List<Genotype>, GenotypeLocusData> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
|
||||||
// first, find the alt allele with maximum confidence
|
// only need to look at the most likely alternate allele
|
||||||
int indexOfMax = 0;
|
int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);
|
||||||
char baseOfMax = ref;
|
|
||||||
double maxConfidence = Double.MIN_VALUE;
|
|
||||||
for ( char altAllele : BaseUtils.BASES ) {
|
|
||||||
if ( altAllele != ref ) {
|
|
||||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
|
||||||
if ( PofFs[baseIndex] > maxConfidence ||
|
|
||||||
(MathUtils.compareDoubles(PofFs[baseIndex], maxConfidence) == 0 && log10PofDgivenAFi[baseIndex][0] < log10PofDgivenAFi[indexOfMax][0]) ) {
|
|
||||||
indexOfMax = baseIndex;
|
|
||||||
baseOfMax = altAllele;
|
|
||||||
maxConfidence = PofFs[baseIndex];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
|
double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
|
||||||
if ( Double.isInfinite(phredScaledConfidence) )
|
if ( Double.isInfinite(phredScaledConfidence) )
|
||||||
|
|
@ -282,7 +310,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
|
return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
|
||||||
|
|
||||||
// populate the sample-specific data
|
// populate the sample-specific data
|
||||||
List<Genotype> calls = makeGenotypeCalls(ref, baseOfMax, contexts, loc);
|
List<Genotype> calls = makeGenotypeCalls(ref, bestAlternateAllele, contexts, loc);
|
||||||
|
|
||||||
// next, the general locus data
|
// next, the general locus data
|
||||||
// note that calculating strand bias involves overwriting data structures, so we do that last
|
// note that calculating strand bias involves overwriting data structures, so we do that last
|
||||||
|
|
@ -292,7 +320,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
|
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
|
||||||
}
|
}
|
||||||
if ( locusdata instanceof AlternateAlleleBacked ) {
|
if ( locusdata instanceof AlternateAlleleBacked ) {
|
||||||
((AlternateAlleleBacked)locusdata).setAlternateAllele(baseOfMax);
|
((AlternateAlleleBacked)locusdata).setAlternateAllele(bestAlternateAllele);
|
||||||
}
|
}
|
||||||
if ( locusdata instanceof AlleleFrequencyBacked ) {
|
if ( locusdata instanceof AlleleFrequencyBacked ) {
|
||||||
((AlleleFrequencyBacked)locusdata).setAlleleFrequency((double)bestAFguess / (double)(frequencyEstimationPoints-1));
|
((AlleleFrequencyBacked)locusdata).setAlleleFrequency((double)bestAFguess / (double)(frequencyEstimationPoints-1));
|
||||||
|
|
|
||||||
|
|
@ -59,7 +59,14 @@ public enum DiploidGenotype {
|
||||||
* @return the diploid genotype
|
* @return the diploid genotype
|
||||||
*/
|
*/
|
||||||
public static DiploidGenotype createHomGenotype(char hom) {
|
public static DiploidGenotype createHomGenotype(char hom) {
|
||||||
return DiploidGenotype.valueOf((String.valueOf(hom) + String.valueOf(hom)).toUpperCase());
|
hom = Character.toUpperCase(hom);
|
||||||
|
switch (hom) {
|
||||||
|
case 'A': return DiploidGenotype.AA;
|
||||||
|
case 'C': return DiploidGenotype.CC;
|
||||||
|
case 'G': return DiploidGenotype.GG;
|
||||||
|
case 'T': return DiploidGenotype.TT;
|
||||||
|
}
|
||||||
|
throw new IllegalArgumentException(hom + " is not a valid base character");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -89,7 +89,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testMultiSamplePilot2Joint() {
|
public void testMultiSamplePilot2Joint() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||||
Arrays.asList("cec6081d89ada9b2924fe38c7a021921"));
|
Arrays.asList("7eb10aec21497da727eeb67bb5c5a743"));
|
||||||
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue