From 27122f7f97f76e6f03bfc14214e15b3a489aa12d Mon Sep 17 00:00:00 2001 From: depristo Date: Sat, 21 Nov 2009 15:07:40 +0000 Subject: [PATCH] Performance improvements for pooled caller. Now possible to actually run on real data in a finite amount of time. Minor changes to GL interface (making strandIndex public) to support cached calculations in pooled caller. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2107 348d0f76-0448-11de-a6fe-93d51630548a --- .../genotyper/GenotypeLikelihoods.java | 2 +- .../genotyper/PooledCalculationModel.java | 132 ++++++++++++++---- 2 files changed, 108 insertions(+), 26 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index 5fdf25768..afa009054 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -420,7 +420,7 @@ public class GenotypeLikelihoods implements Cloneable { // // ----------------------------------------------------------------------------------------------------------------- - protected int strandIndex(boolean fwdStrand) { + public static int strandIndex(boolean fwdStrand) { return fwdStrand ? 0 : 1; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java index 9d85e13ad..5cdba36d3 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java @@ -12,10 +12,8 @@ import net.sf.samtools.SAMRecord; public class PooledCalculationModel extends JointEstimateGenotypeCalculationModel { private static final String POOL_SAMPLE_NAME = "POOL"; - private FourBaseProbabilities fourBaseLikelihoods = null; - - private double[] log10F = null; - private double[] log10OneMinusF = null; + private static FourBaseProbabilities fourBaseLikelihoods = null; + private static boolean USE_CACHE = true; /** * @@ -31,21 +29,15 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode protected void initialize(char ref, HashMap contexts, StratifiedContext contextType) { super.initialize(ref, contexts, contextType); - // prepare the four base vector calculator - fourBaseLikelihoods = FourBaseProbabilitiesFactory.makeFourBaseLikelihoods(baseModel, defaultPlatform); + // todo -- move this code to a static initializer - // todo - can you add static initialize() - // prepare cached values of the log10 of f and (1-f) - if ( log10F == null ) { - int nChromosomes = 2 * getNSamples(contexts); - log10F = new double[nChromosomes+1]; - log10OneMinusF = new double[nChromosomes+1]; - for ( int i = 0; i < (nChromosomes+1); i++ ) { - double f = (1.0 * i) / nChromosomes; - log10F[i] = Math.log10(f); - log10OneMinusF[i] = Math.log10(1-f); - } - } + // prepare the four base vector calculator + if ( fourBaseLikelihoods == null ) + fourBaseLikelihoods = FourBaseProbabilitiesFactory.makeFourBaseLikelihoods(baseModel, defaultPlatform); + + // setup the cache + if ( CACHE == null ) + makeCache(POOL_SIZE); } protected int getNSamples(HashMap contexts) { @@ -80,12 +72,6 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode return contexts; } - // - // todo - create cache indexed by chromosome, tech, qual, observed base, and potential polymorphism (A/C) for all - // todo -- 16 pairs. We can change the calculation at a locus to: for each read in pileup, calculate offset into - // todo -- this table given observed base, read, and qual. Then, for each i -> 0, 2n and polymorphism, run over - // todo -- the table, summing values - // protected double computeLog10PofDgivenAFi(char refArg, char altArg, double f, HashMap contexts, StratifiedContext contextType) { AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME); ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType)); @@ -95,6 +81,102 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg); int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg); + int nChromosomes = 2 * getNSamples(contexts); + int nAltAlleles = (int)(f * nChromosomes); + + for (int i = 0; i < pileup.getReads().size(); i++) { + int offset = pileup.getOffsets().get(i); + + // ignore deletions + if ( offset == -1 ) + continue; + + SAMRecord read = pileup.getReads().get(i); + char base = (char)read.getReadBases()[offset]; + int bIndex = BaseUtils.simpleBaseToBaseIndex(base); + byte qual = read.getBaseQualities()[offset]; + + if ( qual > 0 && bIndex != -1 ) { + log10L += calcPBGivenH(refIndex, altIndex, nAltAlleles, nChromosomes, base, qual, read, offset); + } + } + + return log10L; + } + + // cache = BMM x PL x REF x ALT x base x QUAL x strand x F as i + static double[][][][][][][][] CACHE = null; + static int N_CACHED = 0; + private static void makeCache(int pool_size) { + CACHE = new double[BaseMismatchModel.values().length][EmpiricalSubstitutionProbabilities.SequencerPlatform.values().length][BaseUtils.BASES.length][BaseUtils.BASES.length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][2][2 * pool_size+1]; + } + + protected void setCache( int refIndex, int altIndex, int nAltAlleles, char base, byte qual, SAMRecord read, double val ) { + int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal(); + int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read); + int i = refIndex; + int j = altIndex; + int k = BaseUtils.simpleBaseToBaseIndex(base); + int l = qual; + int x = GenotypeLikelihoods.strandIndex(! read.getReadNegativeStrandFlag()); + int f = nAltAlleles; + + N_CACHED++; + System.out.printf("Setting cache value %d %d %d %d %d %d %d %d = %f [count = %d]%n", m, a, i, j, k, l, x, f, val, N_CACHED); + CACHE[m][a][i][j][k][l][x][f] = val; + } + + protected double getCache( int refIndex, int altIndex, int nAltAlleles, char base, byte qual, SAMRecord read ) { + int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal(); + int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read); + int i = refIndex; + int j = altIndex; + int k = BaseUtils.simpleBaseToBaseIndex(base); + int l = qual; + int x = GenotypeLikelihoods.strandIndex(! read.getReadNegativeStrandFlag()); + int f = nAltAlleles; + //System.out.printf("Getting cache value %d %d %d %d %d %d %d %d%n", m, a, i, j, k, l, x, f); + + return CACHE[m][a][i][j][k][l][x][f]; + } + + private double calcPBGivenH(int refIndex, int altIndex, int nAltAlleles, int nChromosomes, char base, byte qual, SAMRecord read, int offset) { + double L = 0.0; + + if ( USE_CACHE ) { + L = getCache(refIndex, altIndex, nAltAlleles, base, qual, read); + if ( L == 0.0 ) { + L = reallyCalcPBGivenH(refIndex, altIndex, nAltAlleles, nChromosomes, base, qual, read, offset); + setCache(refIndex, altIndex, nAltAlleles, base, qual, read, L); + } + } else { + L = reallyCalcPBGivenH(refIndex, altIndex, nAltAlleles, nChromosomes, base, qual, read, offset); + } + + return L; + } + + private double reallyCalcPBGivenH(int refIndex, int altIndex, int nAltAlleles, int nChromosomes, char base, byte qual, SAMRecord read, int offset) { + double f = (1.0 * nAltAlleles) / nChromosomes; + double POfRef = 1 - f; + double POfAlt = f; + + FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(base, qual, read, offset); + double POfBGivenRef = fbl.getLikelihood(refIndex); + double POfBGivenAlt = fbl.getLikelihood(altIndex); + double P = POfRef * POfBGivenRef + POfAlt * POfBGivenAlt; + return Math.log10(P); + } + +/* protected double computeLog10PofDgivenAFi(char refArg, char altArg, double f, HashMap contexts, StratifiedContext contextType) { + AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME); + ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType)); + + double log10L = 0.0; + + int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg); + int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg); + int nChromosomes = 2 * getNSamples(contexts); int nAltAlleles = (int)(f * nChromosomes); int nRefAlleles = nChromosomes - nAltAlleles; @@ -131,7 +213,7 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode // refArg, altArg, nChromosomes, nAltAlleles, nRefAlleles, f, log10POfRef, log10POfAlt, log10L); return log10L; - } + }*/ /* protected double computeLog10PofDgivenAFi_V2(char refArg, char altArg, double f, HashMap contexts, StratifiedContext contextType) { AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME);