From 82fd824c4d2209748fa51b67a487bcb5ccbbcbce Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 20 Nov 2009 01:39:29 +0000 Subject: [PATCH] Continuing improvements to unified genotyper git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2098 348d0f76-0448-11de-a6fe-93d51630548a --- .../genotyper/FourBaseProbabilities.java | 15 +- .../genotyper/GenotypeLikelihoods.java | 8 +- .../genotyper/PooledCalculationModel.java | 147 +++++++++++++++++- 3 files changed, 157 insertions(+), 13 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseProbabilities.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseProbabilities.java index d1da3e98e..fb5fbaa07 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseProbabilities.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseProbabilities.java @@ -94,7 +94,10 @@ public abstract class FourBaseProbabilities implements Cloneable { * @return log10 likelihood as a double */ public double getLog10Likelihood(char base) { - int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); + return getLog10Likelihood(BaseUtils.simpleBaseToBaseIndex(base)); + } + + public double getLog10Likelihood(int baseIndex) { return (baseIndex < 0 ? 0.0 : getLog10Likelihoods()[baseIndex]); } @@ -115,10 +118,14 @@ public abstract class FourBaseProbabilities implements Cloneable { * @return likelihoods as a double */ public double getLikelihood(char base) { - int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); + return getLikelihood(BaseUtils.simpleBaseToBaseIndex(base)); + } + + public double getLikelihood(int baseIndex) { return (baseIndex < 0 ? 0.0 : Math.pow(10, log10Likelihoods[baseIndex])); } + // ----------------------------------------------------------------------------------------------------------------- // // @@ -138,7 +145,7 @@ public abstract class FourBaseProbabilities implements Cloneable { * @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example) */ public int add(char observedBase, byte qualityScore, SAMRecord read, int offset) { - FourBaseProbabilities fbl = computeLikelihoods(observedBase, qualityScore, read, offset); + FourBaseProbabilities fbl = computeLog10Likelihoods(observedBase, qualityScore, read, offset); if ( fbl == null ) return 0; @@ -167,7 +174,7 @@ public abstract class FourBaseProbabilities implements Cloneable { * @param offset offset on read * @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example) */ - public FourBaseProbabilities computeLikelihoods(char observedBase, byte qualityScore, SAMRecord read, int offset) { + public FourBaseProbabilities computeLog10Likelihoods(char observedBase, byte qualityScore, SAMRecord read, int offset) { if ( badBase(observedBase) ) { return null; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index e6cd77b1e..5fdf25768 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -114,10 +114,10 @@ public class GenotypeLikelihoods implements Cloneable { private void initialize(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) { fourBaseLikelihoods = FourBaseProbabilitiesFactory.makeFourBaseLikelihoods(m, pl); - initialize(); + setToZero(); } - private void initialize() { + private void setToZero() { log10Likelihoods = zeros.clone(); // likelihoods are all zeros log10Posteriors = priors.getPriors().clone(); // posteriors are all the priors } @@ -372,7 +372,7 @@ public class GenotypeLikelihoods implements Cloneable { } private GenotypeLikelihoods calculateGenotypeLikelihoods(char observedBase, byte qualityScore, SAMRecord read, int offset) { - FourBaseProbabilities fbl = fourBaseLikelihoods.computeLikelihoods(observedBase, qualityScore, read, offset); + FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(observedBase, qualityScore, read, offset); if ( fbl == null ) return null; @@ -380,7 +380,7 @@ public class GenotypeLikelihoods implements Cloneable { try { GenotypeLikelihoods gl = (GenotypeLikelihoods)this.clone(); - gl.initialize(); + gl.setToZero(); // we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space double ploidyAdjustment = log10(FIXED_PLOIDY); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java index 7ff80743c..9d85e13ad 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java @@ -10,11 +10,44 @@ import java.util.*; import net.sf.samtools.SAMRecord; public class PooledCalculationModel extends JointEstimateGenotypeCalculationModel { - private static final String POOL_SAMPLE_NAME = "POOL"; + private FourBaseProbabilities fourBaseLikelihoods = null; + + private double[] log10F = null; + private double[] log10OneMinusF = null; + + /** + * + */ protected PooledCalculationModel() {} + /** + * + * @param ref + * @param contexts + * @param contextType + */ + protected void initialize(char ref, HashMap contexts, StratifiedContext contextType) { + super.initialize(ref, contexts, contextType); + + // prepare the four base vector calculator + fourBaseLikelihoods = FourBaseProbabilitiesFactory.makeFourBaseLikelihoods(baseModel, defaultPlatform); + + // todo - can you add static initialize() + // prepare cached values of the log10 of f and (1-f) + if ( log10F == null ) { + int nChromosomes = 2 * getNSamples(contexts); + log10F = new double[nChromosomes+1]; + log10OneMinusF = new double[nChromosomes+1]; + for ( int i = 0; i < (nChromosomes+1); i++ ) { + double f = (1.0 * i) / nChromosomes; + log10F[i] = Math.log10(f); + log10OneMinusF[i] = Math.log10(1-f); + } + } + } + protected int getNSamples(HashMap contexts) { return POOL_SIZE; } @@ -22,7 +55,6 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode protected HashMap createContexts(AlignmentContext context) { // for testing purposes, we may want to throw multi-samples at pooled mode, // so we can't use the standard splitContextBySample() method here - AlignmentContextBySample pooledContext = new AlignmentContextBySample(context.getLocation()); int deletionsInPileup = 0; @@ -30,7 +62,6 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode List offsets = context.getOffsets(); for (int i = 0; i < reads.size(); i++) { - // check for deletions int offset = offsets.get(i); if ( offset == -1 ) { @@ -48,7 +79,13 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode contexts.put(POOL_SAMPLE_NAME, pooledContext); return contexts; } - + + // + // todo - create cache indexed by chromosome, tech, qual, observed base, and potential polymorphism (A/C) for all + // todo -- 16 pairs. We can change the calculation at a locus to: for each read in pileup, calculate offset into + // todo -- this table given observed base, read, and qual. Then, for each i -> 0, 2n and polymorphism, run over + // todo -- the table, summing values + // protected double computeLog10PofDgivenAFi(char refArg, char altArg, double f, HashMap contexts, StratifiedContext contextType) { AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME); ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType)); @@ -62,6 +99,106 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode int nAltAlleles = (int)(f * nChromosomes); int nRefAlleles = nChromosomes - nAltAlleles; + double log10POfRef = log10OneMinusF[nAltAlleles]; + double log10POfAlt = log10F[nAltAlleles]; + double POfRef = Math.pow(10,log10POfRef); + double POfAlt = Math.pow(10,log10POfAlt); + + for (int i = 0; i < pileup.getReads().size(); i++) { + int offset = pileup.getOffsets().get(i); + + // ignore deletions + if ( offset == -1 ) + continue; + + SAMRecord read = pileup.getReads().get(i); + char base = (char)read.getReadBases()[offset]; + int bIndex = BaseUtils.simpleBaseToBaseIndex(base); + byte qual = read.getBaseQualities()[offset]; + + if ( qual > 0 && bIndex != -1 ) { + FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(base, qual, read, offset); + double POfBGivenRef = fbl.getLikelihood(refIndex); + double POfBGivenAlt = fbl.getLikelihood(altIndex); + double P = POfRef * POfBGivenRef + POfAlt * POfBGivenAlt; + log10L += Math.log10(P); + } + } + + //if ( verboseWriter != null ) + // verboseWriter.printf("POOL_DEBUG %s %c %c %d %d %d %.2f %.2f %.2f %f%n", + // context.getContext(StratifiedContext.OVERALL).getLocation(), + // refArg, altArg, nChromosomes, nAltAlleles, nRefAlleles, f, log10POfRef, log10POfAlt, log10L); + + return log10L; + } + +/* protected double computeLog10PofDgivenAFi_V2(char refArg, char altArg, double f, HashMap contexts, StratifiedContext contextType) { + AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME); + ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType)); + + double log10L = 0.0; + + int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg); + int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg); + + int nChromosomes = 2 * getNSamples(contexts); + int nAltAlleles = (int)(f * nChromosomes); + int nRefAlleles = nChromosomes - nAltAlleles; + + double log10POfRef = log10OneMinusF[nAltAlleles]; + double log10POfAlt = log10F[nAltAlleles]; + + for (int i = 0; i < pileup.getReads().size(); i++) { + int offset = pileup.getOffsets().get(i); + + // ignore deletions + if ( offset == -1 ) + continue; + + SAMRecord read = pileup.getReads().get(i); + char base = (char)read.getReadBases()[offset]; + int bIndex = BaseUtils.simpleBaseToBaseIndex(base); + byte qual = read.getBaseQualities()[offset]; + + double log10POfNotB = Math.log10(QualityUtils.qualToErrorProb(qual)); + if ( qual > 0 && bIndex != -1 ) { + FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(base, qual, read, offset); + + double log10POfB = fbl.getLog10Likelihood(base); + + if ( bIndex == refIndex && nRefAlleles > 0 ) { + log10L += log10POfRef + log10POfB; + } else if ( bIndex == altIndex && nAltAlleles > 0) { + log10L += log10POfAlt + log10POfB; + } else { + //log10L += Math.min(log10POfRef + fbl.getLog10Likelihood(refIndex), log10POfAlt + fbl.getLog10Likelihood(altIndex)); + log10L += log10POfNotB; + } + } + } + + if ( verboseWriter != null ) + verboseWriter.printf("POOL_DEBUG %s %c %c %d %d %d %.2f %.2f %.2f %f%n", + context.getContext(StratifiedContext.OVERALL).getLocation(), + refArg, altArg, nChromosomes, nAltAlleles, nRefAlleles, f, log10POfRef, log10POfAlt, log10L); + + return log10L; + } */ + +/* protected double computeLog10PofDgivenAFi_V1(char refArg, char altArg, double f, HashMap contexts, StratifiedContext contextType) { + AlignmentContextBySample context = contexts.get(POOL_SAMPLE_NAME); + ReadBackedPileup pileup = new ReadBackedPileup(refArg, context.getContext(contextType)); + + double log10L = 0.0; + + int refIndex = BaseUtils.simpleBaseToBaseIndex(refArg); + int altIndex = BaseUtils.simpleBaseToBaseIndex(altArg); + + int nChromosomes = 2 * getNSamples(contexts); + int nAltAlleles = (int)(f * nChromosomes); + int nRefAlleles = nChromosomes - nAltAlleles; + double log10POfRef = Math.log10(1 - f); double log10POfAlt = Math.log10(f); //double log10ChromChooseRef = Math.log10(Arithmetic.binomial(nChromosomes, nRefAlleles)); @@ -94,5 +231,5 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode refArg, altArg, nChromosomes, nAltAlleles, nRefAlleles, f, log10POfRef, log10POfAlt, log10L); return log10L; - } + }*/ } \ No newline at end of file