From ffcd672c1c4f5df07e1d746ee9ad9128a404af1e Mon Sep 17 00:00:00 2001 From: kiran Date: Wed, 22 Apr 2009 18:06:50 +0000 Subject: [PATCH] Intermediate commit while working on getting four-base probs to work in the single sample genotyper. Has infrastructure for the new combinatorial approach and just choosing the best base more intelligently given a probability distribution over bases and the reference base. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@492 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/SingleSampleGenotyper.java | 281 +++++++++++++++--- 1 file changed, 234 insertions(+), 47 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java index 711d3c30f..7421b2894 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java @@ -5,6 +5,10 @@ import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.playground.utils.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; import net.sf.samtools.SAMRecord; @@ -13,59 +17,232 @@ import java.util.List; // Draft single sample genotyper // j.maguire 3-7-2009 -public class SingleSampleGenotyper extends LocusWalker -{ - AlleleMetrics metrics; +public class SingleSampleGenotyper extends LocusWalker { + @Argument(fullName="fourBaseMode",required=false,defaultValue="false") + public Boolean fourBaseMode; + + @Argument(fullName="decideOnBase",required=false,defaultValue="false") + public Boolean decideOnBase; - public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) - { - return true; // We are keeping all the reads + private AlleleMetrics metrics; + + public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { return true; } // We are keeping all the reads + public boolean requiresReads() { return true; } + public void initialize() { metrics = new AlleleMetrics("metrics.out"); } + + public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) { + String rodString = getRodString(tracker); + + AlleleFrequencyEstimate freq = null; + if (fourBaseMode) { + // Compute four-base prob genotype likelihoods + freq = getFourProbAlleleFrequency(ref, context, rodString); + } else if (decideOnBase) { + } else { + // Compute single quality score genotype likelihoods + freq = getOneProbAlleleFrequency(ref, context, rodString); + } + + if (freq != null) { metrics.nextPosition(freq, tracker); } + metrics.printMetricsAtLocusIntervals(1000); + + return freq; } - public boolean requiresReads() { return true; } + private AlleleFrequencyEstimate getFourProbAlleleFrequency(char ref, LocusContext context, String rodString) { + /* + P(D)*P(G,q|D) = P(D|G,q)*P(G,q) + = P(D|q)*P(q|G)*P(G) - public void initialize() - { - metrics = new AlleleMetrics("metrics.out"); - } + P(G) = { 0.999 (hom-ref), 1e-3 (het), 1e-5 (hom-nonref) } + + n + P(D|q)P(q|G) = product [ P_i(A)*B(i, n, G) ] + i=1 + */ - public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) - { - List reads = context.getReads(); - List offsets = context.getOffsets(); - String bases = ""; - String quals = ""; + double[][] probs = ReadBackedPileup.probDistPileup(context.getReads(), context.getOffsets()); + int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref); + int altBaseIndex = getMostFrequentNonRefBase(getFractionalCounts(probs), refBaseIndex); - ref = Character.toUpperCase(ref); + if (refBaseIndex >= 0 && altBaseIndex >= 0) { + System.out.println(context.getLocation().toString() + " " + refBaseIndex + " " + altBaseIndex + " " + probs.length + " " + rodString); + for (int i = 0; i < probs.length; i++) { + System.out.printf(" [ %4.4f %4.4f %4.4f %4.4f ]\n", probs[i][0], probs[i][1], probs[i][2], probs[i][3]); + } + + double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex); - String rodString = ""; - // Look up dbsnp priors - for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) - { - if ( datum != null ) - { - if ( datum instanceof rodDbSNP) - { - rodDbSNP dbsnp = (rodDbSNP)datum; - rodString += dbsnp.toString(); + System.out.print(" Weights: "); + for (int i = 0; i < obsWeights.length; i++) { + System.out.printf("%4.4f ", obsWeights[i]); + } + System.out.println(); + + double[] genotypePriors = { 0.999, 1e-3, 1e-5 }; + double[] genotypeBalances = { 0.02, 0.5, 0.98 }; + + double[] posteriors = new double[3]; + double qhat = 0.0, qstar = 0.0, lodVsRef = 0.0, lodVsNextBest = 0.0, pBest = Double.MIN_VALUE; + + for (int hypothesis = 0; hypothesis < 3; hypothesis++) { + posteriors[hypothesis] = 0.0; + + for (int weightIndex = 0; weightIndex < obsWeights.length; weightIndex++) { + posteriors[hypothesis] += obsWeights[weightIndex]*binomialProb(weightIndex, probs.length, genotypeBalances[hypothesis]); } - else - { - rodString += datum.toSimpleString(); + posteriors[hypothesis] *= genotypePriors[hypothesis]; + + System.out.printf(" Hypothesis %d %f %f %f\n", hypothesis, genotypeBalances[hypothesis], posteriors[hypothesis], Math.log10(posteriors[hypothesis]/posteriors[0])); + + if (posteriors[hypothesis] > pBest) { + qhat = genotypeBalances[hypothesis]; + qstar = genotypeBalances[hypothesis]; + lodVsRef = Math.log10(posteriors[hypothesis]/posteriors[0]); + pBest = posteriors[hypothesis]; + } + } + + double pRef = posteriors[0]; + + System.out.println("\n"); + + return new AlleleFrequencyEstimate(context.getLocation(), + ref, + BaseUtils.baseIndexToSimpleBase(altBaseIndex), + 2, + qhat, + qstar, + lodVsRef, + lodVsNextBest, + pBest, + pRef, + probs.length, + ReadBackedPileup.basePileupAsString(context.getReads(), context.getOffsets()), + probs, + posteriors); + + } + + return null; + } + + double binomialProb(long k, long n, double p) { + // k - number of successes + // n - number of Bernoulli trials + // p - probability of success + + if ((n*p < 5) && (n*(1-p) < 5)) + { + // For small n and the edges, compute it directly. + return (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k); + } + else + { + // For large n, approximate with a gaussian. + double mean = (double)(n*p); + double var = Math.sqrt((double)(n*p)*(1.0-p)); + double ans = (double)(1.0 / (var*Math.sqrt(2*Math.PI)))*Math.exp(-1.0 * Math.pow((double)k-mean,2)/(2.0*var*var)); + double check = (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k); + double residual = ans - check; + + return check; + } + } + + long nchoosek(long n, long k) { + long m = n - k; + if (k < m) + k = m; + + long t = 1; + for (long i = n, j = 1; i > k; i--, j++) + t = t * i / j; + return t; + } + + private double[] getObservationWeights(double[][] probs, int refBaseIndex, int altBaseIndex) { + if (probs.length <= 10) { + return getWeightTableTraces(getWeightTable(probs, refBaseIndex, altBaseIndex, probs.length)); + } + + return new double[probs.length + 1]; + } + + private double[][] getWeightTable(double[][] probs, int refBaseIndex, int altBaseIndex, int numReadsToConsider) { + if (numReadsToConsider == 1) { + double[][] partialProbTable = new double[1][2]; + partialProbTable[0][0] = probs[0][refBaseIndex]; + partialProbTable[0][1] = probs[0][altBaseIndex]; + + return partialProbTable; + } + + double[][] oldPartialProbTable = getWeightTable(probs, refBaseIndex, altBaseIndex, numReadsToConsider - 1); + double[] traces = getWeightTableTraces(oldPartialProbTable); + + double[][] newPartialProbTable = new double[numReadsToConsider][2]; + for (int row = 0, traceElement = traces.length - 1; row < newPartialProbTable.length; row++, traceElement--) { + newPartialProbTable[row][0] = traces[traceElement]*probs[numReadsToConsider - 1][refBaseIndex]; + newPartialProbTable[row][1] = traces[traceElement]*probs[numReadsToConsider - 1][altBaseIndex]; + } + + return newPartialProbTable; + } + + private double[] getWeightTableTraces(double[][] partialProbTable) { + double[] traces = new double[partialProbTable.length + 1]; + + traces[0] = partialProbTable[partialProbTable.length - 1][0]; + traces[partialProbTable.length] = partialProbTable[0][1]; + + for (int element = 1; element < traces.length - 1; element++) { + traces[element] = partialProbTable[partialProbTable.length - element - 1][0] + + partialProbTable[partialProbTable.length - element][1]; + } + + return traces; + } + + private double[] getFractionalCounts(double[][] probs) { + double[] fractionalCounts = new double[4]; + + for (int i = 0; i < probs.length; i++) { + for (int j = 0; j < 4; j++) { + fractionalCounts[j] += probs[i][j]; + } + } + + return fractionalCounts; + } + + private int getMostFrequentNonRefBase(double[] fractionalCounts, int refBaseIndex) { + double maxFractionalCounts = -1.0; + int bestAltBaseIndex = -1; + for (int altBaseIndex = 0; altBaseIndex < 4; altBaseIndex++) { + if (altBaseIndex != refBaseIndex) { + if (fractionalCounts[altBaseIndex] > maxFractionalCounts) { + maxFractionalCounts = fractionalCounts[altBaseIndex]; + bestAltBaseIndex = altBaseIndex; } } } - if ( rodString != "" ) - rodString = "[ROD: " + rodString + "]"; - // Accumulate genotype likelihoods - GenotypeLikelihoods G = new GenotypeLikelihoods(); - for ( int i = 0; i < reads.size(); i++ ) - { + return bestAltBaseIndex; + } + + private AlleleFrequencyEstimate getOneProbAlleleFrequency(char ref, LocusContext context, String rodString) { + ReadBackedPileup pileup = new ReadBackedPileup(ref, context); + String bases = pileup.getBases(); + + List reads = context.getReads(); + List offsets = context.getOffsets(); + ref = Character.toUpperCase(ref); + + GenotypeLikelihoods G = new GenotypeLikelihoods(); + for ( int i = 0; i < reads.size(); i++ ) { SAMRecord read = reads.get(i); int offset = offsets.get(i); - bases += read.getReadString().charAt(offset); - quals += read.getBaseQualityString().charAt(offset); G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]); } @@ -73,18 +250,28 @@ public class SingleSampleGenotyper extends LocusWalker