diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java index e21033ba4..e44210f4e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.playground.utils.PoolUtils; import net.sf.samtools.SAMRecord; import java.util.*; @@ -20,7 +21,7 @@ import java.util.*; * Time: 11:57:07 AM * To change this template use File | Settings | File Templates. */ -public class CoverageAndPowerWalker extends LocusWalker> { +public class CoverageAndPowerWalker extends LocusWalker, Pair> { @Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false) public boolean suppress_printing = false; @@ -28,7 +29,7 @@ public class CoverageAndPowerWalker extends LocusWalker reduceInit() { + return new Pair(0l,0l); + } - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) - { - if ( !suppress_printing ) - { - Pair powpair = boostrapSamplingPowerCalc(context); - out.printf("%s: %d %d %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first); + public Pair reduce(Pair newCvg, Pair prevCvg) { + return new Pair(prevCvg.first + newCvg.first, prevCvg.second + newCvg.second); + } + + public Pair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + Pair,List>,Pair,List>> readsByDirection = PoolUtils.splitReadsByReadDirection(context.getReads(),context.getOffsets()); + if ( ! 
suppress_printing) { + Pair powers = calculatePower(readsByDirection, useBootstrap, context); + out.printf("%s: %d %d %d %d %d %d %f %f %f%n", context.getLocation(), readsByDirection.getFirst().getFirst().size(), readsByDirection.getFirst().getSecond().size(), + context.getReads().size(), powers.getSecond()[0], powers.getSecond()[1], powers.getSecond()[2], + powers.getFirst()[0], powers.getFirst()[1], powers.getFirst()[2]); + } + return new Pair(readsByDirection.getFirst().getFirst().size(),readsByDirection.getFirst().getSecond().size()); + } + + // helper methods + + public Pair calculatePower(Pair,List>,Pair,List>> dirReads, boolean bootStrap, AlignmentContext context) { + Pair powerAndMedianQ; + if( bootStrap ) { + powerAndMedianQ = calculateDirectionalPowersByBootstrap(dirReads,threshold,context); + } else { + powerAndMedianQ = calculateDirectionalPowersTheoretical(dirReads,threshold,context); } - return context.getReads().size(); + return powerAndMedianQ; } - - public boolean isReduceByInterval() { - return true; + public Pair calculateDirectionalPowersTheoretical(Pair,List>,Pair,List>> dirReads, double thresh, AlignmentContext context) { + byte[] medQs = getMedianQScores(dirReads, context); + double powerForward = calculatePowerTheoretical(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst(),medQs[0],thresh); + double powerReverse = calculatePowerTheoretical(dirReads.getFirst().getSecond(), dirReads.getSecond().getSecond(),medQs[1],thresh); + double powerCombined = calculatePowerTheoretical(context.getReads(),context.getOffsets(),medQs[2],thresh); + double [] powers = {powerForward,powerReverse,powerCombined}; + return new Pair(powers, medQs); } - - public Pair reduceInit() { - - return new Pair(0l,0l); } - - - public Pair reduce(Integer value, Pair sum) - { - long left = value.longValue() + sum.getFirst(); - long right = sum.getSecond() + 1l; - return new Pair(left, right); + public byte[] getMedianQScores(Pair,List>,Pair,List>> dirReads, 
AlignmentContext context) { + byte medQForward = getMedianQualityScore(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst()); + byte medQReverse = getMedianQualityScore(dirReads.getFirst().getSecond(), dirReads.getSecond().getSecond()); + byte medQCombined = getMedianQualityScore(context.getReads(),context.getOffsets()); + byte [] medQs = {medQForward, medQReverse, medQCombined}; + return medQs; } - - public void onTraversalDone(Pair result) { - out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n", - ((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond()); + public byte getMedianQualityScore(List reads, List offsets) { + return ListUtils.getQScoreMedian(reads,offsets); } - /* - * - * Helper Methods Below - * - */ - - public Pair powerTheoretical(int depth, List reads, List offsets, double snp_prop) - { - // get the median Q score for these reads - - byte medQ = ListUtils.getQScoreMedian(reads,offsets); - // System.out.println("Median Q: " + medQ); //TODO: remove this line + public double calculatePowerTheoretical(List reads, List offsets, byte medQ, double thresh) { double p_error = QualityUtils.qualToErrorProb(medQ); + int depth = reads.size(); + if(depth <= 0) { + return 0; + } + double snpProp = this.getSNPProportion(1); // variable names from here on out come from the mathematics of a log likelihood test // with binomial probabilities, of observing k bases consistent with the same SNP @@ -99,128 +111,97 @@ public class CoverageAndPowerWalker extends LocusWalker depth/2 - calculate power as P[hits between k and depth] - for(int k = kaccept; k < depth; k++) { - pow += MathUtils.binomialProbabilityLog(k, depth, snp_prop); - } - - return new Pair(pow,medQ); - } else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and k] - for(int k = 0; k < kaccept; k++) { - pow += MathUtils.binomialProbabilityLog(k,depth,snp_prop); - } - - return new Pair(1.0-pow,medQ); + if ( depth - 
kaccept < kaccept ) {// kaccept > depth/2 - calculate power as P[hits between kaccept and depth] + return MathUtils.cumBinomialProbLog(kaccept, depth, depth, snpProp); + } else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and kaccept] + return 1-MathUtils.cumBinomialProbLog(0,kaccept,depth,snpProp); } } + public double getSNPProportion(int numSNPAlleles) { + return ((double) numSNPAlleles / (2.0 * num_individuals)); + } - public Pair boostrapSamplingPowerCalc(AlignmentContext context) - { - /* it is assumed that there are 2*num_individuals chromosomes in the pool - * we use a bootstrap method to calculate our power to detect a SNP with this - * depth of coverage - * Assume that only one of the individual chromosomes has a variant at this locus - * - */ - - List reads = context.getReads(); - List offsets = context.getOffsets(); - - final int depth = reads.size(); - Pair result; - final double single_snip_proportion = 1/(2.0*num_individuals); - - if (depth <= 0) { - result = new Pair(-1,-1); - } else if (!use_bootstrap) { // object data from command line - result = powerTheoretical(depth, reads, offsets, single_snip_proportion); - } else { // otherwise, bootstrapping occurs below - int hypothesis_rejections=0; - - for(int boot = 0; boot < BOOTSTRAP_ITERATIONS; boot++) - { - List qscore_SNPs = new LinkedList(); - List qscore_refs = new LinkedList(); - - for (int strap = 0; strap < depth; strap++) { - int readOffset = randomlySelectRead(depth); - byte baseQuality = reads.get(readOffset).getBaseQualities()[offsets.get(readOffset)]; - - if (Math.random() < single_snip_proportion) {// occurs with probability "single_snip_proportion" - qscore_SNPs.add(baseQuality); - } - else { // simulating a reference read - qscore_refs.add(baseQuality); - } - } - - if (calculateLODByQLists(qscore_SNPs,qscore_refs) >= threshold) { - hypothesis_rejections++; - } + public Pair calculateDirectionalPowersByBootstrap(Pair,List>,Pair,List>> dirReads, double thresh, 
AlignmentContext context) { + double powerForward = bootstrapPower(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst(),thresh); + double powerReverse = bootstrapPower(dirReads.getFirst().getSecond(),dirReads.getSecond().getSecond(),thresh); + double powerCombined = bootstrapPower(context.getReads(),context.getOffsets(),thresh); + double[] powers = {powerForward, powerReverse, powerCombined}; + return new Pair(powers,getMedianQScores(dirReads,context)); + } + public double bootstrapPower(List reads, List offsets, double thresh) { + double power = 0.0; + for ( int boot = 0; boot < BOOTSTRAP_ITERATIONS; boot++) { + Pair,List>,Pair,List>> snpReadsAndRefReads = coinTossPartition(reads,offsets,this.getSNPProportion(1)); + if( calculateLogLikelihoodOfSample(snpReadsAndRefReads, num_individuals) > thresh) { + power += 1.0/BOOTSTRAP_ITERATIONS; } - - result = new Pair(((double)hypothesis_rejections)/BOOTSTRAP_ITERATIONS, ListUtils.getQScoreMedian(reads,offsets)); } - return result; - + return power; } + // class methods - public int randomlySelectRead(int depth) - { - return (int) Math.floor((double)depth * Math.random()); + public static Pair,List>,Pair,List>> coinTossPartition(List reads, List offsets, double snpProb) { + Pair,List>,Pair,List>> partitionedReads; + if( reads == null || offsets == null ) { + partitionedReads = null; + } else { + ArrayList snpReads = new ArrayList(); + ArrayList refReads = new ArrayList(); + ArrayList snpOff = new ArrayList(); + ArrayList refOff = new ArrayList(); + for(int readNo = 0; readNo < reads.size(); readNo++) { + if ( Math.random() < snpProb) { + snpReads.add(reads.get(readNo)); + snpOff.add(offsets.get(readNo)); + } else { + refReads.add(reads.get(readNo)); + refOff.add(offsets.get(readNo)); + } + } + partitionedReads= new Pair(new Pair(snpReads,refReads), new Pair(snpOff, refOff)); + } + + return partitionedReads; } + public static double calculateLogLikelihoodOfSample(Pair,List>,Pair,List>> snpReadsRefReads, int 
nIndivids) { + List qListSnps = getQList(snpReadsRefReads.getFirst().getFirst(),snpReadsRefReads.getSecond().getFirst()); + List qListRefs = getQList(snpReadsRefReads.getFirst().getSecond(),snpReadsRefReads.getSecond().getSecond()); + Pair logsumSNP = qListToSumLogProbabilities(true,qListSnps, 2.0*nIndivids); + Pair logsumRef = qListToSumLogProbabilities(false,qListRefs, 2.0*nIndivids); - public double calculateLODByQLists(List q_snp, List q_ref) - { - /* LOD score calculation - * let Qs be the list q_snp and Qr q_ref and f be a function that - * converts Q-scores to probability of error, then the probability ratio - * is given by - * - * Product_{i=1}^{|Qs|}((1-f(Qs[i])/2n+(2n-1)f(Qs[i])/2n)*Product_{j=1}^{|Qr|}(((2n-1)(1-f(Qr[j]))+f(Qr[j]))/2n) - * ------------------------------------------------------------------------------------------------------------ - * Product_{i=1}^{|Qs|}f(Qs[i])*Product_{j=1}^{|Qr|}(1-f(Qr[j])) - * - * since depth = |Qr|+|Qs| the binomial coefficients cancel so are not present in this formula - * - * - * - * - * the "LOD" used here is really a log-likelihood, just the logarithm of the above - * - * Helper functions compute all but the final sum (log of mult/div becomes add/subtract) - * - */ - - Pair logsumSNP = qListToSumLogProbabilities(true,q_snp); - Pair logsumRef = qListToSumLogProbabilities(false,q_ref); - - return 0 - logsumSNP.first - logsumRef.first + logsumSNP.second + logsumRef.second; + return 0.0 - logsumSNP.first - logsumRef.first + logsumSNP.second + logsumRef.second; } + public static List getQList(List reads, List offsets) { + List qscores = new LinkedList(); + for(int readNo = 0; readNo < reads.size(); readNo++) { + qscores.add(reads.get(readNo).getBaseQualities()[offsets.get(readNo)]); + } - public Pair qListToSumLogProbabilities(boolean listRepresentsSNPObservations, List qList) + return qscores; + } + + public static Pair qListToSumLogProbabilities(boolean listRepresentsSNPObservations, List qList, double denom) { 
double logProbObserveXAndSNPTrue = 0; // note "error" for SNP is observing a ref double logProbObserveXAndRefTrue = 0;// and "error" for ref is observing a SNP - final double denom = 2*((double)num_individuals); for (byte qual : qList) { double p_err = QualityUtils.qualToErrorProb(qual); @@ -236,6 +217,4 @@ public class CoverageAndPowerWalker extends LocusWalker(logProbObserveXAndSNPTrue,logProbObserveXAndRefTrue); } - - } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/utils/PoolUtils.java b/java/src/org/broadinstitute/sting/playground/utils/PoolUtils.java new file mode 100755 index 000000000..b87f89b7f --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/utils/PoolUtils.java @@ -0,0 +1,52 @@ +package org.broadinstitute.sting.playground.utils; + +import net.sf.samtools.SAMRecord; + +import java.util.List; +import java.util.ArrayList; + +import org.broadinstitute.sting.utils.Pair; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Aug 27, 2009 + * Time: 12:31:08 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class PoolUtils { + + private PoolUtils () {} + + public static Pair, List>,Pair,List>> splitReadsByReadDirection(List reads, List offsets) { + ArrayList forwardReads; + ArrayList reverseReads; + ArrayList forwardOffsets; + ArrayList reverseOffsets; + + if ( reads == null) { + forwardReads = null; + reverseReads = null; + forwardOffsets = null; + reverseOffsets = null; + } else { + forwardReads = new ArrayList(); + reverseReads = new ArrayList(); + forwardOffsets = new ArrayList(); + reverseOffsets = new ArrayList(); + + for ( int readNo = 0; readNo < reads.size(); readNo ++ ) { + if ( ! reads.get(readNo).getReadNegativeStrandFlag() ) { + forwardReads.add(reads.get(readNo)); + forwardOffsets.add(offsets.get(readNo)); + } else { + reverseReads.add(reads.get(readNo)); + reverseOffsets.add(offsets.get(readNo)); + } + } + } + + return new Pair(new Pair(forwardReads,reverseReads), new Pair(forwardOffsets,reverseOffsets)); + } + +} diff --git a/java/src/org/broadinstitute/sting/utils/ListUtils.java b/java/src/org/broadinstitute/sting/utils/ListUtils.java index 7ad03a03a..15790c495 100644 --- a/java/src/org/broadinstitute/sting/utils/ListUtils.java +++ b/java/src/org/broadinstitute/sting/utils/ListUtils.java @@ -110,7 +110,10 @@ public class ListUtils { // list index maps to a q-score only through the offset index // returns the kth-largest q-score.
- + if( reads.size() == 0) { + return 0; + } + ArrayList lessThanQReads = new ArrayList(); ArrayList equalToQReads = new ArrayList(); ArrayList greaterThanQReads = new ArrayList(); diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 4f48b1e61..2aa2c44c0 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -169,6 +169,23 @@ public class MathUtils { // the sum of log(i) from i = (1+(50*a)) to i = 50+(50*b) to increase performance on binomial coefficients // which may require many sums. } + + /** + * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. + * @param start - start of the cumulant sum (over hits) + * @param end - end of the cumulant sum (over hits) + * @param total - number of attempts for the number of hits + * @param probHit - probability of a successful hit + * @return - returns the cumulative probability + */ + public static double cumBinomialProbLog(int start, int end, int total, double probHit) { + double cumProb = 0.0; + for(int hits = start; hits < end; hits++) { + cumProb += binomialProbabilityLog(hits, total, probHit); + } + + return cumProb; + } /** * Computes a multinomial. This is computed using the formula