From 8740124cda7ca4cbe3beb4db05296a6daf242769 Mon Sep 17 00:00:00 2001 From: chartl Date: Thu, 27 Aug 2009 19:31:53 +0000 Subject: [PATCH] @ListUtils - Bugfix in getQScoreOrderStatistic: the method would attempt to access an empty list fed into it. Now it checks for an empty list and returns 0. @MathUtils - added a new method: cumBinomialProbLog, which calculates a cumulative binomial probability from any start point to any end point using the binomialProbabilityLog calculation. @PoolUtils - added a new utility class specifically for items related to pooled sequencing. A major part of the power calculation is now calculating power independently for each read direction. The only method in this class (currently) takes your reads and offsets, and splits them into two groups by read direction. @CoverageAndPowerWalker - completely rewritten to split coverage, median qualities, and power by read direction. Makes use of cumBinomialProbLog rather than doing that calculation within the object itself. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1462 348d0f76-0448-11de-a6fe-93d51630548a --- .../poolseq/CoverageAndPowerWalker.java | 251 ++++++++---------- .../sting/playground/utils/PoolUtils.java | 52 ++++ .../broadinstitute/sting/utils/ListUtils.java | 5 +- .../broadinstitute/sting/utils/MathUtils.java | 17 ++ 4 files changed, 188 insertions(+), 137 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/utils/PoolUtils.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java index e21033ba4..e44210f4e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import 
org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.playground.utils.PoolUtils; import net.sf.samtools.SAMRecord; import java.util.*; @@ -20,7 +21,7 @@ import java.util.*; * Time: 11:57:07 AM * To change this template use File | Settings | File Templates. */ -public class CoverageAndPowerWalker extends LocusWalker> { +public class CoverageAndPowerWalker extends LocusWalker, Pair> { @Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false) public boolean suppress_printing = false; @@ -28,7 +29,7 @@ public class CoverageAndPowerWalker extends LocusWalker reduceInit() { + return new Pair(0l,0l); + } - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) - { - if ( !suppress_printing ) - { - Pair powpair = boostrapSamplingPowerCalc(context); - out.printf("%s: %d %d %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first); + public Pair reduce(Pair newCvg, Pair prevCvg) { + return new Pair(prevCvg.first + newCvg.first, prevCvg.second + newCvg.second); + } + + public Pair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + Pair,List>,Pair,List>> readsByDirection = PoolUtils.splitReadsByReadDirection(context.getReads(),context.getOffsets()); + if ( ! 
suppress_printing) { + Pair powers = calculatePower(readsByDirection, useBootstrap, context); + out.printf("%s: %d %d %d %d %d %d %f %f %f%n", context.getLocation(), readsByDirection.getFirst().getFirst().size(), readsByDirection.getFirst().getSecond().size(), + context.getReads().size(), powers.getSecond()[0], powers.getSecond()[1], powers.getSecond()[2], + powers.getFirst()[0], powers.getFirst()[1], powers.getFirst()[2]); + } + return new Pair(readsByDirection.getFirst().getFirst().size(),readsByDirection.getFirst().getSecond().size()); + } + + // helper methods + + public Pair calculatePower(Pair,List>,Pair,List>> dirReads, boolean bootStrap, AlignmentContext context) { + Pair powerAndMedianQ; + if( bootStrap ) { + powerAndMedianQ = calculateDirectionalPowersByBootstrap(dirReads,threshold,context); + } else { + powerAndMedianQ = calculateDirectionalPowersTheoretical(dirReads,threshold,context); } - return context.getReads().size(); + return powerAndMedianQ; } - - public boolean isReduceByInterval() { - return true; + public Pair calculateDirectionalPowersTheoretical(Pair,List>,Pair,List>> dirReads, double thresh, AlignmentContext context) { + byte[] medQs = getMedianQScores(dirReads, context); + double powerForward = calculatePowerTheoretical(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst(),medQs[0],thresh); + double powerReverse = calculatePowerTheoretical(dirReads.getFirst().getSecond(), dirReads.getSecond().getSecond(),medQs[1],thresh); + double powerCombined = calculatePowerTheoretical(context.getReads(),context.getOffsets(),medQs[2],thresh); + double [] powers = {powerForward,powerReverse,powerCombined}; + return new Pair(powers, medQs); } - - public Pair reduceInit() { - - return new Pair(0l,0l); } - - - public Pair reduce(Integer value, Pair sum) - { - long left = value.longValue() + sum.getFirst(); - long right = sum.getSecond() + 1l; - return new Pair(left, right); + public byte[] getMedianQScores(Pair,List>,Pair,List>> dirReads, 
AlignmentContext context) { + byte medQForward = getMedianQualityScore(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst()); + byte medQReverse = getMedianQualityScore(dirReads.getFirst().getSecond(), dirReads.getSecond().getSecond()); + byte medQCombined = getMedianQualityScore(context.getReads(),context.getOffsets()); + byte [] medQs = {medQForward, medQReverse, medQCombined}; + return medQs; } - - public void onTraversalDone(Pair result) { - out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n", - ((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond()); + public byte getMedianQualityScore(List reads, List offsets) { + return ListUtils.getQScoreMedian(reads,offsets); } - /* - * - * Helper Methods Below - * - */ - - public Pair powerTheoretical(int depth, List reads, List offsets, double snp_prop) - { - // get the median Q score for these reads - - byte medQ = ListUtils.getQScoreMedian(reads,offsets); - // System.out.println("Median Q: " + medQ); //TODO: remove this line + public double calculatePowerTheoretical(List reads, List offsets, byte medQ, double thresh) { double p_error = QualityUtils.qualToErrorProb(medQ); + int depth = reads.size(); + if(depth <= 0) { + return 0; + } + double snpProp = this.getSNPProportion(1); // variable names from here on out come from the mathematics of a log likelihood test // with binomial probabilities, of observing k bases consistent with the same SNP @@ -99,128 +111,97 @@ public class CoverageAndPowerWalker extends LocusWalker depth/2 - calculate power as P[hits between k and depth] - for(int k = kaccept; k < depth; k++) { - pow += MathUtils.binomialProbabilityLog(k, depth, snp_prop); - } - - return new Pair(pow,medQ); - } else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and k] - for(int k = 0; k < kaccept; k++) { - pow += MathUtils.binomialProbabilityLog(k,depth,snp_prop); - } - - return new Pair(1.0-pow,medQ); + if ( depth - 
kaccept < kaccept ) {// kaccept > depth/2 - calculate power as P[hits between kaccept and depth] + return MathUtils.cumBinomialProbLog(kaccept, depth, depth, snpProp); + } else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and kaccept] + return 1-MathUtils.cumBinomialProbLog(0,kaccept,depth,snpProp); } } + public double getSNPProportion(int numSNPAlleles) { + return ((double) numSNPAlleles / (2.0 * num_individuals)); + } - public Pair boostrapSamplingPowerCalc(AlignmentContext context) - { - /* it is assumed that there are 2*num_individuals chromosomes in the pool - * we use a bootstrap method to calculate our power to detect a SNP with this - * depth of coverage - * Assume that only one of the individual chromosomes has a variant at this locus - * - */ - - List reads = context.getReads(); - List offsets = context.getOffsets(); - - final int depth = reads.size(); - Pair result; - final double single_snip_proportion = 1/(2.0*num_individuals); - - if (depth <= 0) { - result = new Pair(-1,-1); - } else if (!use_bootstrap) { // object data from command line - result = powerTheoretical(depth, reads, offsets, single_snip_proportion); - } else { // otherwise, bootstrapping occurs below - int hypothesis_rejections=0; - - for(int boot = 0; boot < BOOTSTRAP_ITERATIONS; boot++) - { - List qscore_SNPs = new LinkedList(); - List qscore_refs = new LinkedList(); - - for (int strap = 0; strap < depth; strap++) { - int readOffset = randomlySelectRead(depth); - byte baseQuality = reads.get(readOffset).getBaseQualities()[offsets.get(readOffset)]; - - if (Math.random() < single_snip_proportion) {// occurs with probability "single_snip_proportion" - qscore_SNPs.add(baseQuality); - } - else { // simulating a reference read - qscore_refs.add(baseQuality); - } - } - - if (calculateLODByQLists(qscore_SNPs,qscore_refs) >= threshold) { - hypothesis_rejections++; - } + public Pair calculateDirectionalPowersByBootstrap(Pair,List>,Pair,List>> dirReads, double thresh, 
AlignmentContext context) { + double powerForward = bootstrapPower(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst(),thresh); + double powerReverse = bootstrapPower(dirReads.getFirst().getSecond(),dirReads.getSecond().getSecond(),thresh); + double powerCombined = bootstrapPower(context.getReads(),context.getOffsets(),thresh); + double[] powers = {powerForward, powerReverse, powerCombined}; + return new Pair(powers,getMedianQScores(dirReads,context)); + } + public double bootstrapPower(List reads, List offsets, double thresh) { + double power = 0.0; + for ( int boot = 0; boot < BOOTSTRAP_ITERATIONS; boot++) { + Pair,List>,Pair,List>> snpReadsAndRefReads = coinTossPartition(reads,offsets,this.getSNPProportion(1)); + if( calculateLogLikelihoodOfSample(snpReadsAndRefReads, num_individuals) > thresh) { + power += 1.0/BOOTSTRAP_ITERATIONS; } - - result = new Pair(((double)hypothesis_rejections)/BOOTSTRAP_ITERATIONS, ListUtils.getQScoreMedian(reads,offsets)); } - return result; - + return power; } + // class methods - public int randomlySelectRead(int depth) - { - return (int) Math.floor((double)depth * Math.random()); + public static Pair,List>,Pair,List>> coinTossPartition(List reads, List offsets, double snpProb) { + Pair,List>,Pair,List>> partitionedReads; + if( reads == null || offsets == null ) { + partitionedReads = null; + } else { + ArrayList snpReads = new ArrayList(); + ArrayList refReads = new ArrayList(); + ArrayList snpOff = new ArrayList(); + ArrayList refOff = new ArrayList(); + for(int readNo = 0; readNo < reads.size(); readNo++) { + if ( Math.random() < snpProb) { + snpReads.add(reads.get(readNo)); + snpOff.add(offsets.get(readNo)); + } else { + refReads.add(reads.get(readNo)); + refOff.add(offsets.get(readNo)); + } + } + partitionedReads= new Pair(new Pair(snpReads,refReads), new Pair(snpOff, refOff)); + } + + return partitionedReads; } + public static double calculateLogLikelihoodOfSample(Pair,List>,Pair,List>> snpReadsRefReads, int 
nIndivids) { + List qListSnps = getQList(snpReadsRefReads.getFirst().getFirst(),snpReadsRefReads.getSecond().getFirst()); + List qListRefs = getQList(snpReadsRefReads.getFirst().getSecond(),snpReadsRefReads.getSecond().getSecond()); + Pair logsumSNP = qListToSumLogProbabilities(true,qListSnps, 2.0*nIndivids); + Pair logsumRef = qListToSumLogProbabilities(false,qListRefs, 2.0*nIndivids); - public double calculateLODByQLists(List q_snp, List q_ref) - { - /* LOD score calculation - * let Qs be the list q_snp and Qr q_ref and f be a function that - * converts Q-scores to probability of error, then the probability ratio - * is given by - * - * Product_{i=1}^{|Qs|}((1-f(Qs[i])/2n+(2n-1)f(Qs[i])/2n)*Product_{j=1}^{|Qr|}(((2n-1)(1-f(Qr[j]))+f(Qr[j]))/2n) - * ------------------------------------------------------------------------------------------------------------ - * Product_{i=1}^{|Qs|}f(Qs[i])*Product_{j=1}^{|Qr|}(1-f(Qr[j])) - * - * since depth = |Qr|+|Qs| the binomial coefficients cancel so are not present in this formula - * - * - * - * - * the "LOD" used here is really a log-likelihood, just the logarithm of the above - * - * Helper functions compute all but the final sum (log of mult/div becomes add/subtract) - * - */ - - Pair logsumSNP = qListToSumLogProbabilities(true,q_snp); - Pair logsumRef = qListToSumLogProbabilities(false,q_ref); - - return 0 - logsumSNP.first - logsumRef.first + logsumSNP.second + logsumRef.second; + return 0.0 - logsumSNP.first - logsumRef.first + logsumSNP.second + logsumRef.second; } + public static List getQList(List reads, List offsets) { + List qscores = new LinkedList(); + for(int readNo = 0; readNo < reads.size(); readNo++) { + qscores.add(reads.get(readNo).getBaseQualities()[offsets.get(readNo)]); + } - public Pair qListToSumLogProbabilities(boolean listRepresentsSNPObservations, List qList) + return qscores; + } + + public static Pair qListToSumLogProbabilities(boolean listRepresentsSNPObservations, List qList, double denom) { 
double logProbObserveXAndSNPTrue = 0; // note "error" for SNP is observing a ref double logProbObserveXAndRefTrue = 0;// and "error" for ref is observing a SNP - final double denom = 2*((double)num_individuals); for (byte qual : qList) { double p_err = QualityUtils.qualToErrorProb(qual); @@ -236,6 +217,4 @@ public class CoverageAndPowerWalker extends LocusWalker(logProbObserveXAndSNPTrue,logProbObserveXAndRefTrue); } - - } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/utils/PoolUtils.java b/java/src/org/broadinstitute/sting/playground/utils/PoolUtils.java new file mode 100755 index 000000000..b87f89b7f --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/utils/PoolUtils.java @@ -0,0 +1,52 @@ +package org.broadinstitute.sting.playground.utils; + +import net.sf.samtools.SAMRecord; + +import java.util.List; +import java.util.ArrayList; + +import org.broadinstitute.sting.utils.Pair; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Aug 27, 2009 + * Time: 12:31:08 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class PoolUtils { + + private PoolUtils () {} + + public static Pair, List>,Pair,List>> splitReadsByReadDirection(List reads, List offsets) { + ArrayList forwardReads; + ArrayList reverseReads; + ArrayList forwardOffsets; + ArrayList reverseOffsets; + + if ( reads == null) { + forwardReads = null; + reverseReads = null; + forwardOffsets = null; + reverseOffsets = null; + } else { + forwardReads = new ArrayList(); + reverseReads = new ArrayList(); + forwardOffsets = new ArrayList(); + reverseOffsets = new ArrayList(); + + for ( int readNo = 0; readNo < reads.size(); readNo ++ ) { + if ( reads.get(readNo).getReadNegativeStrandFlag() ) { + forwardReads.add(reads.get(readNo)); + forwardOffsets.add(offsets.get(readNo)); + } else { + reverseReads.add(reads.get(readNo)); + reverseOffsets.add(offsets.get(readNo)); + } + } + } + + return new Pair(new Pair(forwardReads,reverseReads), new Pair(forwardOffsets,reverseOffsets)); + } + +} diff --git a/java/src/org/broadinstitute/sting/utils/ListUtils.java b/java/src/org/broadinstitute/sting/utils/ListUtils.java index 7ad03a03a..15790c495 100644 --- a/java/src/org/broadinstitute/sting/utils/ListUtils.java +++ b/java/src/org/broadinstitute/sting/utils/ListUtils.java @@ -110,7 +110,10 @@ public class ListUtils { // list index maps to a q-score only through the offset index // returns the kth-largest q-score. 
- + if( reads.size() == 0) { + return 0; + } + ArrayList lessThanQReads = new ArrayList(); ArrayList equalToQReads = new ArrayList(); ArrayList greaterThanQReads = new ArrayList(); diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 4f48b1e61..2aa2c44c0 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -169,6 +169,23 @@ public class MathUtils { // the sum of log(i) from i = (1+(50*a)) to i = 50+(50*b) to increase performance on binomial coefficients // which may require many sums. } + + /** + * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. + * @param start - start of the cumulant sum (over hits) + * @param end - end of the cumulant sum (over hits) + * @param total - number of attempts for the number of hits + * @param probHit - probability of a successful hit + * @return - returns the cumulative probability + */ + public static double cumBinomialProbLog(int start, int end, int total, double probHit) { + double cumProb = 0.0; + for(int hits = start; hits < end; hits++) { + cumProb += binomialProbabilityLog(hits, total, probHit); + } + + return cumProb; + } /** * Computes a multinomial. This is computed using the formula