@ListUtils - Bugfix in getQScoreOrderStatistic: method would attempt to access an empty list fed into it. Now it checks for null pointers and returns 0.

@MathUtils - added a new method: cumBinomialProbLog which calculates a cumulant from any start point to any end point using the BinomProbabilityLog calculation.

@PoolUtils - added a new utility class specifically for items related to pooled sequencing. A major part of the power calculation is now to calculate powers
             independently by read direction. The only method in this class (currently) takes your reads and offsets, and splits them into two groups
             by read direction.

@CoverageAndPowerWalker - completely rewritten to split coverage, median qualities, and power by read direction. Makes use of cumBinomialProbLog rather than
                          doing that calculation within the object itself.




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1462 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2009-08-27 19:31:53 +00:00
parent 1da45cffb3
commit 8740124cda
4 changed files with 188 additions and 137 deletions

View File

@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.playground.utils.PoolUtils;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import java.util.*; import java.util.*;
@ -20,7 +21,7 @@ import java.util.*;
* Time: 11:57:07 AM * Time: 11:57:07 AM
* To change this template use File | Settings | File Templates. * To change this template use File | Settings | File Templates.
*/ */
public class CoverageAndPowerWalker extends LocusWalker<Integer, Pair<Long, Long>> { public class CoverageAndPowerWalker extends LocusWalker<Pair<Integer, Integer>, Pair<Long, Long>> {
@Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false) @Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false)
public boolean suppress_printing = false; public boolean suppress_printing = false;
@ -28,7 +29,7 @@ public class CoverageAndPowerWalker extends LocusWalker<Integer, Pair<Long, Long
public int num_individuals = 0; public int num_individuals = 0;
@Argument(fullName="bootStrap", shortName="bs", doc="Use a bootstrap method", required=false) @Argument(fullName="bootStrap", shortName="bs", doc="Use a bootstrap method", required=false)
public boolean use_bootstrap = false; public boolean useBootstrap = false;
@Argument(fullName="lodThreshold", shortName="lt", doc="Threshold for LOD score for calls") @Argument(fullName="lodThreshold", shortName="lt", doc="Threshold for LOD score for calls")
public double threshold = 3.0; public double threshold = 3.0;
@ -41,55 +42,66 @@ public class CoverageAndPowerWalker extends LocusWalker<Integer, Pair<Long, Long
throw new IllegalArgumentException("Positive nonzero parameter expected for poolSize"); throw new IllegalArgumentException("Positive nonzero parameter expected for poolSize");
} }
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context)
{
if ( !suppress_printing )
{
Pair<Double,Byte> powpair = boostrapSamplingPowerCalc(context);
out.printf("%s: %d %d %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first);
}
return context.getReads().size();
}
public boolean isReduceByInterval() {
return true;
}
public Pair<Long,Long> reduceInit() { public Pair<Long,Long> reduceInit() {
return new Pair(0l,0l);
return new Pair<Long,Long>(0l,0l); }
public Pair<Long, Long> reduce(Integer value, Pair<Long, Long> sum)
{
long left = value.longValue() + sum.getFirst();
long right = sum.getSecond() + 1l;
return new Pair<Long,Long>(left, right);
} }
public Pair<Long,Long> reduce(Pair<Integer,Integer> newCvg, Pair<Long,Long> prevCvg) {
public void onTraversalDone(Pair<Long, Long> result) { return new Pair(prevCvg.first + newCvg.first, prevCvg.second + newCvg.second);
out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n",
((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond());
} }
/* public Pair<Integer,Integer> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
* Pair<Pair<List<SAMRecord>,List<SAMRecord>>,Pair<List<Integer>,List<Integer>>> readsByDirection = PoolUtils.splitReadsByReadDirection(context.getReads(),context.getOffsets());
* Helper Methods Below if ( ! suppress_printing) {
* Pair<double[],byte[]> powers = calculatePower(readsByDirection, useBootstrap, context);
*/ out.printf("%s: %d %d %d %d %d %d %f %f %f%n", context.getLocation(), readsByDirection.getFirst().getFirst().size(), readsByDirection.getFirst().getSecond().size(),
context.getReads().size(), powers.getSecond()[0], powers.getSecond()[1], powers.getSecond()[2],
powers.getFirst()[0], powers.getFirst()[1], powers.getFirst()[2]);
}
return new Pair(readsByDirection.getFirst().getFirst().size(),readsByDirection.getFirst().getSecond().size());
}
public Pair<Double,Byte> powerTheoretical(int depth, List<SAMRecord> reads, List<Integer> offsets, double snp_prop) // helper methods
{
// get the median Q score for these reads
byte medQ = ListUtils.getQScoreMedian(reads,offsets); public Pair<double[],byte[]> calculatePower(Pair<Pair<List<SAMRecord>,List<SAMRecord>>,Pair<List<Integer>,List<Integer>>> dirReads, boolean bootStrap, AlignmentContext context) {
// System.out.println("Median Q: " + medQ); //TODO: remove this line Pair<double[],byte[]> powerAndMedianQ;
if( bootStrap ) {
powerAndMedianQ = calculateDirectionalPowersByBootstrap(dirReads,threshold,context);
} else {
powerAndMedianQ = calculateDirectionalPowersTheoretical(dirReads,threshold,context);
}
return powerAndMedianQ;
}
public Pair<double[],byte[]> calculateDirectionalPowersTheoretical(Pair<Pair<List<SAMRecord>,List<SAMRecord>>,Pair<List<Integer>,List<Integer>>> dirReads, double thresh, AlignmentContext context) {
byte[] medQs = getMedianQScores(dirReads, context);
double powerForward = calculatePowerTheoretical(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst(),medQs[0],thresh);
double powerReverse = calculatePowerTheoretical(dirReads.getFirst().getSecond(), dirReads.getSecond().getSecond(),medQs[1],thresh);
double powerCombined = calculatePowerTheoretical(context.getReads(),context.getOffsets(),medQs[2],thresh);
double [] powers = {powerForward,powerReverse,powerCombined};
return new Pair(powers, medQs);
}
public byte[] getMedianQScores(Pair<Pair<List<SAMRecord>,List<SAMRecord>>,Pair<List<Integer>,List<Integer>>> dirReads, AlignmentContext context) {
byte medQForward = getMedianQualityScore(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst());
byte medQReverse = getMedianQualityScore(dirReads.getFirst().getSecond(), dirReads.getSecond().getSecond());
byte medQCombined = getMedianQualityScore(context.getReads(),context.getOffsets());
byte [] medQs = {medQForward, medQReverse, medQCombined};
return medQs;
}
public byte getMedianQualityScore(List<SAMRecord> reads, List<Integer> offsets) {
return ListUtils.getQScoreMedian(reads,offsets);
}
public double calculatePowerTheoretical(List<SAMRecord> reads, List<Integer> offsets, byte medQ, double thresh) {
double p_error = QualityUtils.qualToErrorProb(medQ); double p_error = QualityUtils.qualToErrorProb(medQ);
int depth = reads.size();
if(depth <= 0) {
return 0;
}
double snpProp = this.getSNPProportion(1);
// variable names from here on out come from the mathematics of a log likelihood test // variable names from here on out come from the mathematics of a log likelihood test
// with binomial probabilities, of observing k bases consistent with the same SNP // with binomial probabilities, of observing k bases consistent with the same SNP
@ -99,128 +111,97 @@ public class CoverageAndPowerWalker extends LocusWalker<Integer, Pair<Long, Long
// Taking a log then brings those powers out as coefficients, where they can be solved for. // Taking a log then brings those powers out as coefficients, where they can be solved for.
double kterm_num = Math.log10(snp_prop * (1 - p_error) + (1 - snp_prop) * (p_error/3)); double kterm_num = Math.log10( snpProp * (1 - p_error) + (1 - snpProp) * (p_error/3) );
double kterm_denom = Math.log10( p_error/3 ); double kterm_denom = Math.log10( p_error/3 );
double dkterm_num = Math.log10(snp_prop*(p_error/3) + (1 - snp_prop) * (1 - p_error)); double dkterm_num = Math.log10( snpProp * (p_error/3) + (1 - snpProp) * (1 - p_error) );
double dkterm_denom = Math.log10( 1 - p_error); double dkterm_denom = Math.log10( 1 - p_error);
int kaccept = (int) Math.ceil((threshold-((double)depth)*(dkterm_num-dkterm_denom))/(kterm_num-kterm_denom-dkterm_num+dkterm_denom)); int kaccept = (int) Math.ceil( ( thresh - ( (double) depth ) * ( dkterm_num - dkterm_denom ) )/( kterm_num - kterm_denom- dkterm_num + dkterm_denom ) );
// we will reject the null hypothesis if we see kaccept or more SNPs, the power is the probability that this occurs // we will reject the null hypothesis if we see kaccept or more SNPs, the power is the probability that this occurs
// we can optimize this by checking to see which sum is smaller // we can optimize this by checking to see which sum is smaller
double pow = 0; double pow = 0;
if(depth - kaccept < kaccept) {// kaccept > depth/2 - calculate power as P[hits between k and depth] if ( depth - kaccept < kaccept ) {// kaccept > depth/2 - calculate power as P[hits between kaccept and depth]
for(int k = kaccept; k < depth; k++) { return MathUtils.cumBinomialProbLog(kaccept, depth, depth, snpProp);
pow += MathUtils.binomialProbabilityLog(k, depth, snp_prop); } else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and kaccept]
} return 1-MathUtils.cumBinomialProbLog(0,kaccept,depth,snpProp);
return new Pair(pow,medQ);
} else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and k]
for(int k = 0; k < kaccept; k++) {
pow += MathUtils.binomialProbabilityLog(k,depth,snp_prop);
}
return new Pair(1.0-pow,medQ);
} }
} }
public double getSNPProportion(int numSNPAlleles) {
public Pair<Double,Byte> boostrapSamplingPowerCalc(AlignmentContext context) return ((double) numSNPAlleles / (2.0 * num_individuals));
{
/* it is assumed that there are 2*num_individuals chromosomes in the pool
* we use a bootstrap method to calculate our power to detect a SNP with this
* depth of coverage
* Assume that only one of the individual chromosomes has a variant at this locus
*
*/
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
final int depth = reads.size();
Pair<Double,Byte> result;
final double single_snip_proportion = 1/(2.0*num_individuals);
if (depth <= 0) {
result = new Pair(-1,-1);
} else if (!use_bootstrap) { // object data from command line
result = powerTheoretical(depth, reads, offsets, single_snip_proportion);
} else { // otherwise, bootstrapping occurs below
int hypothesis_rejections=0;
for(int boot = 0; boot < BOOTSTRAP_ITERATIONS; boot++)
{
List<Byte> qscore_SNPs = new LinkedList<Byte>();
List<Byte> qscore_refs = new LinkedList<Byte>();
for (int strap = 0; strap < depth; strap++) {
int readOffset = randomlySelectRead(depth);
byte baseQuality = reads.get(readOffset).getBaseQualities()[offsets.get(readOffset)];
if (Math.random() < single_snip_proportion) {// occurs with probability "single_snip_proportion"
qscore_SNPs.add(baseQuality);
} }
else { // simulating a reference read
qscore_refs.add(baseQuality); public Pair<double[],byte[]> calculateDirectionalPowersByBootstrap(Pair<Pair<List<SAMRecord>,List<SAMRecord>>,Pair<List<Integer>,List<Integer>>> dirReads, double thresh, AlignmentContext context) {
double powerForward = bootstrapPower(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst(),thresh);
double powerReverse = bootstrapPower(dirReads.getFirst().getSecond(),dirReads.getSecond().getSecond(),thresh);
double powerCombined = bootstrapPower(context.getReads(),context.getOffsets(),thresh);
double[] powers = {powerForward, powerReverse, powerCombined};
return new Pair(powers,getMedianQScores(dirReads,context));
}
public double bootstrapPower(List<SAMRecord> reads, List<Integer> offsets, double thresh) {
double power = 0.0;
for ( int boot = 0; boot < BOOTSTRAP_ITERATIONS; boot++) {
Pair<Pair<List<SAMRecord>,List<SAMRecord>>,Pair<List<Integer>,List<Integer>>> snpReadsAndRefReads = coinTossPartition(reads,offsets,this.getSNPProportion(1));
if( calculateLogLikelihoodOfSample(snpReadsAndRefReads, num_individuals) > thresh) {
power += 1.0/BOOTSTRAP_ITERATIONS;
} }
} }
if (calculateLODByQLists(qscore_SNPs,qscore_refs) >= threshold) { return power;
hypothesis_rejections++;
} }
// class methods
public static Pair<Pair<List<SAMRecord>,List<SAMRecord>>,Pair<List<Integer>,List<Integer>>> coinTossPartition(List<SAMRecord> reads, List<Integer> offsets, double snpProb) {
Pair<Pair<List<SAMRecord>,List<SAMRecord>>,Pair<List<Integer>,List<Integer>>> partitionedReads;
if( reads == null || offsets == null ) {
partitionedReads = null;
} else {
ArrayList<SAMRecord> snpReads = new ArrayList();
ArrayList<SAMRecord> refReads = new ArrayList();
ArrayList<Integer> snpOff = new ArrayList();
ArrayList<Integer> refOff = new ArrayList();
for(int readNo = 0; readNo < reads.size(); readNo++) {
if ( Math.random() < snpProb) {
snpReads.add(reads.get(readNo));
snpOff.add(offsets.get(readNo));
} else {
refReads.add(reads.get(readNo));
refOff.add(offsets.get(readNo));
}
}
partitionedReads= new Pair(new Pair(snpReads,refReads), new Pair(snpOff, refOff));
} }
result = new Pair(((double)hypothesis_rejections)/BOOTSTRAP_ITERATIONS, ListUtils.getQScoreMedian(reads,offsets)); return partitionedReads;
} }
return result; public static double calculateLogLikelihoodOfSample(Pair<Pair<List<SAMRecord>,List<SAMRecord>>,Pair<List<Integer>,List<Integer>>> snpReadsRefReads, int nIndivids) {
List<Byte> qListSnps = getQList(snpReadsRefReads.getFirst().getFirst(),snpReadsRefReads.getSecond().getFirst());
List<Byte> qListRefs = getQList(snpReadsRefReads.getFirst().getSecond(),snpReadsRefReads.getSecond().getSecond());
Pair<Double,Double> logsumSNP = qListToSumLogProbabilities(true,qListSnps, 2.0*nIndivids);
Pair<Double,Double> logsumRef = qListToSumLogProbabilities(false,qListRefs, 2.0*nIndivids);
return 0.0 - logsumSNP.first - logsumRef.first + logsumSNP.second + logsumRef.second;
} }
public static List<Byte> getQList(List<SAMRecord> reads, List<Integer> offsets) {
public int randomlySelectRead(int depth) List<Byte> qscores = new LinkedList();
{ for(int readNo = 0; readNo < reads.size(); readNo++) {
return (int) Math.floor((double)depth * Math.random()); qscores.add(reads.get(readNo).getBaseQualities()[offsets.get(readNo)]);
} }
return qscores;
public double calculateLODByQLists(List<Byte> q_snp, List<Byte> q_ref)
{
/* LOD score calculation
* let Qs be the list q_snp and Qr q_ref and f be a function that
* converts Q-scores to probability of error, then the probability ratio
* is given by
*
* Product_{i=1}^{|Qs|}((1-f(Qs[i])/2n+(2n-1)f(Qs[i])/2n)*Product_{j=1}^{|Qr|}(((2n-1)(1-f(Qr[j]))+f(Qr[j]))/2n)
* ------------------------------------------------------------------------------------------------------------
* Product_{i=1}^{|Qs|}f(Qs[i])*Product_{j=1}^{|Qr|}(1-f(Qr[j]))
*
* since depth = |Qr|+|Qs| the binomial coefficients cancel so are not present in this formula
*
*
*
*
* the "LOD" used here is really a log-likelihood, just the logarithm of the above
*
* Helper functions compute all but the final sum (log of mult/div becomes add/subtract)
*
*/
Pair<Double,Double> logsumSNP = qListToSumLogProbabilities(true,q_snp);
Pair<Double,Double> logsumRef = qListToSumLogProbabilities(false,q_ref);
return 0 - logsumSNP.first - logsumRef.first + logsumSNP.second + logsumRef.second;
} }
public static Pair<Double,Double> qListToSumLogProbabilities(boolean listRepresentsSNPObservations, List<Byte> qList, double denom)
public Pair<Double,Double> qListToSumLogProbabilities(boolean listRepresentsSNPObservations, List<Byte> qList)
{ {
double logProbObserveXAndSNPTrue = 0; // note "error" for SNP is observing a ref double logProbObserveXAndSNPTrue = 0; // note "error" for SNP is observing a ref
double logProbObserveXAndRefTrue = 0;// and "error" for ref is observing a SNP double logProbObserveXAndRefTrue = 0;// and "error" for ref is observing a SNP
final double denom = 2*((double)num_individuals);
for (byte qual : qList) { for (byte qual : qList) {
double p_err = QualityUtils.qualToErrorProb(qual); double p_err = QualityUtils.qualToErrorProb(qual);
@ -236,6 +217,4 @@ public class CoverageAndPowerWalker extends LocusWalker<Integer, Pair<Long, Long
return new Pair<Double,Double>(logProbObserveXAndSNPTrue,logProbObserveXAndRefTrue); return new Pair<Double,Double>(logProbObserveXAndSNPTrue,logProbObserveXAndRefTrue);
} }
} }

View File

@ -0,0 +1,52 @@
package org.broadinstitute.sting.playground.utils;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.ArrayList;
import org.broadinstitute.sting.utils.Pair;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Aug 27, 2009
* Time: 12:31:08 PM
* To change this template use File | Settings | File Templates.
*/
public class PoolUtils {
private PoolUtils () {}
public static Pair<Pair<List<SAMRecord>, List<SAMRecord>>,Pair<List<Integer>,List<Integer>>> splitReadsByReadDirection(List<SAMRecord> reads, List<Integer> offsets) {
ArrayList<SAMRecord> forwardReads;
ArrayList<SAMRecord> reverseReads;
ArrayList<Integer> forwardOffsets;
ArrayList<Integer> reverseOffsets;
if ( reads == null) {
forwardReads = null;
reverseReads = null;
forwardOffsets = null;
reverseOffsets = null;
} else {
forwardReads = new ArrayList();
reverseReads = new ArrayList();
forwardOffsets = new ArrayList();
reverseOffsets = new ArrayList();
for ( int readNo = 0; readNo < reads.size(); readNo ++ ) {
if ( reads.get(readNo).getReadNegativeStrandFlag() ) {
forwardReads.add(reads.get(readNo));
forwardOffsets.add(offsets.get(readNo));
} else {
reverseReads.add(reads.get(readNo));
reverseOffsets.add(offsets.get(readNo));
}
}
}
return new Pair(new Pair(forwardReads,reverseReads), new Pair(forwardOffsets,reverseOffsets));
}
}

View File

@ -110,6 +110,9 @@ public class ListUtils {
// list index maps to a q-score only through the offset index // list index maps to a q-score only through the offset index
// returns the kth-largest q-score. // returns the kth-largest q-score.
if( reads.size() == 0) {
return 0;
}
ArrayList lessThanQReads = new ArrayList(); ArrayList lessThanQReads = new ArrayList();
ArrayList equalToQReads = new ArrayList(); ArrayList equalToQReads = new ArrayList();

View File

@ -170,6 +170,23 @@ public class MathUtils {
// which may require many sums. // which may require many sums.
} }
/**
* Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space.
* @param start - start of the cumulant sum (over hits)
* @param end - end of the cumulant sum (over hits)
* @param total - number of attempts for the number of hits
* @param probHit - probability of a successful hit
* @return - returns the cumulative probability
*/
public static double cumBinomialProbLog(int start, int end, int total, double probHit) {
double cumProb = 0.0;
for(int hits = start; hits < end; hits++) {
cumProb += binomialProbabilityLog(hits, total, probHit);
}
return cumProb;
}
/** /**
* Computes a multinomial. This is computed using the formula * Computes a multinomial. This is computed using the formula
* *