From 92ea947c33a3b37329eec2b262ebbefc29358699 Mon Sep 17 00:00:00 2001 From: chartl Date: Tue, 25 Aug 2009 21:27:50 +0000 Subject: [PATCH] Added binomialProbabilityLog(int k, int n, double p) to MathUtils: binomialProbabilityLog uses a log-space calculation of the binomial pmf to avoid the coefficient blowing up and thus returning Infinity or NaN (or in some very strange cases -Infinity). The log calculation compares very well, it seems, with our current method. It's in MathUtils but could stand testing against rigorous truth data before becoming standard. Added median calculator functions to ListUtils getQScoreMedian is a new utility I wrote that given reads and offsets will find the median Q score. While I was at it, I wrote a similar method, getMedian, which will return the median of any list of Comparables, independent of initial order. These are in ListUtils. Added a new poolseq directory and three walkers CoverageAndPowerWalker is built on top of the PrintCoverage walker and prints out the power to detect a mutant allele in a pool of 2*(number of individuals in the pool) alleles. It can be flagged either to do this by bootstrapping, or by pure math with a probability of error based on the median Q-score. This walker compiles, runs, and gives quite reasonable outputs that compare visually well to the power calculation computed by Syzygy. ArtificialPoolWalker is designed to take multiple single-sample .bam files and create a (random) artificial pool. The coverage of that pool is a user-defined proportion of the total coverage over all of the input files. The output is not only a new .bam file, but also an auxiliary file that has for each locus, the genotype of the individuals, the confidence of that call, and that person's representation in the artificial pool .bam at that locus. This walker compiles and, uhh, looks pretty. Needs some testing. AnalyzePowerWalker extends CoverageAndPowerWalker so that it can read previous power calculations (e.g. 
from Syzygy) and print them to the output file as well for direct downstream comparisons. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1460 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/poolseq/AnalyzePowerWalker.java | 104 +++++ .../walkers/poolseq/ArtificialPoolWalker.java | 358 ++++++++++++++++++ .../poolseq/CoverageAndPowerWalker.java | 262 +++++++++++++ .../broadinstitute/sting/utils/ListUtils.java | 80 +++- .../broadinstitute/sting/utils/MathUtils.java | 52 +++ 5 files changed, 855 insertions(+), 1 deletion(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificialPoolWalker.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java new file mode 100755 index 000000000..e9a57a6a9 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java @@ -0,0 +1,104 @@ +package org.broadinstitute.sting.playground.gatk.walkers.poolseq; + +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; + +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.BufferedReader; +import java.io.IOException; +import java.util.StringTokenizer; +import java.util.NoSuchElementException; +import java.rmi.NoSuchObjectException; + +/** + * Created by IntelliJ IDEA. 
+ * User: chartl + * Date: Aug 25, 2009 + * Time: 3:47:47 PM + * To change this template use File | Settings | File Templates. + */ +public class AnalyzePowerWalker extends CoverageAndPowerWalker{ + // runs CoverageAndPowerWalker except compares to Syzygy outputs in a file + + @Argument(fullName = "SyzygyOutputFile", shortName = "sof", doc="Syzygy output file to compare to", required = true) + String pathToSyzygyFile = null; + @Argument(fullName = "ColumnOffset", shortName = "co", doc = "Offset of column containing the power in the pf", required = true) + int colOffset = 0; + + BufferedReader syzyFileReader; + final String pfFileDelimiter = " "; + boolean outOfLinesInSyzyFile = false; + + @Override + public void initialize() + { + super.initialize(); + try { + syzyFileReader = new BufferedReader(new FileReader(pathToSyzygyFile)); + syzyFileReader.readLine(); + } catch (FileNotFoundException e) { + String newErrMsg = "Syzygy input file " + pathToSyzygyFile + " could be incorrect. File not found."; + throw new StingException(newErrMsg,e); + } catch (IOException e) { + String newErrMsg = "Syzygy input file error: could not read first line of "+pathToSyzygyFile; + throw new StingException(newErrMsg,e); + } + + } + + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) + { + if ( !super.suppress_printing ) + { + Pair powpair = super.boostrapSamplingPowerCalc(context); + + boolean syzyFileIsReady; + try { + syzyFileIsReady = syzyFileReader.ready(); + } + catch(IOException e) { + syzyFileIsReady = false; + } + + if(!syzyFileIsReady) { + throw new StingException("Input file reader was not ready before an attempt to read from it."); + } else if(!outOfLinesInSyzyFile) { + double syzyPow = getSyzyPowFromFile(); + out.printf("%s: %d %d %f %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first,syzyPow); + } else { + out.printf("%s: %d %d %f%n", context.getLocation(), 
context.getReads().size(),powpair.second,powpair.first); + } + } + + return context.getReads().size(); + } + + public double getSyzyPowFromFile() { + String thisLine = null; + try { + thisLine = syzyFileReader.readLine(); + } catch(IOException e) { + String newErrMsg = "Ran out of lines in the syzyfile; further output of Syzygy power will be suppressed."; + outOfLinesInSyzyFile=true; + logger.warn(newErrMsg + " " + e.toString()); + return -1.1; + } + + StringTokenizer lineTokenizer = new StringTokenizer(thisLine, pfFileDelimiter); + try { + for(int j = 0; j < colOffset; j++) { + lineTokenizer.nextToken(); + } + return (Double.valueOf(lineTokenizer.nextToken())/100.0); + } catch (NoSuchElementException e) { + String errMsg = "The given column offset for the pool, " + colOffset + " exceeded the number of entries in the file " + pathToSyzygyFile; + throw new StingException(errMsg); + } + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificialPoolWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificialPoolWalker.java new file mode 100755 index 000000000..ebfbc2d44 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificialPoolWalker.java @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2009 The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * TERMS OF USE: WE ARE NOT LIABLE FOR ANYTHING + */ + + +package org.broadinstitute.sting.playground.gatk.walkers.poolseq; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.SingleSampleGenotyper; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.genotype.GenotypeCall; + +import java.util.*; +import java.io.FileOutputStream; +import java.io.PrintWriter; +import java.io.FileNotFoundException; + +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMRecord; + +// Display the depth of coverage at a given locus and output the power + +public class ArtificialPoolWalker extends LocusWalker[], SAMFileWriter> { + @Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false) + public boolean suppressPrinting = false; + @Argument(fullName="reductionProportion", shortName="rp", doc="Proportion of total coverage reflected in pool file",required=false) + public double red_prop = -1; + @Argument(fullName="auxOutputFile", shortName="af", doc="Filepath for Auxiliary pool information (true genotypes, etc.)", required=true) + public String auxFilepath = null; + @Argument(fullName="outputBamFile",shortName="of",doc="Output to this file rather than to standard output") + SAMFileWriter outputBamFile = null; + + private List> readGroupSets; + private PrintWriter auxWrite; + private Pair[] local_genotypes; + private LinkedList[] living_reads; + private SingleSampleGenotyper ssg; + private int npeople; + //@param local_genotypes - holds the genotype (A A/ A C/ etc) for each individual. Updates at each locus. 
+ //@param auxWrite - the writer to the auxiliary file + //@param readGroupSets : holds the readgroups (identifiers for individuals from each read) + //@param ssg - the instance of the single sample genotyper + //@param living_reads - holds on to the selected reads from each person until the location passes the read endpoint + // Updates at each locus. + //@param npeople - number of people represented in this pool (size of readGroupSets) + + + public void initialize() { + // initialize the output writer + try { + auxWrite = new PrintWriter(new FileOutputStream(auxFilepath)); + } + catch(FileNotFoundException e) { + String ermsg = "auxiliary file for Artificial Pool Walker was either a folder or could not be created"; + throw new StingException(ermsg, e); + } + + // create the file header and write to file + + auxWrite.printf("%s%n",createFileHeader()); + + // obtain the readgroup sets + + readGroupSets = getToolkit().getMergedReadGroupsByReaders(); + npeople = readGroupSets.size(); + + // if a reduction proportion was unspecified, set it to 1/num_files + + if(red_prop <= 0) { + red_prop = 1.0/npeople; + } else { + // do nothing muhahaha + } + + // initialize the local genotype array + + local_genotypes = new Pair[npeople]; + + // initilize the single sample genotyper + + ssg = new SingleSampleGenotyper(); + ssg.initialize(); + + // initialize the living reads array + + living_reads = new LinkedList[npeople]; + + for(int iter=0; iter < readGroupSets.size(); iter++) { + living_reads[iter] = new LinkedList(); + } + } + + public List[] map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + updateLiving(); + // each time we move to the next locus, remove from the coverage count those reads that ended + auxWrite.printf("%s:%s",context.getContig(),context.getPosition()); + + return getNewReadsAndGenotypesByGroup(tracker, ref, context); + } + + public SAMFileWriter reduceInit() { + + return outputBamFile; + } + + public SAMFileWriter 
reduce(List[] readsByReadGroup, SAMFileWriter outFile) { + // want to generate a sub sample of the new reads + int total_new_coverage = 0; + + for(int pers = 0; pers < npeople; pers++) { + total_new_coverage += readsByReadGroup[pers].size(); + } + + int sought_coverage = (int) Math.ceil(red_prop * (double)total_new_coverage); + List[] randomReadsByGroup = drawReadsRandomlyFromReadsByGroup(readsByReadGroup,sought_coverage); + printToFileAndAuxFile(randomReadsByGroup,sought_coverage,outFile); + return outFile; + + } + + + + // ArtificalPoolWalker helper methods begin below + + private List[] drawReadsRandomlyFromReadsByGroup(List[] readsByReadGroup, int numToDraw){ + + int[] simulatedCoverage = simulateRandomUniformDraws(numToDraw, readsByReadGroup); + + return removeReadsToCoverage(readsByReadGroup,simulatedCoverage); + } + + private List[] removeReadsToCoverage(List[] readsByReadGroup, int[] simulatedCoverage) { + for(int group = 0; group < npeople; group++) { + int cvgAtGroup = readsByReadGroup[group].size(); + for(int drawDown = simulatedCoverage[group]; drawDown > 0; drawDown --) { + readsByReadGroup[group].remove((int)Math.floor((double) cvgAtGroup*Math.random())); + cvgAtGroup--; + } + } + + return readsByReadGroup; + } + + private int[] simulateRandomUniformDraws(int numToDraw, List[] readsByReadGroup) { + int[] drawsByPerson = new int[npeople]; + + for(int pers = 0; pers < npeople; pers++) { //array instantiation + drawsByPerson[pers] = 0; + } + + for(int draw = 0; draw < numToDraw; draw++) { + int personToDrawFrom = (int) Math.floor((double) npeople * Math.random()); + drawsByPerson[personToDrawFrom]++; + } + + return redistributeSimulatedCoverage(drawsByPerson,readsByReadGroup,0, 0, null); + } + + private int[] redistributeSimulatedCoverage(int[] drawsByPerson, List[] readsByReadGroup, int excess, int nOffLimits, boolean[] offLimits) { + int [] result; + if(offLimits == null) { // first time calling + offLimits = new boolean[npeople]; + for(int i = 0; i < 
npeople; i++) { + offLimits[i] = false; + } + result = redistributeSimulatedCoverage(drawsByPerson,readsByReadGroup,excess,0, offLimits); + } else if (excess == 0) { // no overt excess, need to check individual persons + // get excess coverage + int newExcess = 0; + + for(int pers = 0; pers < npeople; pers++) { + if(!offLimits[pers] && drawsByPerson[pers] > readsByReadGroup[pers].size()) { + newExcess += drawsByPerson[pers] - readsByReadGroup[pers].size(); + drawsByPerson[pers] = readsByReadGroup[pers].size(); + offLimits[pers] = true; + nOffLimits++; + } else { + // do nothing + } + } + + if(newExcess == 0) { // we are done! + result = drawsByPerson; + } else { // there is now overt excess + result = redistributeSimulatedCoverage(drawsByPerson,readsByReadGroup,newExcess,nOffLimits,offLimits); + } + } else { // there are overt excess reads to distribute + int nRemaining = npeople - nOffLimits; + int[] drawsToAdd = new int[nRemaining]; + + for(int j = 0; j < nRemaining; j++) { + drawsToAdd[j] = 0; + } + + for(int draw = excess; draw > 0; draw--) { + drawsToAdd[(int) Math.floor((double)nRemaining * Math.random())]++; + } + + int notOffLimitsCounter = 0; + + for(int pers = 0; pers < npeople; pers++) { + if(! 
offLimits[pers]) { + drawsByPerson[pers] += drawsToAdd[notOffLimitsCounter]; + notOffLimitsCounter++; + } else { + // do nothing + } + } + result = redistributeSimulatedCoverage(drawsByPerson,readsByReadGroup,0,nOffLimits,offLimits); + } + return result; + } + + private int[] createNewPersonIndex(int peopleWithReadsLeft, int[] indexLeft, int indexToRemoveFrom) { + int[] newPersonIndex = new int[peopleWithReadsLeft-1]; + for(int i = 0; i < peopleWithReadsLeft; i++) { + if(i < indexToRemoveFrom) { + newPersonIndex[i] = indexLeft[i]; + } else { + newPersonIndex[i] = indexLeft[i+1]; + } + } + return newPersonIndex; + } + + + private List[] getNewReadsAndGenotypesByGroup(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) + { + // at the same time we will: + // 1) split reads up by read group + // 2) get new reads + // 3) get genotypes for each individual + // then we will: + // return the new reads (a subsample), stratified by read group + + List allReads = context.getReads(); + int n_reads_total = allReads.size(); + List allOffsets = context.getOffsets(); + ListIterator readIterator = allReads.listIterator(); + ListIterator offsetIterator = allOffsets.listIterator(); + LinkedList[] reads_by_group = new LinkedList[npeople]; + LinkedList[] offsets_by_group = new LinkedList[npeople]; + LinkedList[] new_reads_by_group = new LinkedList[npeople]; + + for(int j = 0; j < npeople; j++) { + reads_by_group[j] = new LinkedList(); + offsets_by_group[j] = new LinkedList(); + new_reads_by_group[j] = new LinkedList(); + } + + while(readIterator.hasNext()) { + int iter_person = 0; + SAMRecord read = (SAMRecord) readIterator.next(); + int offset = (Integer) offsetIterator.next(); + while(!readGroupSets.get(iter_person).contains((String) read.getAttribute("RG"))) { + iter_person++; + // for debug purposes only: + if(iter_person > npeople) { + throw new StingException("Read not found in any read group in ArtificialPoolWalker. 
Life sucks."); + } + } + // here, iter_person is the person from whom the read came so: + reads_by_group[iter_person].add(read); + offsets_by_group[iter_person].add(offset); + + // check to see if the read is new + if(offset == 0) { + new_reads_by_group[iter_person].add(read); + // add this to the new reads set to be randomly selected and printed + } + } + // now we obtain the genotypes + for(int iter=0; iter[] downSampledReadsByGroup, int cvg, SAMFileWriter outFile) + { + // NOTE: the chromosome and position were already printed in the auxFile during map. + for(int pers = 0; pers < npeople; pers++) { + List personReads = downSampledReadsByGroup[pers]; + for(SAMRecord read : personReads) { + outFile.addAlignment(read); + living_reads[pers].add(read.getReadLength()); + } + } + String auxformstring = " %s %f %d"; + // now print to the aux file + int totalcvg = 0; + for(int pers = 0; pers < npeople; pers++) { + auxWrite.printf(auxformstring, local_genotypes[pers].first, local_genotypes[pers].second,living_reads[pers].size()); + totalcvg += living_reads[pers].size(); + } + auxWrite.printf(" %d%n", totalcvg); + + } + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java new file mode 100755 index 000000000..61db70583 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java @@ -0,0 +1,262 @@ +package org.broadinstitute.sting.playground.gatk.walkers.poolseq; + +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.ListUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import net.sf.samtools.SAMRecord; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Aug 18, 2009 + * Time: 11:57:07 AM + * To change this template use File | Settings | File Templates. + */ +public class CoverageAndPowerWalker extends LocusWalker> { + @Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false) + public boolean suppress_printing = false; + + @Argument(fullName="poolSize", shortName="ps", doc="Number of individuals in pool", required=true) + public int num_individuals = 0; + + @Argument(fullName="bootStrap", shortName="bs", doc="Use a bootstrap method", required=false) + public boolean use_bootstrap = false; + + @Argument(fullName="lodThreshold", shortName="lt", doc="Threshold for LOD score for calls") + public double threshold = 3.0; + + + + private static final int BOOTSTRAP_ITERATIONS = 300; + + + @Override + public void initialize() + { + + if(num_individuals <= 0) + throw new IllegalArgumentException("Positive nonzero parameter expected for poolSize"); + } + + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) + { + if ( !suppress_printing ) + { + Pair powpair = boostrapSamplingPowerCalc(context); + out.printf("%s: %d %d %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first); + } + + return context.getReads().size(); + } + + + public boolean isReduceByInterval() { + return true; + } + + + public Pair reduceInit() { + + return new Pair(0l,0l); } + + + public Pair reduce(Integer value, Pair sum) + { + long left = value.longValue() + sum.getFirst(); + long right = sum.getSecond() + 1l; + return new Pair(left, right); + } + + + public void onTraversalDone(Pair result) { + out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n", + ((double)result.getFirst() / 
(double)result.getSecond()), result.getFirst(), result.getSecond()); + } + + /* + * + * Helper Methods Below + * + */ + + public Pair powerTheoretical(int depth, List reads, List offsets, double snp_prop) + { + // get the median Q score for these reads + + byte medQ = ListUtils.getQScoreMedian(reads,offsets); + // System.out.println("Median Q: " + medQ); //TODO: remove this line + double p_error = QualityUtils.qualToErrorProb(medQ); + + // variable names from here on out come from the mathematics of a log likelihood test + // with binomial probabilities, of observing k bases consistent with the same SNP + // given that SNP occurs in one of the alleles, versus it does not occur. + // Numerator and denominator will each have a term raised to the kth power + // and a term raised to the (depth - kth) (or d-kth) power. Thus the names. + // Taking a log then brings those powers out as coefficients, where they can be solved for. + + + double kterm_num = Math.log10(snp_prop * (1 - p_error) + (1 - snp_prop) * (p_error/3)); + double kterm_denom = Math.log10(p_error/3); + double dkterm_num = Math.log10(snp_prop*(p_error/3) + (1 - snp_prop) * (1 - p_error)); + double dkterm_denom = Math.log10(1 - p_error); + + int kaccept = (int) Math.ceil((threshold-((double)depth)*(dkterm_num-dkterm_denom))/(kterm_num-kterm_denom-dkterm_num+dkterm_denom)); + + // we will reject the null hypothesis if we see kaccept or more SNPs, the power is the probability that this occurs + // we can optimize this by checking to see which sum is smaller + + double pow = 0; + + if(depth - kaccept < kaccept) {// kaccept > depth/2 - calculate power as P[hits between k and depth] + + for(int k = kaccept; k < depth; k++) { + + pow += MathUtils.binomialProbabilityLog(k, depth, snp_prop); + } + + return new Pair(pow,medQ); + } else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and k] + for(int k = 0; k < kaccept; k++) { + pow += MathUtils.binomialProbabilityLog(k,depth,snp_prop); + } + + return 
new Pair(1.0-pow,medQ); + } + } + + + public Pair boostrapSamplingPowerCalc(AlignmentContext context) + { + /* it is assumed that there are 2*num_individuals chromosomes in the pool + * we use a bootstrap method to calculate our power to detect a SNP with this + * depth of coverage + * Assume that only one of the individual chromosomes has a variant at this locus + * + */ + + List reads = context.getReads(); + List offsets = context.getOffsets(); + + final int depth = reads.size(); + Pair result; + final double single_snip_proportion = 1/(2.0*num_individuals); + + if (depth <= 0) { + result = new Pair(-1,-1); + } else if (!use_bootstrap) { // object data from command line + + result = powerTheoretical(depth, reads, offsets, single_snip_proportion); + } else { + + // + // otherwise, bootstrapping occurs below + // + + int hypothesis_rejections=0; + + for(int boot = 0; boot < BOOTSTRAP_ITERATIONS; boot++) + { + List qscore_SNPs = new LinkedList(); + List qscore_refs = new LinkedList(); + + for (int strap = 0; strap < depth; strap++) { + int readOffset = randomlySelectRead(depth); + byte baseQuality = reads.get(readOffset).getBaseQualities()[offsets.get(readOffset)]; + + if (Math.random() < single_snip_proportion) {// occurs with probability "single_snip_proportion" + qscore_SNPs.add(baseQuality); + } + else { // simulating a reference read + qscore_refs.add(baseQuality); + } + } + + if (calculateLODByQLists(qscore_SNPs,qscore_refs) >= threshold) { + hypothesis_rejections++; + } + + } + + result = new Pair(((double)hypothesis_rejections)/BOOTSTRAP_ITERATIONS, ListUtils.getQScoreMedian(reads,offsets)); + } + + return result; + + } + + + public int randomlySelectRead(int depth) + { + double qscore_selector = Math.random(); + int readspositionrandom; + for(readspositionrandom = 1; readspositionrandom < ((double)depth * qscore_selector); readspositionrandom ++) { + if(readspositionrandom > depth + 1) { + throw new RuntimeException("qscore iterator exceeding possible 
thresholds"); + } + } + + return readspositionrandom - 1; + } + + + public double calculateLODByQLists(List q_snp, List q_ref) + { + /* LOD score calculation + * let Qs be the list q_snp and Qr q_ref and f be a function that + * converts Q-scores to probability of error, then the probability ratio + * is given by + * + * Product_{i=1}^{|Qs|}((1-f(Qs[i])/2n+(2n-1)f(Qs[i])/2n)*Product_{j=1}^{|Qr|}(((2n-1)(1-f(Qr[j]))+f(Qr[j]))/2n) + * ------------------------------------------------------------------------------------------------------------ + * Product_{i=1}^{|Qs|}f(Qs[i])*Product_{j=1}^{|Qr|}(1-f(Qr[j])) + * + * since depth = |Qr|+|Qs| the binomial coefficients cancel so are not present in this formula + * + * + * + * + * the "LOD" used here is really a log-likelihood, just the logarithm of the above + * + * Helper functions compute all but the final sum (log of mult/div becomes add/subtract) + * + */ + + Pair logsumSNP = qListToSumLogProbabilities(true,q_snp); + Pair logsumRef = qListToSumLogProbabilities(false,q_ref); + + return 0 - logsumSNP.first - logsumRef.first + logsumSNP.second + logsumRef.second; + } + + + public Pair qListToSumLogProbabilities(boolean listRepresentsSNPObservations, List qList) + { + double logProbObserveXAndSNPTrue = 0; // note "error" for SNP is observing a ref + double logProbObserveXAndRefTrue = 0;// and "error" for ref is observing a SNP + final double denom = 2*((double)num_individuals); + + for (byte qual : qList) { + double p_err = QualityUtils.qualToErrorProb(qual); + if (listRepresentsSNPObservations) { + logProbObserveXAndSNPTrue += Math.log10((1 - p_err) / denom +((denom - 1)*p_err) / denom); + logProbObserveXAndRefTrue += Math.log10(p_err); + } else { + logProbObserveXAndSNPTrue += Math.log10((denom - 1) * (1 - p_err)/denom + p_err/denom); + logProbObserveXAndRefTrue+= Math.log10(1 -p_err); + } + } + + return new Pair(logProbObserveXAndSNPTrue,logProbObserveXAndRefTrue); + } + + + +} \ No newline at end of file diff --git 
a/java/src/org/broadinstitute/sting/utils/ListUtils.java b/java/src/org/broadinstitute/sting/utils/ListUtils.java index b79b2ab0a..7ad03a03a 100644 --- a/java/src/org/broadinstitute/sting/utils/ListUtils.java +++ b/java/src/org/broadinstitute/sting/utils/ListUtils.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils; import java.util.*; +import net.sf.samtools.SAMRecord; + /** * Created by IntelliJ IDEA. * User: andrew @@ -66,9 +68,85 @@ public class ListUtils { for (int i : indices) { subset.add(list.get(i)); } - + return subset; } + public static Comparable orderStatisticSearch(int orderStat, List list) { + // this finds the order statistic of the list (kth largest element) + // the list is assumed *not* to be sorted + + final Comparable x = list.get(orderStat); + ListIterator iterator = list.listIterator(); + ArrayList lessThanX = new ArrayList(); + ArrayList equalToX = new ArrayList(); + ArrayList greaterThanX = new ArrayList(); + + for(Comparable y : list) { + if(x.compareTo(y) > 0) { + lessThanX.add(y); + } else if(x.compareTo(y) < 0) { + greaterThanX.add(y); + } else + equalToX.add(y); + } + + if(lessThanX.size() > orderStat) + return orderStatisticSearch(orderStat, lessThanX); + else if(lessThanX.size() + equalToX.size() >= orderStat) + return orderStat; + else + return orderStatisticSearch(orderStat - lessThanX.size() - equalToX.size(), greaterThanX); + + } + + + public static Object getMedian(List list) { + return orderStatisticSearch((int) Math.ceil(list.size()/2), list); + } + + public static byte getQScoreOrderStatistic(List reads, List offsets, int k) { + // version of the order statistic calculator for SAMRecord/Integer lists, where the + // list index maps to a q-score only through the offset index + // returns the kth-largest q-score. 
+ + + ArrayList lessThanQReads = new ArrayList(); + ArrayList equalToQReads = new ArrayList(); + ArrayList greaterThanQReads = new ArrayList(); + ArrayList lessThanQOffsets = new ArrayList(); + ArrayList greaterThanQOffsets = new ArrayList(); + + final byte qk = reads.get(k).getBaseQualities()[offsets.get(k)]; + + for(int iter = 0; iter < reads.size(); iter ++) { + SAMRecord read = reads.get(iter); + int offset = offsets.get(iter); + byte quality = read.getBaseQualities()[offset]; + + if(quality < qk) { + lessThanQReads.add(read); + lessThanQOffsets.add(offset); + } else if(quality > qk) { + greaterThanQReads.add(read); + greaterThanQOffsets.add(offset); + } else { + equalToQReads.add(reads.get(iter)); + } + } + + if(lessThanQReads.size() > k) + return getQScoreOrderStatistic(lessThanQReads, lessThanQOffsets, k); + else if(equalToQReads.size() + lessThanQReads.size() >= k) + return qk; + else + return getQScoreOrderStatistic(greaterThanQReads, greaterThanQOffsets, k - lessThanQReads.size() - equalToQReads.size()); + + } + + + public static byte getQScoreMedian(List reads, List offsets) { + return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); + } } diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 0edd6427c..4f48b1e61 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -8,6 +8,17 @@ import cern.jet.math.Arithmetic; * @author Kiran Garimella */ public class MathUtils { + + /** Public constants - used for the Lanczos approximation to the factorial function + * (for the calculation of the binomial/multinomial probability in logspace) + * @param LANC_SEQ[] - an array holding the constants which correspond to the product + * of Chebyshev Polynomial coefficients, and points on the Gamma function (for interpolation) + * [see A Precision Approximation of the Gamma Function J. SIAM Numer. 
Anal. Ser. B, Vol. 1 1964. pp. 86-96] + * @param LANC_G - a value for the Lanczos approximation to the gamma function that works to + * high precision + */ + + /** Private constructor. No instantiating this class! */ private MathUtils() {} @@ -120,6 +131,45 @@ public class MathUtils { //return (new cern.jet.random.Binomial(n, p, cern.jet.random.engine.RandomEngine.makeDefault())).pdf(k); } + /** + * Performs the calculation for a binomial probability in logspace, preventing the blowups of the binomial coefficient + * consistent with moderately large n and k. + * + * @param k - number of successes/observations + * @param n - number of Bernoulli trials + * @param p - probability of success/observations + * + * @return the probability mass associated with a binomial mass function for the above configuration + */ + public static double binomialProbabilityLog(int k, int n, double p) { + + double log_coef = 0.0; + int min; + int max; + + if(k < n - k) { + min = k; + max = n-k; + } else { + min = n - k; + max = k; + } + + for(int i=2; i <= min; i++) { + log_coef -= Math.log((double)i); + } + + for(int i = max+1; i <= n; i++) { + log_coef += Math.log((double)i); + } + + return Math.exp(log_coef + ((double)k)*Math.log(p) + ((double)(n-k))*Math.log(1-p)); + + // in the future, we may want to precompile a table of exact values, where the entry (a,b) indicates + // the sum of log(i) from i = (1+(50*a)) to i = 50+(50*b) to increase performance on binomial coefficients + // which may require many sums. + } + /** * Computes a multinomial. This is computed using the formula * @@ -194,4 +244,6 @@ public class MathUtils { rms /= x.length; return Math.sqrt(rms); } + + }