diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java new file mode 100755 index 000000000..e9a57a6a9 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java @@ -0,0 +1,104 @@ +package org.broadinstitute.sting.playground.gatk.walkers.poolseq; + +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; + +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.BufferedReader; +import java.io.IOException; +import java.util.StringTokenizer; +import java.util.NoSuchElementException; +import java.rmi.NoSuchObjectException; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Aug 25, 2009 + * Time: 3:47:47 PM + * To change this template use File | Settings | File Templates. + */ +public class AnalyzePowerWalker extends CoverageAndPowerWalker{ + // runs CoverageAndPowerWalker except compares to Syzygy outputs in a file + + @Argument(fullName = "SyzygyOutputFile", shortName = "sof", doc="Syzygy output file to compare to", required = true) + String pathToSyzygyFile = null; + @Argument(fullName = "ColumnOffset", shortName = "co", doc = "Offset of column containing the power in the pf", required = true) + int colOffset = 0; + + BufferedReader syzyFileReader; + final String pfFileDelimiter = " "; + boolean outOfLinesInSyzyFile = false; + + @Override + public void initialize() + { + super.initialize(); + try { + syzyFileReader = new BufferedReader(new FileReader(pathToSyzygyFile)); + syzyFileReader.readLine(); + } catch (FileNotFoundException e) { + String newErrMsg = "Syzygy input file " + pathToSyzygyFile + " could be incorrect. File not found."; + throw new StingException(newErrMsg,e); + } catch (IOException e) { + String newErrMsg = "Syzygy input file error: could not read first line of "+pathToSyzygyFile; + throw new StingException(newErrMsg,e); + } + + } + + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) + { + if ( !super.suppress_printing ) + { + Pair powpair = super.boostrapSamplingPowerCalc(context); + + boolean syzyFileIsReady; + try { + syzyFileIsReady = syzyFileReader.ready(); + } + catch(IOException e) { + syzyFileIsReady = false; + } + + if(!syzyFileIsReady) { + throw new StingException("Input file reader was not ready before an attempt to read from it."); + } else if(!outOfLinesInSyzyFile) { + double syzyPow = getSyzyPowFromFile(); + out.printf("%s: %d %d %f %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first,syzyPow); + } else { + out.printf("%s: %d %d %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first); + } + } + + return context.getReads().size(); + } + + public double getSyzyPowFromFile() { + String thisLine = null; + try { + thisLine = syzyFileReader.readLine(); + } catch(IOException e) { + String newErrMsg = "Ran out of lines in the syzyfile; further output of Syzygy power will be suppressed."; + outOfLinesInSyzyFile=true; + logger.warn(newErrMsg + " " + e.toString()); + return -1.1; + } + + StringTokenizer lineTokenizer = new StringTokenizer(thisLine, pfFileDelimiter); + try { + for(int j = 0; j < colOffset; j++) { + lineTokenizer.nextToken(); + } + return (Double.valueOf(lineTokenizer.nextToken())/100.0); + } catch (NoSuchElementException e) { + String errMsg = "The given column offset for the pool, " + colOffset + " exceeded the number of entries in the file " + pathToSyzygyFile; + throw new StingException(errMsg); + } + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificialPoolWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificialPoolWalker.java new file mode 100755 index 000000000..ebfbc2d44 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificialPoolWalker.java @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2009 The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * TERMS OF USE: WE ARE NOT LIABLE FOR ANYTHING + */ + + +package org.broadinstitute.sting.playground.gatk.walkers.poolseq; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.SingleSampleGenotyper; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.genotype.GenotypeCall; + +import java.util.*; +import java.io.FileOutputStream; +import java.io.PrintWriter; +import java.io.FileNotFoundException; + +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMRecord; + +// Display the depth of coverage at a given locus and output the power + +public class ArtificialPoolWalker extends LocusWalker[], SAMFileWriter> { + @Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false) + public boolean suppressPrinting = false; + @Argument(fullName="reductionProportion", shortName="rp", doc="Proportion of total coverage reflected in pool file",required=false) + public double red_prop = -1; + @Argument(fullName="auxOutputFile", shortName="af", doc="Filepath for Auxiliary pool information (true genotypes, etc.)", required=true) + public String auxFilepath = null; + @Argument(fullName="outputBamFile",shortName="of",doc="Output to this file rather than to standard output") + SAMFileWriter outputBamFile = null; + + private List> readGroupSets; + private PrintWriter auxWrite; + private Pair[] local_genotypes; + private LinkedList[] living_reads; + private SingleSampleGenotyper ssg; + private int npeople; + //@param local_genotypes - holds the genotype (A A/ A C/ etc) for each individual. Updates at each locus. + //@param auxWrite - the writer to the auxiliary file + //@param readGroupSets : holds the readgroups (identifiers for individuals from each read) + //@param ssg - the instance of the single sample genotyper + //@param living_reads - holds on to the selected reads from each person until the location passes the read endpoint + // Updates at each locus. + //@param npeople - number of people represented in this pool (size of readGroupSets) + + + public void initialize() { + // initialize the output writer + try { + auxWrite = new PrintWriter(new FileOutputStream(auxFilepath)); + } + catch(FileNotFoundException e) { + String ermsg = "auxiliary file for Artificial Pool Walker was either a folder or could not be created"; + throw new StingException(ermsg, e); + } + + // create the file header and write to file + + auxWrite.printf("%s%n",createFileHeader()); + + // obtain the readgroup sets + + readGroupSets = getToolkit().getMergedReadGroupsByReaders(); + npeople = readGroupSets.size(); + + // if a reduction proportion was unspecified, set it to 1/num_files + + if(red_prop <= 0) { + red_prop = 1.0/npeople; + } else { + // do nothing muhahaha + } + + // initialize the local genotype array + + local_genotypes = new Pair[npeople]; + + // initilize the single sample genotyper + + ssg = new SingleSampleGenotyper(); + ssg.initialize(); + + // initialize the living reads array + + living_reads = new LinkedList[npeople]; + + for(int iter=0; iter < readGroupSets.size(); iter++) { + living_reads[iter] = new LinkedList(); + } + } + + public List[] map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + updateLiving(); + // each time we move to the next locus, remove from the coverage count those reads that ended + auxWrite.printf("%s:%s",context.getContig(),context.getPosition()); + + return getNewReadsAndGenotypesByGroup(tracker, ref, context); + } + + public SAMFileWriter reduceInit() { + + return outputBamFile; + } + + public SAMFileWriter reduce(List[] readsByReadGroup, SAMFileWriter outFile) { + // want to generate a sub sample of the new reads + int total_new_coverage = 0; + + for(int pers = 0; pers < npeople; pers++) { + total_new_coverage += readsByReadGroup[pers].size(); + } + + int sought_coverage = (int) Math.ceil(red_prop * (double)total_new_coverage); + List[] randomReadsByGroup = drawReadsRandomlyFromReadsByGroup(readsByReadGroup,sought_coverage); + printToFileAndAuxFile(randomReadsByGroup,sought_coverage,outFile); + return outFile; + + } + + + + // ArtificalPoolWalker helper methods begin below + + private List[] drawReadsRandomlyFromReadsByGroup(List[] readsByReadGroup, int numToDraw){ + + int[] simulatedCoverage = simulateRandomUniformDraws(numToDraw, readsByReadGroup); + + return removeReadsToCoverage(readsByReadGroup,simulatedCoverage); + } + + private List[] removeReadsToCoverage(List[] readsByReadGroup, int[] simulatedCoverage) { + for(int group = 0; group < npeople; group++) { + int cvgAtGroup = readsByReadGroup[group].size(); + for(int drawDown = simulatedCoverage[group]; drawDown > 0; drawDown --) { + readsByReadGroup[group].remove((int)Math.floor((double) cvgAtGroup*Math.random())); + cvgAtGroup--; + } + } + + return readsByReadGroup; + } + + private int[] simulateRandomUniformDraws(int numToDraw, List[] readsByReadGroup) { + int[] drawsByPerson = new int[npeople]; + + for(int pers = 0; pers < npeople; pers++) { //array instantiation + drawsByPerson[pers] = 0; + } + + for(int draw = 0; draw < numToDraw; draw++) { + int personToDrawFrom = (int) Math.floor((double) npeople * Math.random()); + drawsByPerson[personToDrawFrom]++; + } + + return redistributeSimulatedCoverage(drawsByPerson,readsByReadGroup,0, 0, null); + } + + private int[] redistributeSimulatedCoverage(int[] drawsByPerson, List[] readsByReadGroup, int excess, int nOffLimits, boolean[] offLimits) { + int [] result; + if(offLimits == null) { // first time calling + offLimits = new boolean[npeople]; + for(int i = 0; i < npeople; i++) { + offLimits[i] = false; + } + result = redistributeSimulatedCoverage(drawsByPerson,readsByReadGroup,excess,0, offLimits); + } else if (excess == 0) { // no overt excess, need to check individual persons + // get excess coverage + int newExcess = 0; + + for(int pers = 0; pers < npeople; pers++) { + if(!offLimits[pers] && drawsByPerson[pers] > readsByReadGroup[pers].size()) { + newExcess += drawsByPerson[pers] - readsByReadGroup[pers].size(); + drawsByPerson[pers] = readsByReadGroup[pers].size(); + offLimits[pers] = true; + nOffLimits++; + } else { + // do nothing + } + } + + if(newExcess == 0) { // we are done! + result = drawsByPerson; + } else { // there is now overt excess + result = redistributeSimulatedCoverage(drawsByPerson,readsByReadGroup,newExcess,nOffLimits,offLimits); + } + } else { // there are overt excess reads to distribute + int nRemaining = npeople - nOffLimits; + int[] drawsToAdd = new int[nRemaining]; + + for(int j = 0; j < nRemaining; j++) { + drawsToAdd[j] = 0; + } + + for(int draw = excess; draw > 0; draw--) { + drawsToAdd[(int) Math.floor((double)nRemaining * Math.random())]++; + } + + int notOffLimitsCounter = 0; + + for(int pers = 0; pers < npeople; pers++) { + if(! offLimits[pers]) { + drawsByPerson[pers] += drawsToAdd[notOffLimitsCounter]; + notOffLimitsCounter++; + } else { + // do nothing + } + } + result = redistributeSimulatedCoverage(drawsByPerson,readsByReadGroup,0,nOffLimits,offLimits); + } + return result; + } + + private int[] createNewPersonIndex(int peopleWithReadsLeft, int[] indexLeft, int indexToRemoveFrom) { + int[] newPersonIndex = new int[peopleWithReadsLeft-1]; + for(int i = 0; i < peopleWithReadsLeft; i++) { + if(i < indexToRemoveFrom) { + newPersonIndex[i] = indexLeft[i]; + } else { + newPersonIndex[i] = indexLeft[i+1]; + } + } + return newPersonIndex; + } + + + private List[] getNewReadsAndGenotypesByGroup(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) + { + // at the same time we will: + // 1) split reads up by read group + // 2) get new reads + // 3) get genotypes for each individual + // then we will: + // return the new reads (a subsample), stratified by read group + + List allReads = context.getReads(); + int n_reads_total = allReads.size(); + List allOffsets = context.getOffsets(); + ListIterator readIterator = allReads.listIterator(); + ListIterator offsetIterator = allOffsets.listIterator(); + LinkedList[] reads_by_group = new LinkedList[npeople]; + LinkedList[] offsets_by_group = new LinkedList[npeople]; + LinkedList[] new_reads_by_group = new LinkedList[npeople]; + + for(int j = 0; j < npeople; j++) { + reads_by_group[j] = new LinkedList(); + offsets_by_group[j] = new LinkedList(); + new_reads_by_group[j] = new LinkedList(); + } + + while(readIterator.hasNext()) { + int iter_person = 0; + SAMRecord read = (SAMRecord) readIterator.next(); + int offset = (Integer) offsetIterator.next(); + while(!readGroupSets.get(iter_person).contains((String) read.getAttribute("RG"))) { + iter_person++; + // for debug purposes only: + if(iter_person > npeople) { + throw new StingException("Read not found in any read group in ArtificialPoolWalker. Life sucks."); + } + } + // here, iter_person is the person from whom the read came so: + reads_by_group[iter_person].add(read); + offsets_by_group[iter_person].add(offset); + + // check to see if the read is new + if(offset == 0) { + new_reads_by_group[iter_person].add(read); + // add this to the new reads set to be randomly selected and printed + } + } + // now we obtain the genotypes + for(int iter=0; iter[] downSampledReadsByGroup, int cvg, SAMFileWriter outFile) + { + // NOTE: the chromosome and position were already printed in the auxFile during map. + for(int pers = 0; pers < npeople; pers++) { + List personReads = downSampledReadsByGroup[pers]; + for(SAMRecord read : personReads) { + outFile.addAlignment(read); + living_reads[pers].add(read.getReadLength()); + } + } + String auxformstring = " %s %f %d"; + // now print to the aux file + int totalcvg = 0; + for(int pers = 0; pers < npeople; pers++) { + auxWrite.printf(auxformstring, local_genotypes[pers].first, local_genotypes[pers].second,living_reads[pers].size()); + totalcvg += living_reads[pers].size(); + } + auxWrite.printf(" %d%n", totalcvg); + + } + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java new file mode 100755 index 000000000..61db70583 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java @@ -0,0 +1,262 @@ +package org.broadinstitute.sting.playground.gatk.walkers.poolseq; + +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.ListUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import net.sf.samtools.SAMRecord; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Aug 18, 2009 + * Time: 11:57:07 AM + * To change this template use File | Settings | File Templates. + */ +public class CoverageAndPowerWalker extends LocusWalker> { + @Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false) + public boolean suppress_printing = false; + + @Argument(fullName="poolSize", shortName="ps", doc="Number of individuals in pool", required=true) + public int num_individuals = 0; + + @Argument(fullName="bootStrap", shortName="bs", doc="Use a bootstrap method", required=false) + public boolean use_bootstrap = false; + + @Argument(fullName="lodThreshold", shortName="lt", doc="Threshold for LOD score for calls") + public double threshold = 3.0; + + + + private static final int BOOTSTRAP_ITERATIONS = 300; + + + @Override + public void initialize() + { + + if(num_individuals <= 0) + throw new IllegalArgumentException("Positive nonzero parameter expected for poolSize"); + } + + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) + { + if ( !suppress_printing ) + { + Pair powpair = boostrapSamplingPowerCalc(context); + out.printf("%s: %d %d %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first); + } + + return context.getReads().size(); + } + + + public boolean isReduceByInterval() { + return true; + } + + + public Pair reduceInit() { + + return new Pair(0l,0l); } + + + public Pair reduce(Integer value, Pair sum) + { + long left = value.longValue() + sum.getFirst(); + long right = sum.getSecond() + 1l; + return new Pair(left, right); + } + + + public void onTraversalDone(Pair result) { + out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n", + ((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond()); + } + + /* + * + * Helper Methods Below + * + */ + + public Pair powerTheoretical(int depth, List reads, List offsets, double snp_prop) + { + // get the median Q score for these reads + + byte medQ = ListUtils.getQScoreMedian(reads,offsets); + // System.out.println("Median Q: " + medQ); //TODO: remove this line + double p_error = QualityUtils.qualToErrorProb(medQ); + + // variable names from here on out come from the mathematics of a log likelihood test + // with binomial probabilities, of observing k bases consistent with the same SNP + // given that SNP occurs in one of the alleles, versus it does not occur. + // Numerator and denominator will each have a term raised to the kth power + // and a term raised to the (depth - kth) (or d-kth) power. Thus the names. + // Taking a log then brings those powers out as coefficients, where they can be solved for. + + + double kterm_num = Math.log10(snp_prop * (1 - p_error) + (1 - snp_prop) * (p_error/3)); + double kterm_denom = Math.log10(p_error/3); + double dkterm_num = Math.log10(snp_prop*(p_error/3) + (1 - snp_prop) * (1 - p_error)); + double dkterm_denom = Math.log10(1 - p_error); + + int kaccept = (int) Math.ceil((threshold-((double)depth)*(dkterm_num-dkterm_denom))/(kterm_num-kterm_denom-dkterm_num+dkterm_denom)); + + // we will reject the null hypothesis if we see kaccept or more SNPs, the power is the probability that this occurs + // we can optimize this by checking to see which sum is smaller + + double pow = 0; + + if(depth - kaccept < kaccept) {// kaccept > depth/2 - calculate power as P[hits between k and depth] + + for(int k = kaccept; k < depth; k++) { + + pow += MathUtils.binomialProbabilityLog(k, depth, snp_prop); + } + + return new Pair(pow,medQ); + } else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and k] + for(int k = 0; k < kaccept; k++) { + pow += MathUtils.binomialProbabilityLog(k,depth,snp_prop); + } + + return new Pair(1.0-pow,medQ); + } + } + + + public Pair boostrapSamplingPowerCalc(AlignmentContext context) + { + /* it is assumed that there are 2*num_individuals chromosomes in the pool + * we use a bootstrap method to calculate our power to detect a SNP with this + * depth of coverage + * Assume that only one of the individual chromosomes has a variant at this locus + * + */ + + List reads = context.getReads(); + List offsets = context.getOffsets(); + + final int depth = reads.size(); + Pair result; + final double single_snip_proportion = 1/(2.0*num_individuals); + + if (depth <= 0) { + result = new Pair(-1,-1); + } else if (!use_bootstrap) { // object data from command line + + result = powerTheoretical(depth, reads, offsets, single_snip_proportion); + } else { + + // + // otherwise, bootstrapping occurs below + // + + int hypothesis_rejections=0; + + for(int boot = 0; boot < BOOTSTRAP_ITERATIONS; boot++) + { + List qscore_SNPs = new LinkedList(); + List qscore_refs = new LinkedList(); + + for (int strap = 0; strap < depth; strap++) { + int readOffset = randomlySelectRead(depth); + byte baseQuality = reads.get(readOffset).getBaseQualities()[offsets.get(readOffset)]; + + if (Math.random() < single_snip_proportion) {// occurs with probability "single_snip_proportion" + qscore_SNPs.add(baseQuality); + } + else { // simulating a reference read + qscore_refs.add(baseQuality); + } + } + + if (calculateLODByQLists(qscore_SNPs,qscore_refs) >= threshold) { + hypothesis_rejections++; + } + + } + + result = new Pair(((double)hypothesis_rejections)/BOOTSTRAP_ITERATIONS, ListUtils.getQScoreMedian(reads,offsets)); + } + + return result; + + } + + + public int randomlySelectRead(int depth) + { + double qscore_selector = Math.random(); + int readspositionrandom; + for(readspositionrandom = 1; readspositionrandom < ((double)depth * qscore_selector); readspositionrandom ++) { + if(readspositionrandom > depth + 1) { + throw new RuntimeException("qscore iterator exceeding possible thresholds"); + } + } + + return readspositionrandom - 1; + } + + + public double calculateLODByQLists(List q_snp, List q_ref) + { + /* LOD score calculation + * let Qs be the list q_snp and Qr q_ref and f be a function that + * converts Q-scores to probability of error, then the probability ratio + * is given by + * + * Product_{i=1}^{|Qs|}((1-f(Qs[i])/2n+(2n-1)f(Qs[i])/2n)*Product_{j=1}^{|Qr|}(((2n-1)(1-f(Qr[j]))+f(Qr[j]))/2n) + * ------------------------------------------------------------------------------------------------------------ + * Product_{i=1}^{|Qs|}f(Qs[i])*Product_{j=1}^{|Qr|}(1-f(Qr[j])) + * + * since depth = |Qr|+|Qs| the binomial coefficients cancel so are not present in this formula + * + * + * + * + * the "LOD" used here is really a log-likelihood, just the logarithm of the above + * + * Helper functions compute all but the final sum (log of mult/div becomes add/subtract) + * + */ + + Pair logsumSNP = qListToSumLogProbabilities(true,q_snp); + Pair logsumRef = qListToSumLogProbabilities(false,q_ref); + + return 0 - logsumSNP.first - logsumRef.first + logsumSNP.second + logsumRef.second; + } + + + public Pair qListToSumLogProbabilities(boolean listRepresentsSNPObservations, List qList) + { + double logProbObserveXAndSNPTrue = 0; // note "error" for SNP is observing a ref + double logProbObserveXAndRefTrue = 0;// and "error" for ref is observing a SNP + final double denom = 2*((double)num_individuals); + + for (byte qual : qList) { + double p_err = QualityUtils.qualToErrorProb(qual); + if (listRepresentsSNPObservations) { + logProbObserveXAndSNPTrue += Math.log10((1 - p_err) / denom +((denom - 1)*p_err) / denom); + logProbObserveXAndRefTrue += Math.log10(p_err); + } else { + logProbObserveXAndSNPTrue += Math.log10((denom - 1) * (1 - p_err)/denom + p_err/denom); + logProbObserveXAndRefTrue+= Math.log10(1 -p_err); + } + } + + return new Pair(logProbObserveXAndSNPTrue,logProbObserveXAndRefTrue); + } + + + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/ListUtils.java b/java/src/org/broadinstitute/sting/utils/ListUtils.java index b79b2ab0a..7ad03a03a 100644 --- a/java/src/org/broadinstitute/sting/utils/ListUtils.java +++ b/java/src/org/broadinstitute/sting/utils/ListUtils.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils; import java.util.*; +import net.sf.samtools.SAMRecord; + /** * Created by IntelliJ IDEA. * User: andrew @@ -66,9 +68,85 @@ public class ListUtils { for (int i : indices) { subset.add(list.get(i)); } - + return subset; } + public static Comparable orderStatisticSearch(int orderStat, List list) { + // this finds the order statistic of the list (kth largest element) + // the list is assumed *not* to be sorted + + final Comparable x = list.get(orderStat); + ListIterator iterator = list.listIterator(); + ArrayList lessThanX = new ArrayList(); + ArrayList equalToX = new ArrayList(); + ArrayList greaterThanX = new ArrayList(); + + for(Comparable y : list) { + if(x.compareTo(y) > 0) { + lessThanX.add(y); + } else if(x.compareTo(y) < 0) { + greaterThanX.add(y); + } else + equalToX.add(y); + } + + if(lessThanX.size() > orderStat) + return orderStatisticSearch(orderStat, lessThanX); + else if(lessThanX.size() + equalToX.size() >= orderStat) + return orderStat; + else + return orderStatisticSearch(orderStat - lessThanX.size() - equalToX.size(), greaterThanX); + + } + + + public static Object getMedian(List list) { + return orderStatisticSearch((int) Math.ceil(list.size()/2), list); + } + + public static byte getQScoreOrderStatistic(List reads, List offsets, int k) { + // version of the order statistic calculator for SAMRecord/Integer lists, where the + // list index maps to a q-score only through the offset index + // returns the kth-largest q-score. + + + ArrayList lessThanQReads = new ArrayList(); + ArrayList equalToQReads = new ArrayList(); + ArrayList greaterThanQReads = new ArrayList(); + ArrayList lessThanQOffsets = new ArrayList(); + ArrayList greaterThanQOffsets = new ArrayList(); + + final byte qk = reads.get(k).getBaseQualities()[offsets.get(k)]; + + for(int iter = 0; iter < reads.size(); iter ++) { + SAMRecord read = reads.get(iter); + int offset = offsets.get(iter); + byte quality = read.getBaseQualities()[offset]; + + if(quality < qk) { + lessThanQReads.add(read); + lessThanQOffsets.add(offset); + } else if(quality > qk) { + greaterThanQReads.add(read); + greaterThanQOffsets.add(offset); + } else { + equalToQReads.add(reads.get(iter)); + } + } + + if(lessThanQReads.size() > k) + return getQScoreOrderStatistic(lessThanQReads, lessThanQOffsets, k); + else if(equalToQReads.size() + lessThanQReads.size() >= k) + return qk; + else + return getQScoreOrderStatistic(greaterThanQReads, greaterThanQOffsets, k - lessThanQReads.size() - equalToQReads.size()); + + } + + + public static byte getQScoreMedian(List reads, List offsets) { + return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); + } } diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 0edd6427c..4f48b1e61 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -8,6 +8,17 @@ import cern.jet.math.Arithmetic; * @author Kiran Garimella */ public class MathUtils { + + /** Public constants - used for the Lanczos approximation to the factorial function + * (for the calculation of the binomial/multinomial probability in logspace) + * @param LANC_SEQ[] - an array holding the constants which correspond to the product + * of Chebyshev Polynomial coefficients, and points on the Gamma function (for interpolation) + * [see A Precision Approximation of the Gamma Function J. SIAM Numer. Anal. Ser. B, Vol. 1 1964. pp. 86-96] + * @param LANC_G - a value for the Lanczos approximation to the gamma function that works to + * high precision + */ + + /** Private constructor. No instantiating this class! */ private MathUtils() {} @@ -120,6 +131,45 @@ public class MathUtils { //return (new cern.jet.random.Binomial(n, p, cern.jet.random.engine.RandomEngine.makeDefault())).pdf(k); } + /** + * Performs the calculation for a binomial probability in logspace, preventing the blowups of the binomial coefficient + * consistent with moderately large n and k. + * + * @param k - number of successes/observations + * @param n - number of Bernoulli trials + * @param p - probability of success/observations + * + * @return the probability mass associated with a binomial mass function for the above configuration + */ + public static double binomialProbabilityLog(int k, int n, double p) { + + double log_coef = 0.0; + int min; + int max; + + if(k < n - k) { + min = k; + max = n-k; + } else { + min = n - k; + max = k; + } + + for(int i=2; i <= min; i++) { + log_coef -= Math.log((double)i); + } + + for(int i = max+1; i <= n; i++) { + log_coef += Math.log((double)i); + } + + return Math.exp(log_coef + ((double)k)*Math.log(p) + ((double)(n-k))*Math.log(1-p)); + + // in the future, we may want to precompile a table of exact values, where the entry (a,b) indicates + // the sum of log(i) from i = (1+(50*a)) to i = 50+(50*b) to increase performance on binomial coefficients + // which may require many sums. + } + /** * Computes a multinomial. This is computed using the formula * @@ -194,4 +244,6 @@ public class MathUtils { rms /= x.length; return Math.sqrt(rms); } + + }