diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatistics.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatistics.java index 792459578..7285c2bb9 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatistics.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatistics.java @@ -8,6 +8,8 @@ import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -15,7 +17,6 @@ import org.broadinstitute.sting.utils.pileup.PileupElement; import java.io.File; import java.io.IOException; import java.io.PrintStream; -import java.io.PrintWriter; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -27,13 +28,21 @@ import java.util.Set; * distribution of % bases and % targets covered for certain depths. The granularity of DOC can be set by command * line arguments. * - * // todo -- alter logarithmic scaling to spread out bins more - * // todo -- allow for user to set linear binning (default is logarithmic) - * // todo -- add per target (e.g. regional) aggregation * * @Author chartl * @Date Feb 22, 2010 */ +// todo [DONE] -- add per target (e.g. regional) aggregation +// todo [DONE] -- add ability to print out the calculated bins and quit (for pre-analysis bin size selection) +// todo -- refactor the location of the ALL_SAMPLE metrics [keep out of the per-sample HashMaps] +// todo [DONE] -- per locus output through -o +// todo -- support for using read groups instead of samples +// todo -- coverage without deletions +// todo -- base counts +// todo -- support for aggregate (ignoring sample IDs) granular histograms; maybe n*[start,stop], bins*sqrt(n) +// todo -- alter logarithmic scaling to spread out bins more +// todo -- allow for user to set linear binning (default is logarithmic) +// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now @By(DataSource.REFERENCE) public class CoverageStatistics extends LocusWalker, DepthOfCoverageStats> implements TreeReducible { @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false) @@ -46,17 +55,42 @@ public class CoverageStatistics extends LocusWalker, DepthOf byte minMappingQuality = 50; @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false) byte minBaseQuality = 20; - @Argument(fullName = "perLocusStatisticsFile", shortName = "locusFile", doc = "File to output per-locus statistics to; if unprovided these will not be calculated", required = false) - File perLocusStatisticsFile = null; - @Argument(fullName = "perSampleStatisticsFile", shortName = "sampleFile", doc = "File to output per-sample statistics to; if unprovided will go to standard (-o) output", required = false) - File perSampleStatisticsFile = null; - @Argument(fullName = "summaryStatisticsFile", shortName = "summaryFile", doc = "File to output summary (mean, median) statistics to; if unprovided will go to standard (-o) output", required = false) - File summaryStatisticsFile = null; + @Argument(fullName = "omitLocusTable", shortName = "omitLocus", doc = "Will not calculate the per-sample per-depth counts of loci, which should result in speedup", required = false) + boolean omitLocusTable = false; + @Argument(fullName = "omitIntervalStatistics", shortName = "omitIntervals", doc = "Will omit the per-interval statistics section, which should result in speedup", required = false) + boolean omitIntervals = false; + @Argument(fullName = "omitDepthOutputAtEachBase", shortName = "omitBaseOutput", doc = "Will omit the output of the depth of coverage at each base, which should result in speedup", required = false) + boolean omitDepthOutput = false; + @Argument(fullName = "printBinEndpointsAndExit", doc = "Prints the bin values and exits immediately. Use to calibrate what bins you want before running on data.", required = false) + boolean printBinEndpointsAndExit = false; + @Argument(fullName = "omitPerSampleStats", shortName = "omitSampleSummary", doc = "Omits the summary files per-sample. These statistics are still calculated, so this argument will not improve runtime.", required = false) + boolean omitSampleSummary = false; //////////////////////////////////////////////////////////////////////////////////// // STANDARD WALKER METHODS //////////////////////////////////////////////////////////////////////////////////// + public void initialize() { + + if ( printBinEndpointsAndExit ) { + int[] endpoints = DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins); + System.out.print("[ "); + for ( int e : endpoints ) { + System.out.print(e+" "); + } + System.out.println("]"); + System.exit(0); + } + + if ( getToolkit().getArguments().outFileName == null ) { + throw new StingException("This walker requires that you specify an output file (-o)"); + } + } + + public boolean isReduceByInterval() { + return ( ! omitIntervals ); + } + public DepthOfCoverageStats reduceInit() { List> samplesByReaders = getToolkit().getSamplesByReaders(); DepthOfCoverageStats stats = new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins)); @@ -67,9 +101,19 @@ public class CoverageStatistics extends LocusWalker, DepthOf } } - if ( perLocusStatisticsFile != null ) { + if ( ! omitLocusTable ) { stats.initializeLocusCounts(); } + + if ( ! omitDepthOutput ) { // print header + out.printf("%s\t%s\t%s","Locus","Total_Depth","Average_Depth"); + for ( String s : stats.getAllSamples()) { + out.printf("\t%s_%s","Depth_for",s); + } + out.printf("%n"); + } else { + out.printf("Per-Locus Depth of Coverage output was omitted"); + } return stats; } @@ -90,12 +134,21 @@ public class CoverageStatistics extends LocusWalker, DepthOf depthBySample.put(sample,properDepth); } + + if ( ! omitDepthOutput ) { + out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) + } return depthBySample; } public DepthOfCoverageStats reduce(Map thisMap, DepthOfCoverageStats prevReduce) { prevReduce.update(thisMap); + if ( ! omitDepthOutput ) { + printDepths(out,thisMap, prevReduce.getAllSamples()); + // this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without + // turning on omit + } return prevReduce; } @@ -104,10 +157,185 @@ public class CoverageStatistics extends LocusWalker, DepthOf return left; } + //////////////////////////////////////////////////////////////////////////////////// + // INTERVAL ON TRAVERSAL DONE + //////////////////////////////////////////////////////////////////////////////////// + + public void onTraversalDone( List> statsByInterval ) { + File intervalStatisticsFile = deriveFromStream("interval_statistics"); + File intervalSummaryFile = deriveFromStream("interval_summary"); + DepthOfCoverageStats mergedStats = printIntervalStatsAndMerge(statsByInterval,intervalSummaryFile, intervalStatisticsFile); + this.onTraversalDone(mergedStats); + } + + private DepthOfCoverageStats printIntervalStatsAndMerge(List> statsByInterval, File summaryFile, File statsFile) { + PrintStream summaryOut; + PrintStream statsOut; + + try { + summaryOut = summaryFile == null ? out : new PrintStream(summaryFile); + statsOut = statsFile == null ? out : new PrintStream(statsFile); + } catch ( IOException e ) { + throw new StingException("Unable to open interval file on reduce", e); + } + + + Pair firstPair = statsByInterval.remove(0); + DepthOfCoverageStats firstStats = firstPair.second; + + StringBuilder summaryHeader = new StringBuilder(); + summaryHeader.append("Target"); + summaryHeader.append("\ttotal_coverage"); + summaryHeader.append("\taverage_coverage"); + + for ( String s : firstStats.getAllSamples() ) { + summaryHeader.append("\t"); + summaryHeader.append(s); + summaryHeader.append("_mean_cvg"); + summaryHeader.append("\t"); + summaryHeader.append(s); + summaryHeader.append("_granular_Q1"); + summaryHeader.append("\t"); + summaryHeader.append(s); + summaryHeader.append("_granular_median"); + summaryHeader.append("\t"); + summaryHeader.append(s); + summaryHeader.append("_granular_Q3"); + } + + summaryOut.printf("%s%n",summaryHeader); + + int[][] nTargetsByAvgCvgBySample = new int[firstStats.getHistograms().size()][firstStats.getEndpoints().length+1]; + for ( int i = 0; i < nTargetsByAvgCvgBySample.length; i ++ ) { + for ( int b = 0; b < nTargetsByAvgCvgBySample[0].length; b++) { + nTargetsByAvgCvgBySample[i][b] = 0; + } + } + + printTargetSummary(summaryOut,firstPair); + updateTargetTable(nTargetsByAvgCvgBySample,firstStats); + + for ( Pair targetStats : statsByInterval ) { + printTargetSummary(summaryOut,targetStats); + updateTargetTable(nTargetsByAvgCvgBySample,targetStats.second); + firstStats = this.treeReduce(firstStats,targetStats.second); + } + + printIntervalTable(statsOut,nTargetsByAvgCvgBySample,firstStats.getEndpoints()); + + summaryOut.close(); + statsOut.close(); + + return firstStats; + } + + private void printTargetSummary(PrintStream output, Pair intervalStats) { + DepthOfCoverageStats stats = intervalStats.second; + int[] bins = stats.getEndpoints(); + StringBuilder targetSummary = new StringBuilder(); + targetSummary.append(intervalStats.first.toString()); + targetSummary.append("\t"); + targetSummary.append(stats.getTotalLoci()*stats.getMeans().get(DepthOfCoverageStats.ALL_SAMPLES)); + // TODO: change this to use the raw counts directly rather than re-estimating from mean*nloci + targetSummary.append("\t"); + targetSummary.append(stats.getMeans().get(DepthOfCoverageStats.ALL_SAMPLES)); + + for ( String s : stats.getAllSamples() ) { + targetSummary.append("\t"); + targetSummary.append(String.format("%.2f", stats.getMeans().get(s))); + targetSummary.append("\t"); + int median = getQuantile(stats.getHistograms().get(s),0.5); + int q1 = getQuantile(stats.getHistograms().get(s),0.25); + int q3 = getQuantile(stats.getHistograms().get(s),0.75); + targetSummary.append(bins[q1]); + targetSummary.append("\t"); + targetSummary.append(bins[median]); + targetSummary.append("\t"); + targetSummary.append(bins[q3]); + + } + + output.printf("%s%n", targetSummary); + } + + private void printIntervalTable(PrintStream output, int[][] intervalTable, int[] cutoffs) { + output.printf("\tdepth>=%d",0); + for ( int col = 0; col < intervalTable[0].length-1; col ++ ) { + output.printf("\tdepth>=%d",cutoffs[col]); + } + + output.printf(String.format("%n")); + for ( int row = 0; row < intervalTable.length; row ++ ) { + output.printf("At_least_%d_samples",row+1); + for ( int col = 0; col < intervalTable[0].length; col++ ) { + output.printf("\t%d",intervalTable[row][col]); + } + output.printf(String.format("%n")); + } + } + + /* + * @updateTargetTable + * The idea is to have counts for how many *targets* have at least K samples with + * median coverage of at least X. + * To that end: + * Iterate over the samples the DOCS object, determine how many there are with + * median coverage > leftEnds[0]; how many with median coverage > leftEnds[1] + * and so on. Then this target has at least N, N-1, N-2, ... 1, 0 samples covered + * to leftEnds[0] and at least M,M-1,M-2,...1,0 samples covered to leftEnds[1] + * and so on. + */ + private void updateTargetTable(int[][] table, DepthOfCoverageStats stats) { + int[] cutoffs = stats.getEndpoints(); + int[] countsOfMediansAboveCutoffs = new int[cutoffs.length+1]; // 0 bin to catch everything + for ( int i = 0; i < countsOfMediansAboveCutoffs.length; i ++) { + countsOfMediansAboveCutoffs[i]=0; + } + + for ( String s : stats.getAllSamples() ) { + int medianBin = getQuantile(stats.getHistograms().get(s),0.5); + for ( int i = 0; i <= medianBin; i ++) { + countsOfMediansAboveCutoffs[i]++; + } + } + + for ( int medianBin = 0; medianBin < countsOfMediansAboveCutoffs.length; medianBin++) { + for ( ; countsOfMediansAboveCutoffs[medianBin] > 0; countsOfMediansAboveCutoffs[medianBin]-- ) { + table[countsOfMediansAboveCutoffs[medianBin]-1][medianBin]++; + // the -1 is due to counts being 1-based and offsets being 0-based + } + } + } + + //////////////////////////////////////////////////////////////////////////////////// + // FINAL ON TRAVERSAL DONE + //////////////////////////////////////////////////////////////////////////////////// + public void onTraversalDone(DepthOfCoverageStats coverageProfiles) { - printSummary(out,summaryStatisticsFile,coverageProfiles); - printPerSample(out,perSampleStatisticsFile,coverageProfiles); - printPerLocus(perLocusStatisticsFile,coverageProfiles); + /////////////////// + // OPTIONAL OUTPUTS + ////////////////// + + if ( ! omitSampleSummary ) { + File summaryStatisticsFile = deriveFromStream("summary_statistics"); + File perSampleStatisticsFile = deriveFromStream("sample_statistics"); + printSummary(out,summaryStatisticsFile,coverageProfiles); + printPerSample(out,perSampleStatisticsFile,coverageProfiles); + } + + if ( ! omitLocusTable ) { + File perLocusStatisticsFile = deriveFromStream("locus_statistics"); + printPerLocus(perLocusStatisticsFile,coverageProfiles); + } + } + + public File deriveFromStream(String append) { + String name = getToolkit().getArguments().outFileName; + if ( name.contains("stdout") || name.contains("Stdout") || name.contains("STDOUT")) { + return null; + } else { + return new File(name+"."+append); + } } //////////////////////////////////////////////////////////////////////////////////// @@ -194,9 +422,14 @@ public class CoverageStatistics extends LocusWalker, DepthOf int[] leftEnds = stats.getEndpoints(); for ( String s : histograms.keySet() ) { - int median = getQuantile(histograms.get(s),0.5); - int q1 = getQuantile(histograms.get(s),0.25); - int q3 = getQuantile(histograms.get(s),0.75); + int[] histogram = histograms.get(s); + int median = getQuantile(histogram,0.5); + int q1 = getQuantile(histogram,0.25); + int q3 = getQuantile(histogram,0.75); + // if any of these are larger than the higest bin, put the median as in the largest bin + median = median == histogram.length-1 ? histogram.length-2 : median; + q1 = q1 == histogram.length-1 ? histogram.length-2 : q1; + q3 = q3 == histogram.length-1 ? histogram.length-2 : q3; output.printf("%s\t%.2f\t%d\t%d\t%d%n",s,means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); } @@ -217,22 +450,47 @@ public class CoverageStatistics extends LocusWalker, DepthOf bin++; } - bin = bin == histogram.length-1 ? histogram.length-2 : bin; - return bin; } + + private void printDepths(PrintStream stream, Map depthBySample, Set allSamples) { + // get the depths per sample and build up the output string while tabulating total and average coverage + StringBuilder perSampleOutput = new StringBuilder(); + int tDepth = 0; + for ( String s : allSamples ) { + perSampleOutput.append("\t"); + int dp = depthBySample.keySet().contains(s) ? depthBySample.get(s) : 0; + perSampleOutput.append(dp); + tDepth += dp; + } + // remember -- genome locus was printed in map() + stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput); + + } } class DepthOfCoverageStats { + //////////////////////////////////////////////////////////////////////////////////// + // STATIC DATA + //////////////////////////////////////////////////////////////////////////////////// + public static String ALL_SAMPLES = "ALL_COMBINED_SAMPLES"; - // STANDARD (constantly updated) DATA + + //////////////////////////////////////////////////////////////////////////////////// + // STANDARD DATA + //////////////////////////////////////////////////////////////////////////////////// + private Map granularHistogramBySample; // holds the counts per each bin private Map meanCoverages; // holds mean coverage per sample private int[] binLeftEndpoints; // describes the left endpoint for each bin private int[][] locusCoverageCounts; // holds counts of number of bases with >=X samples at >=Y coverage private boolean tabulateLocusCounts = false; private int nLoci; // number of loci seen - // TEMPORARY DATA (not worth re-instantiating every time) + + //////////////////////////////////////////////////////////////////////////////////// + // TEMPORARY DATA ( not worth re-instantiating ) + //////////////////////////////////////////////////////////////////////////////////// + private int[] locusHistogram; // holds a histogram for each locus; reset after each update() call private int totalDepth; // holds the total depth of coverage for each locus; reset after each update() call @@ -441,5 +699,9 @@ class DepthOfCoverageStats { public int getTotalLoci() { return nLoci; } + + public Set getAllSamples() { + return granularHistogramBySample.keySet(); + } } diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatisticsIntegrationTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatisticsIntegrationTest.java new file mode 100644 index 000000000..28b73ced1 --- /dev/null +++ b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatisticsIntegrationTest.java @@ -0,0 +1,39 @@ +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broadinstitute.sting.WalkerTest; +import org.junit.Test; + +import java.util.Arrays; + +/** + * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * + * @Author chartl + * @Date Feb 25, 2010 + */ +public class CoverageStatisticsIntegrationTest extends WalkerTest { + + private boolean RUN_TESTS = false; + private String root = "-T CoverageStatistics "; + + private String buildRootCmd(String ref, String bam, String interval) { + return root + "-R "+ref+" -I "+bam+" -L "+interval+" -o %s"; + } + + private void execute(String name, WalkerTestSpec spec) { + if ( RUN_TESTS ) { + executeTest(name,spec); + } + } + + @Test + public void testBaseOutputNoFiltering() { + String bam_file = "/humgen/gsa-hphome1/chartl/projects/depthOfCoverage/testFiles/bams/Ciliopathies_1_88534_3_samples.bam"; + String interval_list = "chr1:855534"; + String reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; + String cmd = buildRootCmd(reference,bam_file,interval_list) + " -mmq 0 -mbq 0 -omitSampleSummary -omitIntervals -omitLocus"; + String expected = "2aee1dbcb69bf1e874d56cd23336afa8"; + WalkerTestSpec spec = new WalkerTestSpec(cmd,1, Arrays.asList(expected)); + execute("testBaseOutputNoFiltering",spec); + } +}