From 87f8fb7282dd6e43fac0ed2f797a841493f851f5 Mon Sep 17 00:00:00 2001 From: chartl Date: Fri, 26 Feb 2010 16:39:47 +0000 Subject: [PATCH] Quick commit in advance of Aaron's. Just a bunch of refactoring (private classes separated out, put in proper package). Also support added for coverage by read group rather than sample. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2897 348d0f76-0448-11de-a6fe-93d51630548a --- .../{ => coverage}/CoverageStatistics.java | 283 ++---------------- .../coverage/DepthOfCoverageStats.java | 248 +++++++++++++++ .../CoverageStatisticsIntegrationTest.java | 2 +- 3 files changed, 279 insertions(+), 254 deletions(-) rename java/src/org/broadinstitute/sting/oneoffprojects/walkers/{ => coverage}/CoverageStatistics.java (68%) create mode 100644 java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java rename java/test/org/broadinstitute/sting/oneoffprojects/walkers/{ => coverage}/CoverageStatisticsIntegrationTest.java (95%) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatistics.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java similarity index 68% rename from java/src/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatistics.java rename to java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java index d2b832ad3..9a4a96d2e 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatistics.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; +package org.broadinstitute.sting.oneoffprojects.walkers.coverage; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -31,7 +31,7 @@ import java.util.*; */ // todo [DONE] -- add per target (e.g. regional) aggregation // todo [DONE] -- add ability to print out the calculated bins and quit (for pre-analysis bin size selection) -// todo -- refactor the location of the ALL_SAMPLE metrics [keep out of the per-sample HashMaps] +// todo [DONE] -- refactor the location of the ALL_SAMPLE metrics [keep out of the per-sample HashMaps] // todo [DONE] -- per locus output through -o // todo -- support for using read groups instead of samples // todo -- coverage without deletions @@ -62,6 +62,8 @@ public class CoverageStatistics extends LocusWalker, DepthOf boolean printBinEndpointsAndExit = false; @Argument(fullName = "omitPerSampleStats", shortName = "omitSampleSummary", doc = "Omits the summary files per-sample. These statistics are still calculated, so this argument will not improve runtime.", required = false) boolean omitSampleSummary = false; + @Argument(fullName = "useReadGroups", shortName = "rg", doc = "Split depth of coverage output by read group rather than by sample", required = false) + boolean useReadGroup = false; //////////////////////////////////////////////////////////////////////////////////// // STANDARD WALKER METHODS @@ -86,13 +88,7 @@ public class CoverageStatistics extends LocusWalker, DepthOf if ( ! omitDepthOutput ) { // print header out.printf("%s\t%s\t%s","Locus","Total_Depth","Average_Depth"); // get all the samples - HashSet allSamples = new HashSet(); // since the DOCS object uses a HashMap, this will be in the same order - - for ( Set sampleSet : getToolkit().getSamplesByReaders()) { - for (String s : sampleSet) { - allSamples.add(s); - } - } + HashSet allSamples = getSamplesFromToolKit(useReadGroup); for ( String s : allSamples) { out.printf("\t%s_%s","Depth_for",s); @@ -104,21 +100,39 @@ public class CoverageStatistics extends LocusWalker, DepthOf out.printf("Per-Locus Depth of Coverage output was omitted"); } } + + private HashSet getSamplesFromToolKit( boolean getReadGroupsInstead ) { + HashSet partitions = new HashSet(); // since the DOCS object uses a HashMap, this will be in the same order + + if ( getReadGroupsInstead ) { + for ( Set rgSet : getToolkit().getMergedReadGroupsByReaders() ) { + for ( String rg : rgSet ) { + partitions.add(rg); + } + } + } else { + for ( Set sampleSet : getToolkit().getSamplesByReaders()) { + for (String s : sampleSet) { + partitions.add(s); + } + } + } + + return partitions; + } public boolean isReduceByInterval() { return ( ! omitIntervals ); } public DepthOfCoverageStats reduceInit() { - List> samplesByReaders = getToolkit().getSamplesByReaders(); DepthOfCoverageStats stats = new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins)); - for ( Set sampleSet : samplesByReaders ) { - for ( String sample : sampleSet ) { - stats.addSample(sample); - } + for ( String sample : getSamplesFromToolKit(useReadGroup) ) { + stats.addSample(sample); } + if ( ! omitLocusTable ) { stats.initializeLocusCounts(); } @@ -127,7 +141,7 @@ public class CoverageStatistics extends LocusWalker, DepthOf } public Map map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - Map contextsBySample = + Map contextsBySample = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup()); HashMap depthBySample = new HashMap(); @@ -479,241 +493,4 @@ public class CoverageStatistics extends LocusWalker, DepthOf stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput); } -} - -class DepthOfCoverageStats { - //////////////////////////////////////////////////////////////////////////////////// - // STATIC DATA - //////////////////////////////////////////////////////////////////////////////////// - - /* none so far */ - - //////////////////////////////////////////////////////////////////////////////////// - // STANDARD DATA - //////////////////////////////////////////////////////////////////////////////////// - - private Map granularHistogramBySample; // holds the counts per each bin - private Map totalCoverages; // holds total coverage per sample - private int[] binLeftEndpoints; // describes the left endpoint for each bin - private int[][] locusCoverageCounts; // holds counts of number of bases with >=X samples at >=Y coverage - private boolean tabulateLocusCounts = false; - private long nLoci; // number of loci seen - private long totalDepthOfCoverage; - - //////////////////////////////////////////////////////////////////////////////////// - // TEMPORARY DATA ( not worth re-instantiating ) - //////////////////////////////////////////////////////////////////////////////////// - - private int[] locusHistogram; // holds a histogram for each locus; reset after each update() call - private int totalLocusDepth; // holds the total depth of coverage for each locus; reset after each update() call - - //////////////////////////////////////////////////////////////////////////////////// - // STATIC METHODS - //////////////////////////////////////////////////////////////////////////////////// - - public static int[] calculateBinEndpoints(int lower, int upper, int bins) { - if ( bins > upper - lower || lower < 1 ) { - throw new IllegalArgumentException("Illegal argument to calculateBinEndpoints; "+ - "lower bound must be at least 1, and number of bins may not exceed stop - start"); - } - - int[] binLeftEndpoints = new int[bins+1]; - binLeftEndpoints[0] = lower; - - int length = upper - lower; - double scale = Math.log10((double) length)/bins; - - for ( int b = 1; b < bins ; b++ ) { - int leftEnd = lower + (int) Math.floor(Math.pow(10.0,(b-1.0)*scale)); - // todo -- simplify to length^(scale/bins); make non-constant to put bin ends in more "useful" - // todo -- positions on the number line - while ( leftEnd <= binLeftEndpoints[b-1] ) { - leftEnd++; - } - - binLeftEndpoints[b] = leftEnd; - } - - binLeftEndpoints[binLeftEndpoints.length-1] = upper; - - return binLeftEndpoints; - } - - //////////////////////////////////////////////////////////////////////////////////// - // INITIALIZATION METHODS - //////////////////////////////////////////////////////////////////////////////////// - - public DepthOfCoverageStats(int[] leftEndpoints) { - this.binLeftEndpoints = leftEndpoints; - granularHistogramBySample = new HashMap(); - totalCoverages = new HashMap(); - nLoci = 0; - totalLocusDepth = 0; - totalDepthOfCoverage = 0; - } - - public void addSample(String sample) { - if ( granularHistogramBySample.containsKey(sample) ) { - return; - } - - int[] binCounts = new int[this.binLeftEndpoints.length+1]; - for ( int b = 0; b < binCounts.length; b ++ ) { - binCounts[b] = 0; - } - - granularHistogramBySample.put(sample,binCounts); - totalCoverages.put(sample,0l); - } - - public void initializeLocusCounts() { - locusCoverageCounts = new int[granularHistogramBySample.size()][binLeftEndpoints.length+1]; - locusHistogram = new int[binLeftEndpoints.length+1]; - for ( int b = 0; b < binLeftEndpoints.length+1; b ++ ) { - for ( int a = 0; a < granularHistogramBySample.size(); a ++ ) { - locusCoverageCounts[a][b] = 0; - } - locusHistogram[b] = 0; - } - - tabulateLocusCounts = true; - } - - //////////////////////////////////////////////////////////////////////////////////// - // UPDATE METHODS - //////////////////////////////////////////////////////////////////////////////////// - - public void update(Map depthBySample) { - int b; - for ( String sample : granularHistogramBySample.keySet() ) { - if ( depthBySample.containsKey(sample) ) { - b = updateSample(sample,depthBySample.get(sample)); - totalLocusDepth += depthBySample.get(sample); - } else { - b = updateSample(sample,0); - } - - if ( tabulateLocusCounts ) { - for ( int i = 0; i <= b; i ++ ) { - locusHistogram[i]++; - } - } - } - updateLocusCounts(locusHistogram); - - nLoci++; - totalDepthOfCoverage += totalLocusDepth; - totalLocusDepth = 0; - } - - private int updateSample(String sample, int depth) { - totalCoverages.put(sample,totalCoverages.get(sample)+depth); - - int[] granularBins = granularHistogramBySample.get(sample); - for ( int b = 0; b < binLeftEndpoints.length; b ++ ) { - if ( depth < binLeftEndpoints[b] ) { - granularBins[b]++; - return b; - } - } - - granularBins[binLeftEndpoints.length]++; // greater than all left-endpoints - return binLeftEndpoints.length; - } - - public void merge(DepthOfCoverageStats newStats) { - this.mergeSamples(newStats); - if ( this.tabulateLocusCounts && newStats.tabulateLocusCounts ) { - this.mergeLocusCounts(newStats.getLocusCounts()); - } - nLoci += newStats.getTotalLoci(); - totalDepthOfCoverage += newStats.getTotalCoverage(); - } - - private void mergeSamples(DepthOfCoverageStats otherStats) { - Map otherHistogram = otherStats.getHistograms(); - Map otherMeans = otherStats.getMeans(); - for ( String s : this.getAllSamples() ) { - int[] internalCounts = granularHistogramBySample.get(s); - int[] externalCounts = otherHistogram.get(s); - for ( int b = 0; b < internalCounts.length; b++ ) { - internalCounts[b] += externalCounts[b]; - } - - this.totalCoverages.put(s, this.totalCoverages.get(s) + otherStats.totalCoverages.get(s)); - } - } - - private void mergeLocusCounts( int[][] otherCounts ) { - for ( int a = 0; a < locusCoverageCounts.length; a ++ ) { - for ( int b = 0; b < locusCoverageCounts[0].length; b ++ ) { - locusCoverageCounts[a][b] += otherCounts[a][b]; - } - } - } - - /* - * Update locus counts -- takes an array in which the number of samples - * with depth ABOVE [i] is held. So if the bin left endpoints were 2, 5, 10 - * then we'd have an array that represented: - * [# samples with depth 0 - inf], [# samples with depth 2 - inf], - * [# samples with depth 5 - inf], [# samples with depth 10-inf]; - * - * this is - * @argument cumulativeSamplesByDepthBin - see above - */ - private void updateLocusCounts(int[] cumulativeSamplesByDepthBin) { - if ( tabulateLocusCounts ) { - for ( int bin = 0; bin < cumulativeSamplesByDepthBin.length; bin ++ ) { - int numSamples = cumulativeSamplesByDepthBin[bin]; - for ( int i = 0; i < numSamples; i ++ ) { - locusCoverageCounts[i][bin]++; - } - - cumulativeSamplesByDepthBin[bin] = 0; // reset counts in advance of next update() - } - } - } - - //////////////////////////////////////////////////////////////////////////////////// - // ACCESSOR METHODS - //////////////////////////////////////////////////////////////////////////////////// - - public Map getHistograms() { - return granularHistogramBySample; - } - - public int[][] getLocusCounts() { - return locusCoverageCounts; - } - - public int[] getEndpoints() { - return binLeftEndpoints; - } - - public Map getMeans() { - HashMap means = new HashMap(); - for ( String s : getAllSamples() ) { - means.put(s,( (double)totalCoverages.get(s))/( (double) nLoci )); - } - - return means; - } - - public long getTotalLoci() { - return nLoci; - } - - public Set getAllSamples() { - return granularHistogramBySample.keySet(); - } - - public double getTotalMeanCoverage() { - return ( (double) totalDepthOfCoverage )/ ( (double) nLoci ); - } - - public long getTotalCoverage() { - return totalDepthOfCoverage; - } - -} +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java new file mode 100644 index 000000000..69e692d58 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/DepthOfCoverageStats.java @@ -0,0 +1,248 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.coverage; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** + * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * + * @Author chartl + * @Date Feb 26, 2010 + */ +public class DepthOfCoverageStats { + //////////////////////////////////////////////////////////////////////////////////// + // STATIC DATA + //////////////////////////////////////////////////////////////////////////////////// + + /* none so far */ + + //////////////////////////////////////////////////////////////////////////////////// + // STANDARD DATA + //////////////////////////////////////////////////////////////////////////////////// + + private Map granularHistogramBySample; // holds the counts per each bin + private Map totalCoverages; // holds total coverage per sample + private int[] binLeftEndpoints; // describes the left endpoint for each bin + private int[][] locusCoverageCounts; // holds counts of number of bases with >=X samples at >=Y coverage + private boolean tabulateLocusCounts = false; + private long nLoci; // number of loci seen + private long totalDepthOfCoverage; + + //////////////////////////////////////////////////////////////////////////////////// + // TEMPORARY DATA ( not worth re-instantiating ) + //////////////////////////////////////////////////////////////////////////////////// + + private int[] locusHistogram; // holds a histogram for each locus; reset after each update() call + private int totalLocusDepth; // holds the total depth of coverage for each locus; reset after each update() call + + //////////////////////////////////////////////////////////////////////////////////// + // STATIC METHODS + //////////////////////////////////////////////////////////////////////////////////// + + public static int[] calculateBinEndpoints(int lower, int upper, int bins) { + if ( bins > upper - lower || lower < 1 ) { + throw new IllegalArgumentException("Illegal argument to calculateBinEndpoints; "+ + "lower bound must be at least 1, and number of bins may not exceed stop - start"); + } + + int[] binLeftEndpoints = new int[bins+1]; + binLeftEndpoints[0] = lower; + + int length = upper - lower; + double scale = Math.log10((double) length)/bins; + + for ( int b = 1; b < bins ; b++ ) { + int leftEnd = lower + (int) Math.floor(Math.pow(10.0,(b-1.0)*scale)); + // todo -- simplify to length^(scale/bins); make non-constant to put bin ends in more "useful" + // todo -- positions on the number line + while ( leftEnd <= binLeftEndpoints[b-1] ) { + leftEnd++; + } + + binLeftEndpoints[b] = leftEnd; + } + + binLeftEndpoints[binLeftEndpoints.length-1] = upper; + + return binLeftEndpoints; + } + + //////////////////////////////////////////////////////////////////////////////////// + // INITIALIZATION METHODS + //////////////////////////////////////////////////////////////////////////////////// + + public DepthOfCoverageStats(int[] leftEndpoints) { + this.binLeftEndpoints = leftEndpoints; + granularHistogramBySample = new HashMap(); + totalCoverages = new HashMap(); + nLoci = 0; + totalLocusDepth = 0; + totalDepthOfCoverage = 0; + } + + public void addSample(String sample) { + if ( granularHistogramBySample.containsKey(sample) ) { + return; + } + + int[] binCounts = new int[this.binLeftEndpoints.length+1]; + for ( int b = 0; b < binCounts.length; b ++ ) { + binCounts[b] = 0; + } + + granularHistogramBySample.put(sample,binCounts); + totalCoverages.put(sample,0l); + } + + public void initializeLocusCounts() { + locusCoverageCounts = new int[granularHistogramBySample.size()][binLeftEndpoints.length+1]; + locusHistogram = new int[binLeftEndpoints.length+1]; + for ( int b = 0; b < binLeftEndpoints.length+1; b ++ ) { + for ( int a = 0; a < granularHistogramBySample.size(); a ++ ) { + locusCoverageCounts[a][b] = 0; + } + locusHistogram[b] = 0; + } + + tabulateLocusCounts = true; + } + + //////////////////////////////////////////////////////////////////////////////////// + // UPDATE METHODS + //////////////////////////////////////////////////////////////////////////////////// + + public void update(Map depthBySample) { + int b; + for ( String sample : granularHistogramBySample.keySet() ) { + if ( depthBySample.containsKey(sample) ) { + b = updateSample(sample,depthBySample.get(sample)); + totalLocusDepth += depthBySample.get(sample); + } else { + b = updateSample(sample,0); + } + + if ( tabulateLocusCounts ) { + for ( int i = 0; i <= b; i ++ ) { + locusHistogram[i]++; + } + } + } + updateLocusCounts(locusHistogram); + + nLoci++; + totalDepthOfCoverage += totalLocusDepth; + totalLocusDepth = 0; + } + + private int updateSample(String sample, int depth) { + totalCoverages.put(sample,totalCoverages.get(sample)+depth); + + int[] granularBins = granularHistogramBySample.get(sample); + for ( int b = 0; b < binLeftEndpoints.length; b ++ ) { + if ( depth < binLeftEndpoints[b] ) { + granularBins[b]++; + return b; + } + } + + granularBins[binLeftEndpoints.length]++; // greater than all left-endpoints + return binLeftEndpoints.length; + } + + public void merge(DepthOfCoverageStats newStats) { + this.mergeSamples(newStats); + if ( this.tabulateLocusCounts && newStats.tabulateLocusCounts ) { + this.mergeLocusCounts(newStats.getLocusCounts()); + } + nLoci += newStats.getTotalLoci(); + totalDepthOfCoverage += newStats.getTotalCoverage(); + } + + private void mergeSamples(DepthOfCoverageStats otherStats) { + Map otherHistogram = otherStats.getHistograms(); + Map otherMeans = otherStats.getMeans(); + for ( String s : this.getAllSamples() ) { + int[] internalCounts = granularHistogramBySample.get(s); + int[] externalCounts = otherHistogram.get(s); + for ( int b = 0; b < internalCounts.length; b++ ) { + internalCounts[b] += externalCounts[b]; + } + + this.totalCoverages.put(s, this.totalCoverages.get(s) + otherStats.totalCoverages.get(s)); + } + } + + private void mergeLocusCounts( int[][] otherCounts ) { + for ( int a = 0; a < locusCoverageCounts.length; a ++ ) { + for ( int b = 0; b < locusCoverageCounts[0].length; b ++ ) { + locusCoverageCounts[a][b] += otherCounts[a][b]; + } + } + } + + /* + * Update locus counts -- takes an array in which the number of samples + * with depth ABOVE [i] is held. So if the bin left endpoints were 2, 5, 10 + * then we'd have an array that represented: + * [# samples with depth 0 - inf], [# samples with depth 2 - inf], + * [# samples with depth 5 - inf], [# samples with depth 10-inf]; + * + * this is + * @argument cumulativeSamplesByDepthBin - see above + */ + private void updateLocusCounts(int[] cumulativeSamplesByDepthBin) { + if ( tabulateLocusCounts ) { + for ( int bin = 0; bin < cumulativeSamplesByDepthBin.length; bin ++ ) { + int numSamples = cumulativeSamplesByDepthBin[bin]; + for ( int i = 0; i < numSamples; i ++ ) { + locusCoverageCounts[i][bin]++; + } + + cumulativeSamplesByDepthBin[bin] = 0; // reset counts in advance of next update() + } + } + } + + //////////////////////////////////////////////////////////////////////////////////// + // ACCESSOR METHODS + //////////////////////////////////////////////////////////////////////////////////// + + public Map getHistograms() { + return granularHistogramBySample; + } + + public int[][] getLocusCounts() { + return locusCoverageCounts; + } + + public int[] getEndpoints() { + return binLeftEndpoints; + } + + public Map getMeans() { + HashMap means = new HashMap(); + for ( String s : getAllSamples() ) { + means.put(s,( (double)totalCoverages.get(s))/( (double) nLoci )); + } + + return means; + } + + public long getTotalLoci() { + return nLoci; + } + + public Set getAllSamples() { + return granularHistogramBySample.keySet(); + } + + public double getTotalMeanCoverage() { + return ( (double) totalDepthOfCoverage )/ ( (double) nLoci ); + } + + public long getTotalCoverage() { + return totalDepthOfCoverage; + } + +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatisticsIntegrationTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatisticsIntegrationTest.java similarity index 95% rename from java/test/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatisticsIntegrationTest.java rename to java/test/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatisticsIntegrationTest.java index 28b73ced1..fff2ed301 100644 --- a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatisticsIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatisticsIntegrationTest.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.oneoffprojects.walkers; +package org.broadinstitute.sting.oneoffprojects.walkers.coverage; import org.broadinstitute.sting.WalkerTest; import org.junit.Test;