diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java index 94739d151..0d9a3d2c7 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.utils.ExpandingArrayList; import java.util.*; @@ -19,29 +20,37 @@ public class CoverageHistogram extends LocusWalker //@Argument(fullName="start", shortName="start", required=false, doc="start") public Integer START = 0; // Private state. - long[] coverage_hist; - int max_depth; - long sum_coverage; - long num_sites; + //long[] coverage_hist; + ExpandingArrayList coverage_hist = new ExpandingArrayList(); + int max_depth = 0; + + long sum_coverage = 0; + long num_sites = 0; ///////// // Walker Interface Functions public void initialize() { - coverage_hist = new long[1000000]; - Arrays.fill(coverage_hist, 0); - max_depth = 0; - - sum_coverage = 0; - num_sites = 0; + //coverage_hist = new long[1000000]; + //Arrays.fill(coverage_hist, 0); } + private void incCov(int depth) { + int c = coverage_hist.expandingGet(depth, 0); + coverage_hist.set(depth, c + 1); + } + + private int getCov(int depth) { + return coverage_hist.get(depth); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (ref.getBase() == 'N') { return null; } int depth = context.getReads().size(); - coverage_hist[depth] += 1; + incCov(depth); if (depth > max_depth) { max_depth = depth; } sum_coverage += depth; @@ -53,40 +62,40 @@ public class CoverageHistogram extends LocusWalker public void onTraversalDone(Integer sum) { double mean_coverage = (double)sum_coverage / (double)num_sites; - double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - coverage_hist[0])); + double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - getCov(0))); out.printf("# all_sites : mean:%f num_sites:%d%n", mean_coverage, num_sites); - out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - coverage_hist[0]); + out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - getCov(0)); // Code for calculting std devs adapted from Michael Melgar's python script // Find the maximum extent of 'good' data // First, find the mode - long maxValue = coverage_hist[1]; // ignore doc=0 + long maxValue = getCov(1); // ignore doc=0 int mode = 1; for (int i = 2; i <= max_depth; i++) { - if (coverage_hist[i] > maxValue) { - maxValue = coverage_hist[i]; + if (getCov(i) > maxValue) { + maxValue = getCov(i); mode = i; } } // now, procede to find end of good Gaussian fit long dist = (long)Math.pow(10, 9); - while ( Math.abs(coverage_hist[mode] - coverage_hist[1]) < dist ) - dist = Math.abs(coverage_hist[mode++] - coverage_hist[1]); + while ( Math.abs(getCov(mode) - getCov(1)) < dist ) + dist = Math.abs(getCov(mode++) - getCov(1)); int maxGoodDepth = mode + 1; // calculate the mean of the good region long totalGoodSites = 0, totalGoodDepth = 0; for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0 - totalGoodSites += coverage_hist[i]; - totalGoodDepth += i * coverage_hist[i]; + totalGoodSites += getCov(i); + totalGoodDepth += i * getCov(i); } double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites; // calculate the variance and standard deviation of the good region double var = 0.0; for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0 - var += coverage_hist[i] * Math.pow(meanGoodDepth - (double)i, 2); + var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2); } double stdev = Math.sqrt(var / (double)totalGoodSites); out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev); @@ -97,9 +106,9 @@ public class CoverageHistogram extends LocusWalker out.println("\ndepth count freq(percent)"); for (int i = 0; i <= max_depth; i++) { - out.printf("%d %d %f\n", i, coverage_hist[i], (100.0*coverage_hist[i]) / (double)num_sites); + out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)num_sites); } - return; + } public Integer reduceInit() diff --git a/java/src/org/broadinstitute/sting/utils/ExpandingArrayList.java b/java/src/org/broadinstitute/sting/utils/ExpandingArrayList.java index 10f3f8c95..2c32ceafc 100755 --- a/java/src/org/broadinstitute/sting/utils/ExpandingArrayList.java +++ b/java/src/org/broadinstitute/sting/utils/ExpandingArrayList.java @@ -22,13 +22,22 @@ public class ExpandingArrayList extends ArrayList { return null; } - public E set(int index, E element) { + public E expandingGet(int index, E default_value) throws IndexOutOfBoundsException { + maybeExpand(index, default_value); + return super.get(index); + } + + private void maybeExpand(int index, E value) { if ( index >= size() ) { // We need to add null items until we can safely set index to element for ( int i = size(); i <= index; i++ ) - add(null); + add(value); } + } + + public E set(int index, E element) { + maybeExpand(index, null); return super.set(index, element); } } \ No newline at end of file