Now uses expanding array list for coverage histograms. No hard limit on maximum depth now

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1643 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-09-16 23:27:25 +00:00
parent 4ad46590a3
commit 73bec6f36d
2 changed files with 43 additions and 25 deletions

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.utils.ExpandingArrayList;
import java.util.*;
@ -19,29 +20,37 @@ public class CoverageHistogram extends LocusWalker<Integer,Integer>
//@Argument(fullName="start", shortName="start", required=false, doc="start") public Integer START = 0;
// Private state.
long[] coverage_hist;
int max_depth;
long sum_coverage;
long num_sites;
//long[] coverage_hist;
ExpandingArrayList<Integer> coverage_hist = new ExpandingArrayList<Integer>();
int max_depth = 0;
long sum_coverage = 0;
long num_sites = 0;
/////////
// Walker Interface Functions
public void initialize()
{
coverage_hist = new long[1000000];
Arrays.fill(coverage_hist, 0);
max_depth = 0;
sum_coverage = 0;
num_sites = 0;
//coverage_hist = new long[1000000];
//Arrays.fill(coverage_hist, 0);
}
private void incCov(int depth) {
int c = coverage_hist.expandingGet(depth, 0);
coverage_hist.set(depth, c + 1);
}
private int getCov(int depth) {
return coverage_hist.get(depth);
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context)
{
if (ref.getBase() == 'N') { return null; }
int depth = context.getReads().size();
coverage_hist[depth] += 1;
incCov(depth);
if (depth > max_depth) { max_depth = depth; }
sum_coverage += depth;
@ -53,40 +62,40 @@ public class CoverageHistogram extends LocusWalker<Integer,Integer>
public void onTraversalDone(Integer sum)
{
double mean_coverage = (double)sum_coverage / (double)num_sites;
double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - coverage_hist[0]));
double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - getCov(0)));
out.printf("# all_sites : mean:%f num_sites:%d%n", mean_coverage, num_sites);
out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - coverage_hist[0]);
out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - getCov(0));
// Code for calculting std devs adapted from Michael Melgar's python script
// Find the maximum extent of 'good' data
// First, find the mode
long maxValue = coverage_hist[1]; // ignore doc=0
long maxValue = getCov(1); // ignore doc=0
int mode = 1;
for (int i = 2; i <= max_depth; i++) {
if (coverage_hist[i] > maxValue) {
maxValue = coverage_hist[i];
if (getCov(i) > maxValue) {
maxValue = getCov(i);
mode = i;
}
}
// now, procede to find end of good Gaussian fit
long dist = (long)Math.pow(10, 9);
while ( Math.abs(coverage_hist[mode] - coverage_hist[1]) < dist )
dist = Math.abs(coverage_hist[mode++] - coverage_hist[1]);
while ( Math.abs(getCov(mode) - getCov(1)) < dist )
dist = Math.abs(getCov(mode++) - getCov(1));
int maxGoodDepth = mode + 1;
// calculate the mean of the good region
long totalGoodSites = 0, totalGoodDepth = 0;
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
totalGoodSites += coverage_hist[i];
totalGoodDepth += i * coverage_hist[i];
totalGoodSites += getCov(i);
totalGoodDepth += i * getCov(i);
}
double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites;
// calculate the variance and standard deviation of the good region
double var = 0.0;
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
var += coverage_hist[i] * Math.pow(meanGoodDepth - (double)i, 2);
var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2);
}
double stdev = Math.sqrt(var / (double)totalGoodSites);
out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev);
@ -97,9 +106,9 @@ public class CoverageHistogram extends LocusWalker<Integer,Integer>
out.println("\ndepth count freq(percent)");
for (int i = 0; i <= max_depth; i++)
{
out.printf("%d %d %f\n", i, coverage_hist[i], (100.0*coverage_hist[i]) / (double)num_sites);
out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)num_sites);
}
return;
}
public Integer reduceInit()

View File

@ -22,13 +22,22 @@ public class ExpandingArrayList<E> extends ArrayList<E> {
return null;
}
public E set(int index, E element) {
public E expandingGet(int index, E default_value) throws IndexOutOfBoundsException {
maybeExpand(index, default_value);
return super.get(index);
}
private void maybeExpand(int index, E value) {
if ( index >= size() ) {
// We need to add null items until we can safely set index to element
for ( int i = size(); i <= index; i++ )
add(null);
add(value);
}
}
public E set(int index, E element) {
maybeExpand(index, null);
return super.set(index, element);
}
}