Coverage histograms now use an expanding array list, so there is no longer a hard limit on maximum depth.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1643 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4ad46590a3
commit
73bec6f36d
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -19,29 +20,37 @@ public class CoverageHistogram extends LocusWalker<Integer,Integer>
|
|||
//@Argument(fullName="start", shortName="start", required=false, doc="start") public Integer START = 0;
|
||||
|
||||
// Private state.
|
||||
long[] coverage_hist;
|
||||
int max_depth;
|
||||
|
||||
long sum_coverage;
|
||||
long num_sites;
|
||||
//long[] coverage_hist;
|
||||
ExpandingArrayList<Integer> coverage_hist = new ExpandingArrayList<Integer>();
|
||||
int max_depth = 0;
|
||||
|
||||
long sum_coverage = 0;
|
||||
long num_sites = 0;
|
||||
|
||||
/////////
|
||||
// Walker Interface Functions
|
||||
public void initialize()
|
||||
{
|
||||
coverage_hist = new long[1000000];
|
||||
Arrays.fill(coverage_hist, 0);
|
||||
max_depth = 0;
|
||||
|
||||
sum_coverage = 0;
|
||||
num_sites = 0;
|
||||
//coverage_hist = new long[1000000];
|
||||
//Arrays.fill(coverage_hist, 0);
|
||||
}
|
||||
|
||||
private void incCov(int depth) {
|
||||
int c = coverage_hist.expandingGet(depth, 0);
|
||||
coverage_hist.set(depth, c + 1);
|
||||
}
|
||||
|
||||
private int getCov(int depth) {
|
||||
return coverage_hist.get(depth);
|
||||
}
|
||||
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context)
|
||||
{
|
||||
if (ref.getBase() == 'N') { return null; }
|
||||
int depth = context.getReads().size();
|
||||
coverage_hist[depth] += 1;
|
||||
incCov(depth);
|
||||
if (depth > max_depth) { max_depth = depth; }
|
||||
|
||||
sum_coverage += depth;
|
||||
|
|
@ -53,40 +62,40 @@ public class CoverageHistogram extends LocusWalker<Integer,Integer>
|
|||
public void onTraversalDone(Integer sum)
|
||||
{
|
||||
double mean_coverage = (double)sum_coverage / (double)num_sites;
|
||||
double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - coverage_hist[0]));
|
||||
double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - getCov(0)));
|
||||
out.printf("# all_sites : mean:%f num_sites:%d%n", mean_coverage, num_sites);
|
||||
out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - coverage_hist[0]);
|
||||
out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - getCov(0));
|
||||
|
||||
// Code for calculting std devs adapted from Michael Melgar's python script
|
||||
|
||||
// Find the maximum extent of 'good' data
|
||||
// First, find the mode
|
||||
long maxValue = coverage_hist[1]; // ignore doc=0
|
||||
long maxValue = getCov(1); // ignore doc=0
|
||||
int mode = 1;
|
||||
for (int i = 2; i <= max_depth; i++) {
|
||||
if (coverage_hist[i] > maxValue) {
|
||||
maxValue = coverage_hist[i];
|
||||
if (getCov(i) > maxValue) {
|
||||
maxValue = getCov(i);
|
||||
mode = i;
|
||||
}
|
||||
}
|
||||
// now, procede to find end of good Gaussian fit
|
||||
long dist = (long)Math.pow(10, 9);
|
||||
while ( Math.abs(coverage_hist[mode] - coverage_hist[1]) < dist )
|
||||
dist = Math.abs(coverage_hist[mode++] - coverage_hist[1]);
|
||||
while ( Math.abs(getCov(mode) - getCov(1)) < dist )
|
||||
dist = Math.abs(getCov(mode++) - getCov(1));
|
||||
int maxGoodDepth = mode + 1;
|
||||
|
||||
// calculate the mean of the good region
|
||||
long totalGoodSites = 0, totalGoodDepth = 0;
|
||||
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
||||
totalGoodSites += coverage_hist[i];
|
||||
totalGoodDepth += i * coverage_hist[i];
|
||||
totalGoodSites += getCov(i);
|
||||
totalGoodDepth += i * getCov(i);
|
||||
}
|
||||
double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites;
|
||||
|
||||
// calculate the variance and standard deviation of the good region
|
||||
double var = 0.0;
|
||||
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
||||
var += coverage_hist[i] * Math.pow(meanGoodDepth - (double)i, 2);
|
||||
var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2);
|
||||
}
|
||||
double stdev = Math.sqrt(var / (double)totalGoodSites);
|
||||
out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev);
|
||||
|
|
@ -97,9 +106,9 @@ public class CoverageHistogram extends LocusWalker<Integer,Integer>
|
|||
out.println("\ndepth count freq(percent)");
|
||||
for (int i = 0; i <= max_depth; i++)
|
||||
{
|
||||
out.printf("%d %d %f\n", i, coverage_hist[i], (100.0*coverage_hist[i]) / (double)num_sites);
|
||||
out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)num_sites);
|
||||
}
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
public Integer reduceInit()
|
||||
|
|
|
|||
|
|
@ -22,13 +22,22 @@ public class ExpandingArrayList<E> extends ArrayList<E> {
|
|||
return null;
|
||||
}
|
||||
|
||||
public E set(int index, E element) {
|
||||
public E expandingGet(int index, E default_value) throws IndexOutOfBoundsException {
|
||||
maybeExpand(index, default_value);
|
||||
return super.get(index);
|
||||
}
|
||||
|
||||
private void maybeExpand(int index, E value) {
|
||||
if ( index >= size() ) {
|
||||
// We need to add null items until we can safely set index to element
|
||||
for ( int i = size(); i <= index; i++ )
|
||||
add(null);
|
||||
add(value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public E set(int index, E element) {
|
||||
maybeExpand(index, null);
|
||||
return super.set(index, element);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue