In order to help learn python, I decided to convert Michael's DoC python script to Java; the CoverageHistogram now spits out standard deviations for a good Gaussian fit.

This code eventually needs to end up in the VariantFiltration system - when we are ready to parameterize on the fly.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1528 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-09-04 02:23:57 +00:00
parent 544900aa99
commit 2241173fff
1 changed files with 42 additions and 4 deletions

View File

@ -52,14 +52,52 @@ public class CoverageHistogram extends LocusWalker<Integer,Integer>
public void onTraversalDone(Integer sum)
{
double mean_coverage = (double)sum_coverage / (double)num_sites;
double mean_coverage = (double)sum_coverage / (double)num_sites;
double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - coverage_hist[0]));
out.printf("# all_sites : mean:%f num_sites:%d%n", mean_coverage, num_sites);
out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n%n", sum_coverage / ((double)(num_sites - coverage_hist[0])), num_sites - coverage_hist[0]);
out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - coverage_hist[0]);
out.println("depth count freq");
// Code for calculting std devs adapted from Michael Melgar's python script
// Find the maximum extent of 'good' data
// First, find the mode
long maxValue = coverage_hist[1]; // ignore doc=0
int mode = 1;
for (int i = 2; i <= max_depth; i++) {
if (coverage_hist[i] > maxValue) {
maxValue = coverage_hist[i];
mode = i;
}
}
// now, procede to find end of good Gaussian fit
long dist = (long)Math.pow(10, 9);
while ( Math.abs(coverage_hist[mode] - coverage_hist[1]) < dist )
dist = Math.abs(coverage_hist[mode++] - coverage_hist[1]);
int maxGoodDepth = mode + 1;
// calculate the mean of the good region
long totalGoodSites = 0, totalGoodDepth = 0;
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
totalGoodSites += coverage_hist[i];
totalGoodDepth += i * coverage_hist[i];
}
double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites;
// calculate the variance and standard deviation of the good region
double var = 0.0;
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
var += coverage_hist[i] * Math.pow(meanGoodDepth - (double)i, 2);
}
double stdev = Math.sqrt(var / (double)totalGoodSites);
out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev);
for (int i = 1; i <= 5; i++)
out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev));
out.println("\ndepth count freq(percent)");
for (int i = 0; i <= max_depth; i++)
{
out.printf("%d %d %f\n", i, coverage_hist[i], (100.0*coverage_hist[i]) / num_sites);
out.printf("%d %d %f\n", i, coverage_hist[i], (100.0*coverage_hist[i]) / (double)num_sites);
}
return;
}