In order to help learn python, I decided to convert Michael's DoC python script to Java; the CoverageHistogram now spits out standard deviations for a good Gaussian fit.
This code eventually needs to end up in the VariantFiltration system - when we are ready to parameterize on the fly. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1528 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
544900aa99
commit
2241173fff
|
|
@ -53,13 +53,51 @@ public class CoverageHistogram extends LocusWalker<Integer,Integer>
|
||||||
public void onTraversalDone(Integer sum)
|
public void onTraversalDone(Integer sum)
|
||||||
{
|
{
|
||||||
double mean_coverage = (double)sum_coverage / (double)num_sites;
|
double mean_coverage = (double)sum_coverage / (double)num_sites;
|
||||||
|
double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - coverage_hist[0]));
|
||||||
out.printf("# all_sites : mean:%f num_sites:%d%n", mean_coverage, num_sites);
|
out.printf("# all_sites : mean:%f num_sites:%d%n", mean_coverage, num_sites);
|
||||||
out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n%n", sum_coverage / ((double)(num_sites - coverage_hist[0])), num_sites - coverage_hist[0]);
|
out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - coverage_hist[0]);
|
||||||
|
|
||||||
out.println("depth count freq");
|
// Code for calculting std devs adapted from Michael Melgar's python script
|
||||||
|
|
||||||
|
// Find the maximum extent of 'good' data
|
||||||
|
// First, find the mode
|
||||||
|
long maxValue = coverage_hist[1]; // ignore doc=0
|
||||||
|
int mode = 1;
|
||||||
|
for (int i = 2; i <= max_depth; i++) {
|
||||||
|
if (coverage_hist[i] > maxValue) {
|
||||||
|
maxValue = coverage_hist[i];
|
||||||
|
mode = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// now, procede to find end of good Gaussian fit
|
||||||
|
long dist = (long)Math.pow(10, 9);
|
||||||
|
while ( Math.abs(coverage_hist[mode] - coverage_hist[1]) < dist )
|
||||||
|
dist = Math.abs(coverage_hist[mode++] - coverage_hist[1]);
|
||||||
|
int maxGoodDepth = mode + 1;
|
||||||
|
|
||||||
|
// calculate the mean of the good region
|
||||||
|
long totalGoodSites = 0, totalGoodDepth = 0;
|
||||||
|
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
||||||
|
totalGoodSites += coverage_hist[i];
|
||||||
|
totalGoodDepth += i * coverage_hist[i];
|
||||||
|
}
|
||||||
|
double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites;
|
||||||
|
|
||||||
|
// calculate the variance and standard deviation of the good region
|
||||||
|
double var = 0.0;
|
||||||
|
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
||||||
|
var += coverage_hist[i] * Math.pow(meanGoodDepth - (double)i, 2);
|
||||||
|
}
|
||||||
|
double stdev = Math.sqrt(var / (double)totalGoodSites);
|
||||||
|
out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev);
|
||||||
|
|
||||||
|
for (int i = 1; i <= 5; i++)
|
||||||
|
out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev));
|
||||||
|
|
||||||
|
out.println("\ndepth count freq(percent)");
|
||||||
for (int i = 0; i <= max_depth; i++)
|
for (int i = 0; i <= max_depth; i++)
|
||||||
{
|
{
|
||||||
out.printf("%d %d %f\n", i, coverage_hist[i], (100.0*coverage_hist[i]) / num_sites);
|
out.printf("%d %d %f\n", i, coverage_hist[i], (100.0*coverage_hist[i]) / (double)num_sites);
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue