1. Merged CoverageHistogram into DepthOfCoverageWalker
2. Fixed bug in histogram calculation for small intervals
3. Better output in DoCWalker
4. Comments added to code



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2245 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-03 17:01:53 +00:00
parent 44b9f60735
commit 01cf5cc741
3 changed files with 110 additions and 135 deletions

View File

@ -60,20 +60,37 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
@Argument(fullName="bySample", shortName="bySample", doc="List read depths for each sample")
protected boolean bySample = false;
@Argument(fullName="printHistogram", shortName="histogram", doc="Print a histogram of the coverage")
protected boolean printHistogram = false;
// keep track of the read group and sample names
private TreeSet<String> readGroupNames = new TreeSet<String>();
private TreeSet<String> sampleNames = new TreeSet<String>();
// keep track of the histogram data
private ExpandingArrayList<Integer> coverageHist = null;
private int maxDepth = 0;
private int totalLoci = 0;
// we want to see reads with deletions: tell the traversal engine to include
// deletion-spanning reads in the pileups handed to map()
public boolean includeReadsWithDeletionAtLoci() { return true; }
public void initialize() {
// initialize histogram array
if ( printHistogram ) {
coverageHist = new ExpandingArrayList<Integer>();
}
// initialize read group names from BAM header
if ( byReadGroup ) {
List<SAMReadGroupRecord> readGroups = this.getToolkit().getSAMFileHeader().getReadGroups();
for ( SAMReadGroupRecord record : readGroups )
readGroupNames.add(record.getReadGroupId());
}
// initialize sample names from BAM header
if ( bySample ) {
List<SAMReadGroupRecord> readGroups = this.getToolkit().getSAMFileHeader().getReadGroups();
for ( SAMReadGroupRecord record : readGroups ) {
@ -83,6 +100,8 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
}
}
// build and print the per-locus header
out.println("\nPER_LOCUS_COVERAGE_SECTION");
StringBuilder header = new StringBuilder("location\ttotal_coverage\tcoverage_without_deletions");
if ( excludeMAPQBelowThis > 0 ) {
header.append("\tcoverage_atleast_MQ");
@ -105,6 +124,8 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
public DoCInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
// fill in and print all of the per-locus coverage data, then return it to reduce
ReadBackedPileup pileup = context.getPileup();
DoCInfo info = new DoCInfo();
@ -141,6 +162,10 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
if ( excludeMAPQBelowThis > 0 )
info.numBadMQReads = nBadMAPQReads;
// if we need to print the histogram, fill in the data
if ( printHistogram )
incCov(info.totalCoverage);
printDoCInfo(context.getLocation(), info, false);
return info;
@ -153,6 +178,9 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
// seed reduce() with an empty (all-zero) DoCInfo accumulator for each interval
public DoCInfo reduceInit() { return new DoCInfo(); }
public DoCInfo reduce(DoCInfo value, DoCInfo sum) {
// combine all of the per-locus data for a given interval
sum.totalCoverage += value.totalCoverage;
sum.numDeletions += value.numDeletions;
sum.numBadMQReads += value.numBadMQReads;
@ -178,7 +206,10 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
@Override
public void onTraversalDone(List<Pair<GenomeLoc, DoCInfo>> results) {
StringBuilder header = new StringBuilder("\nlocation\ttotal_coverage\taverage_coverage\tcoverage_without_deletions\taverage_coverage_without_deletions");
// build and print the per-interval header
out.println("\n\nPER_INTERVAL_COVERAGE_SECTION");
StringBuilder header = new StringBuilder("location\ttotal_coverage\taverage_coverage\tcoverage_without_deletions\taverage_coverage_without_deletions");
if ( excludeMAPQBelowThis > 0 ) {
header.append("\tcoverage_atleast_MQ");
header.append(excludeMAPQBelowThis);
@ -203,8 +234,77 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
}
out.println(header.toString());
// print all of the individual per-interval coverage data
for ( Pair<GenomeLoc, DoCInfo> result : results )
printDoCInfo(result.first, result.second, true);
// if we need to print the histogram, do so now
if ( printHistogram )
printHisto();
}
// Record one locus observed at the given depth in the coverage histogram,
// tracking both the deepest depth seen and the total number of loci tallied.
private void incCov(int depth) {
    final int previousCount = coverageHist.expandingGet(depth, 0);
    coverageHist.set(depth, previousCount + 1);
    maxDepth = Math.max(maxDepth, depth);
    totalLoci++;
}
// Number of loci recorded at exactly this depth.
// NOTE(review): assumes this bin was populated by a prior incCov() call at an
// equal-or-greater depth -- confirm ExpandingArrayList.get() behavior for
// never-touched indices.
private int getCov(int depth) {
return coverageHist.get(depth);
}
// Compute and print the coverage histogram plus summary statistics for the
// "good" (approximately Gaussian) portion of the depth distribution.
private void printHisto() {
    // sanity checks: nothing to report if no loci were seen; additionally, the
    // Gaussian fit below reads getCov(1), which is only populated once at
    // least one locus had depth >= 1, so bail out when maxDepth == 0 rather
    // than index an untouched histogram bin
    if ( totalLoci == 0 || maxDepth == 0 )
        return;

    // Code for calculating std devs adapted from Michael Melgar's python script

    // Find the maximum extent of 'good' data.
    // First, find the mode of the distribution (ignoring depth 0).
    long maxValue = getCov(1);
    int mode = 1;
    for (int i = 2; i <= maxDepth; i++) {
        if ( getCov(i) > maxValue ) {
            maxValue = getCov(i);
            mode = i;
        }
    }

    // now, proceed to find the end of the good Gaussian fit: walk right from
    // the mode while the counts keep approaching the depth=1 count, but never
    // past maxDepth (important for small intervals)
    long dist = (long)Math.pow(10, 9);
    while ( Math.abs(getCov(mode) - getCov(1)) < dist && mode < maxDepth )
        dist = Math.abs(getCov(mode++) - getCov(1));
    int maxGoodDepth = Math.min(mode + 1, maxDepth);

    // calculate the mean of the good region (ignoring depth 0); the region
    // always contains the mode bin, so totalGoodSites >= 1 here
    long totalGoodSites = 0, totalGoodDepth = 0;
    for (int i = 1; i <= maxGoodDepth; i++) {
        totalGoodSites += getCov(i);
        totalGoodDepth += i * getCov(i);
    }
    double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites;

    // calculate the variance and standard deviation of the good region
    double var = 0.0;
    for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
        var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2);
    }
    double stdev = Math.sqrt(var / (double)totalGoodSites);

    // print the summary lines, then one histogram row per observed depth
    out.println("\n\nHISTOGRAM_SECTION");
    out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev);
    for (int i = 1; i <= 5; i++)
        out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev));
    out.println("\ndepth count freq(percent)");
    for (int i = 0; i <= maxDepth; i++)
        out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)totalLoci);
}
private void printDoCInfo(GenomeLoc loc, DoCInfo info, boolean printAverageCoverage) {

View File

@ -1,126 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.utils.ExpandingArrayList;
import java.util.*;
// Plot a histogram of depth of coverage
// j.maguire 6-11-2009
@By(DataSource.REFERENCE)
public class CoverageHistogram extends LocusWalker<Integer,Integer>
{
    //@Argument(fullName="start", shortName="start", required=false, doc="start") public Integer START = 0;

    // Private state.
    // coverage_hist.get(d) holds the number of sites observed at depth d
    ExpandingArrayList<Integer> coverage_hist = new ExpandingArrayList<Integer>();
    int max_depth = 0;      // deepest pileup seen at any site
    long sum_coverage = 0;  // total read depth summed over all visited sites
    long num_sites = 0;     // number of non-N reference sites visited

    /////////
    // Walker Interface Functions

    public void initialize()
    {
    }

    // bump the histogram bin for the given depth
    private void incCov(int depth) {
        int c = coverage_hist.expandingGet(depth, 0);
        coverage_hist.set(depth, c + 1);
    }

    // number of sites recorded at exactly this depth
    private int getCov(int depth) {
        return coverage_hist.get(depth);
    }

    // Tally the pileup depth at each non-N reference site.
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context)
    {
        if (ref.getBase() == 'N') { return null; }
        int depth = context.getReads().size();
        incCov(depth);
        if (depth > max_depth) { max_depth = depth; }
        sum_coverage += depth;
        num_sites += 1;
        return 0;
    }

    // Print overall coverage statistics, a Gaussian fit of the "good" part of
    // the distribution, and the full depth histogram.
    public void onTraversalDone(Integer sum)
    {
        // nothing to report if no sites were seen; the Gaussian fit below
        // reads getCov(1), which is only populated once max_depth >= 1
        if (num_sites == 0 || max_depth == 0)
            return;

        double mean_coverage = (double)sum_coverage / (double)num_sites;
        double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - getCov(0)));

        out.printf("# all_sites : mean:%f num_sites:%d%n", mean_coverage, num_sites);
        out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - getCov(0));

        // Code for calculating std devs adapted from Michael Melgar's python script

        // Find the maximum extent of 'good' data.
        // First, find the mode of the distribution (ignoring depth 0).
        long maxValue = getCov(1);
        int mode = 1;
        for (int i = 2; i <= max_depth; i++) {
            if (getCov(i) > maxValue) {
                maxValue = getCov(i);
                mode = i;
            }
        }

        // now, proceed to find the end of the good Gaussian fit; bound the
        // walk by max_depth -- the previous unbounded walk could index past
        // the populated end of the histogram on small intervals
        long dist = (long)Math.pow(10, 9);
        while ( Math.abs(getCov(mode) - getCov(1)) < dist && mode < max_depth )
            dist = Math.abs(getCov(mode++) - getCov(1));
        int maxGoodDepth = Math.min(mode + 1, max_depth);

        // calculate the mean of the good region (ignoring depth 0)
        long totalGoodSites = 0, totalGoodDepth = 0;
        for (int i = 1; i <= maxGoodDepth; i++) {
            totalGoodSites += getCov(i);
            totalGoodDepth += i * getCov(i);
        }
        double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites;

        // calculate the variance and standard deviation of the good region
        double var = 0.0;
        for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
            var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2);
        }
        double stdev = Math.sqrt(var / (double)totalGoodSites);

        out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev);
        for (int i = 1; i <= 5; i++)
            out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev));

        out.println("\ndepth count freq(percent)");
        for (int i = 0; i <= max_depth; i++)
        {
            out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)num_sites);
        }
    }

    public Integer reduceInit()
    {
        return 0;
    }

    // per-locus results are accumulated in the walker's fields, so reduce is a no-op
    public Integer reduce(Integer record, Integer sum)
    {
        return 0;
    }

    // END Walker Interface Functions
    /////////
}

View File

@ -13,13 +13,14 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest {
private static String root = "-L 1:10,164,500-10,164,520 -R /broad/1KG/reference/human_b36_both.fasta -T DepthOfCoverage -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam";
static HashMap<String, String> expectations = new HashMap<String, String>();
// Expected MD5 checksums of DepthOfCoverage output for each argument combination.
// NOTE(review): several keys appear twice below; HashMap.put() overwrites, so
// only the later value for each key is effective -- looks like old and new
// checksums from a diff were both retained; confirm and drop the stale lines.
static {
expectations.put("-minMAPQ 1", "59c6071105a598e19f460640c35768c6");
expectations.put("-minMAPQ 100", "e997fb5d61eaec21518722b0de90af20");
expectations.put("-minDepth 8", "3e50afef0e751119cd27c324bdfae544");
expectations.put("-minDepth 10", "d4c336d9e748347e1082bbc92d2489a3");
expectations.put("-bySample", "160ffa185dbfa8b0d2dc57f60f5b1e48");
expectations.put("-byRG", "dd3b4d040df7325dad4760ac6fa5252d");
expectations.put("-minMAPQ 1 -bySample -byRG", "bd2a07ef548b86e82ac6cce534225612");
expectations.put("-minMAPQ 1", "8b73fad5cce4620907d5da2a985219d5");
expectations.put("-minMAPQ 100", "1a959892d8ad0523dac2fb097eacb3c2");
expectations.put("-minDepth 8", "6e8c6b6d78962d110c87ad905fa5b664");
expectations.put("-minDepth 10", "14399e1237866540af3f1aee149030d0");
expectations.put("-bySample", "93358437153b4d65bdff747e33de1d63");
expectations.put("-byRG", "777e8427eb4bdad300b23800cb7b0592");
expectations.put("-histogram", "96f15e1d9d598d48191e20ee84715d46");
expectations.put("-minMAPQ 1 -bySample -byRG -minDepth 8 -histogram", "783b0bc83c54d883efa8383a379ff17b");
}
@Test
@ -41,7 +42,7 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T DepthOfCoverage -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam -L 1:10,001,890-10,001,895 -o %s",
1, // just one output file
Arrays.asList("51203ba5ab928449cd01363af0b91510"));
Arrays.asList("a332d1539b29dff615b198818a3d4dd1"));
executeTest("testDepthOfCoverage454", spec);
}
}