1. Merged CoverageHistogram into DepthOfCoverageWalker
2. Fixed bug in histogram calculation for small intervals 3. Better output in DoCWalker 4. Comments added to code git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2245 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
44b9f60735
commit
01cf5cc741
|
|
@ -60,20 +60,37 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
|||
@Argument(fullName="bySample", shortName="bySample", doc="List read depths for each sample")
|
||||
protected boolean bySample = false;
|
||||
|
||||
@Argument(fullName="printHistogram", shortName="histogram", doc="Print a histogram of the coverage")
|
||||
protected boolean printHistogram = false;
|
||||
|
||||
|
||||
// keep track of the read group and sample names
|
||||
private TreeSet<String> readGroupNames = new TreeSet<String>();
|
||||
private TreeSet<String> sampleNames = new TreeSet<String>();
|
||||
|
||||
// keep track of the histogram data
|
||||
private ExpandingArrayList<Integer> coverageHist = null;
|
||||
private int maxDepth = 0;
|
||||
private int totalLoci = 0;
|
||||
|
||||
// we want to see reads with deletions
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
public void initialize() {
|
||||
|
||||
// initialize histogram array
|
||||
if ( printHistogram ) {
|
||||
coverageHist = new ExpandingArrayList<Integer>();
|
||||
}
|
||||
|
||||
// initialize read group names from BAM header
|
||||
if ( byReadGroup ) {
|
||||
List<SAMReadGroupRecord> readGroups = this.getToolkit().getSAMFileHeader().getReadGroups();
|
||||
for ( SAMReadGroupRecord record : readGroups )
|
||||
readGroupNames.add(record.getReadGroupId());
|
||||
}
|
||||
|
||||
// initialize sample names from BAM header
|
||||
if ( bySample ) {
|
||||
List<SAMReadGroupRecord> readGroups = this.getToolkit().getSAMFileHeader().getReadGroups();
|
||||
for ( SAMReadGroupRecord record : readGroups ) {
|
||||
|
|
@ -83,6 +100,8 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
|||
}
|
||||
}
|
||||
|
||||
// build and print the per-locus header
|
||||
out.println("\nPER_LOCUS_COVERAGE_SECTION");
|
||||
StringBuilder header = new StringBuilder("location\ttotal_coverage\tcoverage_without_deletions");
|
||||
if ( excludeMAPQBelowThis > 0 ) {
|
||||
header.append("\tcoverage_atleast_MQ");
|
||||
|
|
@ -105,6 +124,8 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
|||
|
||||
public DoCInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
// fill in and print all of the per-locus coverage data, then return it to reduce
|
||||
|
||||
ReadBackedPileup pileup = context.getPileup();
|
||||
|
||||
DoCInfo info = new DoCInfo();
|
||||
|
|
@ -141,6 +162,10 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
|||
if ( excludeMAPQBelowThis > 0 )
|
||||
info.numBadMQReads = nBadMAPQReads;
|
||||
|
||||
// if we need to print the histogram, fill in the data
|
||||
if ( printHistogram )
|
||||
incCov(info.totalCoverage);
|
||||
|
||||
printDoCInfo(context.getLocation(), info, false);
|
||||
|
||||
return info;
|
||||
|
|
@ -153,6 +178,9 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
|||
public DoCInfo reduceInit() { return new DoCInfo(); }
|
||||
|
||||
public DoCInfo reduce(DoCInfo value, DoCInfo sum) {
|
||||
|
||||
// combine all of the per-locus data for a given interval
|
||||
|
||||
sum.totalCoverage += value.totalCoverage;
|
||||
sum.numDeletions += value.numDeletions;
|
||||
sum.numBadMQReads += value.numBadMQReads;
|
||||
|
|
@ -178,7 +206,10 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
|||
@Override
|
||||
public void onTraversalDone(List<Pair<GenomeLoc, DoCInfo>> results) {
|
||||
|
||||
StringBuilder header = new StringBuilder("\nlocation\ttotal_coverage\taverage_coverage\tcoverage_without_deletions\taverage_coverage_without_deletions");
|
||||
// build and print the per-interval header
|
||||
out.println("\n\nPER_INTERVAL_COVERAGE_SECTION");
|
||||
|
||||
StringBuilder header = new StringBuilder("location\ttotal_coverage\taverage_coverage\tcoverage_without_deletions\taverage_coverage_without_deletions");
|
||||
if ( excludeMAPQBelowThis > 0 ) {
|
||||
header.append("\tcoverage_atleast_MQ");
|
||||
header.append(excludeMAPQBelowThis);
|
||||
|
|
@ -203,8 +234,77 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
|||
}
|
||||
out.println(header.toString());
|
||||
|
||||
// print all of the individual per-interval coverage data
|
||||
for ( Pair<GenomeLoc, DoCInfo> result : results )
|
||||
printDoCInfo(result.first, result.second, true);
|
||||
|
||||
// if we need to print the histogram, do so now
|
||||
if ( printHistogram )
|
||||
printHisto();
|
||||
}
|
||||
|
||||
private void incCov(int depth) {
|
||||
int c = coverageHist.expandingGet(depth, 0);
|
||||
coverageHist.set(depth, c + 1);
|
||||
if ( depth > maxDepth )
|
||||
maxDepth = depth;
|
||||
totalLoci++;
|
||||
}
|
||||
|
||||
private int getCov(int depth) {
|
||||
return coverageHist.get(depth);
|
||||
}
|
||||
|
||||
private void printHisto() {
|
||||
|
||||
// sanity check
|
||||
if ( totalLoci == 0 )
|
||||
return;
|
||||
|
||||
// Code for calculting std devs adapted from Michael Melgar's python script
|
||||
|
||||
// Find the maximum extent of 'good' data
|
||||
// First, find the mode
|
||||
long maxValue = getCov(1); // ignore doc=0
|
||||
int mode = 1;
|
||||
for (int i = 2; i <= maxDepth; i++) {
|
||||
if ( getCov(i) > maxValue ) {
|
||||
maxValue = getCov(i);
|
||||
mode = i;
|
||||
}
|
||||
}
|
||||
|
||||
// now, procede to find end of good Gaussian fit
|
||||
long dist = (long)Math.pow(10, 9);
|
||||
while ( Math.abs(getCov(mode) - getCov(1)) < dist && mode < maxDepth )
|
||||
dist = Math.abs(getCov(mode++) - getCov(1));
|
||||
int maxGoodDepth = Math.min(mode + 1, maxDepth);
|
||||
|
||||
// calculate the mean of the good region
|
||||
long totalGoodSites = 0, totalGoodDepth = 0;
|
||||
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
||||
totalGoodSites += getCov(i);
|
||||
totalGoodDepth += i * getCov(i);
|
||||
}
|
||||
double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites;
|
||||
|
||||
// calculate the variance and standard deviation of the good region
|
||||
double var = 0.0;
|
||||
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
||||
var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2);
|
||||
}
|
||||
double stdev = Math.sqrt(var / (double)totalGoodSites);
|
||||
|
||||
// print
|
||||
out.println("\n\nHISTOGRAM_SECTION");
|
||||
out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev);
|
||||
|
||||
for (int i = 1; i <= 5; i++)
|
||||
out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev));
|
||||
|
||||
out.println("\ndepth count freq(percent)");
|
||||
for (int i = 0; i <= maxDepth; i++)
|
||||
out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)totalLoci);
|
||||
}
|
||||
|
||||
private void printDoCInfo(GenomeLoc loc, DoCInfo info, boolean printAverageCoverage) {
|
||||
|
|
|
|||
|
|
@ -1,126 +0,0 @@
|
|||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
// Plot a histogram of depth of coverage
|
||||
// j.maguire 6-11-2009
|
||||
|
||||
@By(DataSource.REFERENCE)
|
||||
public class CoverageHistogram extends LocusWalker<Integer,Integer>
|
||||
{
|
||||
//@Argument(fullName="start", shortName="start", required=false, doc="start") public Integer START = 0;
|
||||
|
||||
// Private state.
|
||||
|
||||
//long[] coverage_hist;
|
||||
ExpandingArrayList<Integer> coverage_hist = new ExpandingArrayList<Integer>();
|
||||
int max_depth = 0;
|
||||
|
||||
long sum_coverage = 0;
|
||||
long num_sites = 0;
|
||||
|
||||
/////////
|
||||
// Walker Interface Functions
|
||||
public void initialize()
|
||||
{
|
||||
//coverage_hist = new long[1000000];
|
||||
//Arrays.fill(coverage_hist, 0);
|
||||
}
|
||||
|
||||
private void incCov(int depth) {
|
||||
int c = coverage_hist.expandingGet(depth, 0);
|
||||
coverage_hist.set(depth, c + 1);
|
||||
}
|
||||
|
||||
private int getCov(int depth) {
|
||||
return coverage_hist.get(depth);
|
||||
}
|
||||
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context)
|
||||
{
|
||||
if (ref.getBase() == 'N') { return null; }
|
||||
int depth = context.getReads().size();
|
||||
incCov(depth);
|
||||
if (depth > max_depth) { max_depth = depth; }
|
||||
|
||||
sum_coverage += depth;
|
||||
num_sites += 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer sum)
|
||||
{
|
||||
double mean_coverage = (double)sum_coverage / (double)num_sites;
|
||||
double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - getCov(0)));
|
||||
out.printf("# all_sites : mean:%f num_sites:%d%n", mean_coverage, num_sites);
|
||||
out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - getCov(0));
|
||||
|
||||
// Code for calculting std devs adapted from Michael Melgar's python script
|
||||
|
||||
// Find the maximum extent of 'good' data
|
||||
// First, find the mode
|
||||
long maxValue = getCov(1); // ignore doc=0
|
||||
int mode = 1;
|
||||
for (int i = 2; i <= max_depth; i++) {
|
||||
if (getCov(i) > maxValue) {
|
||||
maxValue = getCov(i);
|
||||
mode = i;
|
||||
}
|
||||
}
|
||||
// now, procede to find end of good Gaussian fit
|
||||
long dist = (long)Math.pow(10, 9);
|
||||
while ( Math.abs(getCov(mode) - getCov(1)) < dist )
|
||||
dist = Math.abs(getCov(mode++) - getCov(1));
|
||||
int maxGoodDepth = mode + 1;
|
||||
|
||||
// calculate the mean of the good region
|
||||
long totalGoodSites = 0, totalGoodDepth = 0;
|
||||
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
||||
totalGoodSites += getCov(i);
|
||||
totalGoodDepth += i * getCov(i);
|
||||
}
|
||||
double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites;
|
||||
|
||||
// calculate the variance and standard deviation of the good region
|
||||
double var = 0.0;
|
||||
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
||||
var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2);
|
||||
}
|
||||
double stdev = Math.sqrt(var / (double)totalGoodSites);
|
||||
out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev);
|
||||
|
||||
for (int i = 1; i <= 5; i++)
|
||||
out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev));
|
||||
|
||||
out.println("\ndepth count freq(percent)");
|
||||
for (int i = 0; i <= max_depth; i++)
|
||||
{
|
||||
out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)num_sites);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public Integer reduceInit()
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer record, Integer sum)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
// END Walker Interface Functions
|
||||
/////////
|
||||
}
|
||||
|
|
@ -13,13 +13,14 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest {
|
|||
private static String root = "-L 1:10,164,500-10,164,520 -R /broad/1KG/reference/human_b36_both.fasta -T DepthOfCoverage -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam";
|
||||
static HashMap<String, String> expectations = new HashMap<String, String>();
|
||||
static {
|
||||
expectations.put("-minMAPQ 1", "59c6071105a598e19f460640c35768c6");
|
||||
expectations.put("-minMAPQ 100", "e997fb5d61eaec21518722b0de90af20");
|
||||
expectations.put("-minDepth 8", "3e50afef0e751119cd27c324bdfae544");
|
||||
expectations.put("-minDepth 10", "d4c336d9e748347e1082bbc92d2489a3");
|
||||
expectations.put("-bySample", "160ffa185dbfa8b0d2dc57f60f5b1e48");
|
||||
expectations.put("-byRG", "dd3b4d040df7325dad4760ac6fa5252d");
|
||||
expectations.put("-minMAPQ 1 -bySample -byRG", "bd2a07ef548b86e82ac6cce534225612");
|
||||
expectations.put("-minMAPQ 1", "8b73fad5cce4620907d5da2a985219d5");
|
||||
expectations.put("-minMAPQ 100", "1a959892d8ad0523dac2fb097eacb3c2");
|
||||
expectations.put("-minDepth 8", "6e8c6b6d78962d110c87ad905fa5b664");
|
||||
expectations.put("-minDepth 10", "14399e1237866540af3f1aee149030d0");
|
||||
expectations.put("-bySample", "93358437153b4d65bdff747e33de1d63");
|
||||
expectations.put("-byRG", "777e8427eb4bdad300b23800cb7b0592");
|
||||
expectations.put("-histogram", "96f15e1d9d598d48191e20ee84715d46");
|
||||
expectations.put("-minMAPQ 1 -bySample -byRG -minDepth 8 -histogram", "783b0bc83c54d883efa8383a379ff17b");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -41,7 +42,7 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest {
|
|||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T DepthOfCoverage -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam -L 1:10,001,890-10,001,895 -o %s",
|
||||
1, // just one output file
|
||||
Arrays.asList("51203ba5ab928449cd01363af0b91510"));
|
||||
Arrays.asList("a332d1539b29dff615b198818a3d4dd1"));
|
||||
executeTest("testDepthOfCoverage454", spec);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue