1. Merged CoverageHistogram into DepthOfCoverageWalker
2. Fixed bug in histogram calculation for small intervals
3. Better output in DoCWalker
4. Comments added to code

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2245 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
44b9f60735
commit
01cf5cc741
|
|
@ -60,20 +60,37 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
||||||
@Argument(fullName="bySample", shortName="bySample", doc="List read depths for each sample")
|
@Argument(fullName="bySample", shortName="bySample", doc="List read depths for each sample")
|
||||||
protected boolean bySample = false;
|
protected boolean bySample = false;
|
||||||
|
|
||||||
|
@Argument(fullName="printHistogram", shortName="histogram", doc="Print a histogram of the coverage")
|
||||||
|
protected boolean printHistogram = false;
|
||||||
|
|
||||||
|
|
||||||
// keep track of the read group and sample names
|
// keep track of the read group and sample names
|
||||||
private TreeSet<String> readGroupNames = new TreeSet<String>();
|
private TreeSet<String> readGroupNames = new TreeSet<String>();
|
||||||
private TreeSet<String> sampleNames = new TreeSet<String>();
|
private TreeSet<String> sampleNames = new TreeSet<String>();
|
||||||
|
|
||||||
|
// keep track of the histogram data
|
||||||
|
private ExpandingArrayList<Integer> coverageHist = null;
|
||||||
|
private int maxDepth = 0;
|
||||||
|
private int totalLoci = 0;
|
||||||
|
|
||||||
|
// we want to see reads with deletions
|
||||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
||||||
|
// initialize histogram array
|
||||||
|
if ( printHistogram ) {
|
||||||
|
coverageHist = new ExpandingArrayList<Integer>();
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize read group names from BAM header
|
||||||
if ( byReadGroup ) {
|
if ( byReadGroup ) {
|
||||||
List<SAMReadGroupRecord> readGroups = this.getToolkit().getSAMFileHeader().getReadGroups();
|
List<SAMReadGroupRecord> readGroups = this.getToolkit().getSAMFileHeader().getReadGroups();
|
||||||
for ( SAMReadGroupRecord record : readGroups )
|
for ( SAMReadGroupRecord record : readGroups )
|
||||||
readGroupNames.add(record.getReadGroupId());
|
readGroupNames.add(record.getReadGroupId());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// initialize sample names from BAM header
|
||||||
if ( bySample ) {
|
if ( bySample ) {
|
||||||
List<SAMReadGroupRecord> readGroups = this.getToolkit().getSAMFileHeader().getReadGroups();
|
List<SAMReadGroupRecord> readGroups = this.getToolkit().getSAMFileHeader().getReadGroups();
|
||||||
for ( SAMReadGroupRecord record : readGroups ) {
|
for ( SAMReadGroupRecord record : readGroups ) {
|
||||||
|
|
@ -83,6 +100,8 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// build and print the per-locus header
|
||||||
|
out.println("\nPER_LOCUS_COVERAGE_SECTION");
|
||||||
StringBuilder header = new StringBuilder("location\ttotal_coverage\tcoverage_without_deletions");
|
StringBuilder header = new StringBuilder("location\ttotal_coverage\tcoverage_without_deletions");
|
||||||
if ( excludeMAPQBelowThis > 0 ) {
|
if ( excludeMAPQBelowThis > 0 ) {
|
||||||
header.append("\tcoverage_atleast_MQ");
|
header.append("\tcoverage_atleast_MQ");
|
||||||
|
|
@ -105,6 +124,8 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
||||||
|
|
||||||
public DoCInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public DoCInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
|
||||||
|
// fill in and print all of the per-locus coverage data, then return it to reduce
|
||||||
|
|
||||||
ReadBackedPileup pileup = context.getPileup();
|
ReadBackedPileup pileup = context.getPileup();
|
||||||
|
|
||||||
DoCInfo info = new DoCInfo();
|
DoCInfo info = new DoCInfo();
|
||||||
|
|
@ -141,6 +162,10 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
||||||
if ( excludeMAPQBelowThis > 0 )
|
if ( excludeMAPQBelowThis > 0 )
|
||||||
info.numBadMQReads = nBadMAPQReads;
|
info.numBadMQReads = nBadMAPQReads;
|
||||||
|
|
||||||
|
// if we need to print the histogram, fill in the data
|
||||||
|
if ( printHistogram )
|
||||||
|
incCov(info.totalCoverage);
|
||||||
|
|
||||||
printDoCInfo(context.getLocation(), info, false);
|
printDoCInfo(context.getLocation(), info, false);
|
||||||
|
|
||||||
return info;
|
return info;
|
||||||
|
|
@ -153,6 +178,9 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
||||||
public DoCInfo reduceInit() { return new DoCInfo(); }
|
public DoCInfo reduceInit() { return new DoCInfo(); }
|
||||||
|
|
||||||
public DoCInfo reduce(DoCInfo value, DoCInfo sum) {
|
public DoCInfo reduce(DoCInfo value, DoCInfo sum) {
|
||||||
|
|
||||||
|
// combine all of the per-locus data for a given interval
|
||||||
|
|
||||||
sum.totalCoverage += value.totalCoverage;
|
sum.totalCoverage += value.totalCoverage;
|
||||||
sum.numDeletions += value.numDeletions;
|
sum.numDeletions += value.numDeletions;
|
||||||
sum.numBadMQReads += value.numBadMQReads;
|
sum.numBadMQReads += value.numBadMQReads;
|
||||||
|
|
@ -178,7 +206,10 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
||||||
@Override
|
@Override
|
||||||
public void onTraversalDone(List<Pair<GenomeLoc, DoCInfo>> results) {
|
public void onTraversalDone(List<Pair<GenomeLoc, DoCInfo>> results) {
|
||||||
|
|
||||||
StringBuilder header = new StringBuilder("\nlocation\ttotal_coverage\taverage_coverage\tcoverage_without_deletions\taverage_coverage_without_deletions");
|
// build and print the per-interval header
|
||||||
|
out.println("\n\nPER_INTERVAL_COVERAGE_SECTION");
|
||||||
|
|
||||||
|
StringBuilder header = new StringBuilder("location\ttotal_coverage\taverage_coverage\tcoverage_without_deletions\taverage_coverage_without_deletions");
|
||||||
if ( excludeMAPQBelowThis > 0 ) {
|
if ( excludeMAPQBelowThis > 0 ) {
|
||||||
header.append("\tcoverage_atleast_MQ");
|
header.append("\tcoverage_atleast_MQ");
|
||||||
header.append(excludeMAPQBelowThis);
|
header.append(excludeMAPQBelowThis);
|
||||||
|
|
@ -203,8 +234,77 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
||||||
}
|
}
|
||||||
out.println(header.toString());
|
out.println(header.toString());
|
||||||
|
|
||||||
|
// print all of the individual per-interval coverage data
|
||||||
for ( Pair<GenomeLoc, DoCInfo> result : results )
|
for ( Pair<GenomeLoc, DoCInfo> result : results )
|
||||||
printDoCInfo(result.first, result.second, true);
|
printDoCInfo(result.first, result.second, true);
|
||||||
|
|
||||||
|
// if we need to print the histogram, do so now
|
||||||
|
if ( printHistogram )
|
||||||
|
printHisto();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void incCov(int depth) {
|
||||||
|
int c = coverageHist.expandingGet(depth, 0);
|
||||||
|
coverageHist.set(depth, c + 1);
|
||||||
|
if ( depth > maxDepth )
|
||||||
|
maxDepth = depth;
|
||||||
|
totalLoci++;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int getCov(int depth) {
|
||||||
|
return coverageHist.get(depth);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void printHisto() {
|
||||||
|
|
||||||
|
// sanity check
|
||||||
|
if ( totalLoci == 0 )
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Code for calculting std devs adapted from Michael Melgar's python script
|
||||||
|
|
||||||
|
// Find the maximum extent of 'good' data
|
||||||
|
// First, find the mode
|
||||||
|
long maxValue = getCov(1); // ignore doc=0
|
||||||
|
int mode = 1;
|
||||||
|
for (int i = 2; i <= maxDepth; i++) {
|
||||||
|
if ( getCov(i) > maxValue ) {
|
||||||
|
maxValue = getCov(i);
|
||||||
|
mode = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// now, procede to find end of good Gaussian fit
|
||||||
|
long dist = (long)Math.pow(10, 9);
|
||||||
|
while ( Math.abs(getCov(mode) - getCov(1)) < dist && mode < maxDepth )
|
||||||
|
dist = Math.abs(getCov(mode++) - getCov(1));
|
||||||
|
int maxGoodDepth = Math.min(mode + 1, maxDepth);
|
||||||
|
|
||||||
|
// calculate the mean of the good region
|
||||||
|
long totalGoodSites = 0, totalGoodDepth = 0;
|
||||||
|
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
||||||
|
totalGoodSites += getCov(i);
|
||||||
|
totalGoodDepth += i * getCov(i);
|
||||||
|
}
|
||||||
|
double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites;
|
||||||
|
|
||||||
|
// calculate the variance and standard deviation of the good region
|
||||||
|
double var = 0.0;
|
||||||
|
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
||||||
|
var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2);
|
||||||
|
}
|
||||||
|
double stdev = Math.sqrt(var / (double)totalGoodSites);
|
||||||
|
|
||||||
|
// print
|
||||||
|
out.println("\n\nHISTOGRAM_SECTION");
|
||||||
|
out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev);
|
||||||
|
|
||||||
|
for (int i = 1; i <= 5; i++)
|
||||||
|
out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev));
|
||||||
|
|
||||||
|
out.println("\ndepth count freq(percent)");
|
||||||
|
for (int i = 0; i <= maxDepth; i++)
|
||||||
|
out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)totalLoci);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void printDoCInfo(GenomeLoc loc, DoCInfo info, boolean printAverageCoverage) {
|
private void printDoCInfo(GenomeLoc loc, DoCInfo info, boolean printAverageCoverage) {
|
||||||
|
|
|
||||||
|
|
@ -1,126 +0,0 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.By;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
|
||||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
// Plot a histogram of depth of coverage
|
|
||||||
// j.maguire 6-11-2009
|
|
||||||
|
|
||||||
@By(DataSource.REFERENCE)
|
|
||||||
public class CoverageHistogram extends LocusWalker<Integer,Integer>
|
|
||||||
{
|
|
||||||
//@Argument(fullName="start", shortName="start", required=false, doc="start") public Integer START = 0;
|
|
||||||
|
|
||||||
// Private state.
|
|
||||||
|
|
||||||
//long[] coverage_hist;
|
|
||||||
ExpandingArrayList<Integer> coverage_hist = new ExpandingArrayList<Integer>();
|
|
||||||
int max_depth = 0;
|
|
||||||
|
|
||||||
long sum_coverage = 0;
|
|
||||||
long num_sites = 0;
|
|
||||||
|
|
||||||
/////////
|
|
||||||
// Walker Interface Functions
|
|
||||||
public void initialize()
|
|
||||||
{
|
|
||||||
//coverage_hist = new long[1000000];
|
|
||||||
//Arrays.fill(coverage_hist, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void incCov(int depth) {
|
|
||||||
int c = coverage_hist.expandingGet(depth, 0);
|
|
||||||
coverage_hist.set(depth, c + 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int getCov(int depth) {
|
|
||||||
return coverage_hist.get(depth);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context)
|
|
||||||
{
|
|
||||||
if (ref.getBase() == 'N') { return null; }
|
|
||||||
int depth = context.getReads().size();
|
|
||||||
incCov(depth);
|
|
||||||
if (depth > max_depth) { max_depth = depth; }
|
|
||||||
|
|
||||||
sum_coverage += depth;
|
|
||||||
num_sites += 1;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void onTraversalDone(Integer sum)
|
|
||||||
{
|
|
||||||
double mean_coverage = (double)sum_coverage / (double)num_sites;
|
|
||||||
double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - getCov(0)));
|
|
||||||
out.printf("# all_sites : mean:%f num_sites:%d%n", mean_coverage, num_sites);
|
|
||||||
out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - getCov(0));
|
|
||||||
|
|
||||||
// Code for calculting std devs adapted from Michael Melgar's python script
|
|
||||||
|
|
||||||
// Find the maximum extent of 'good' data
|
|
||||||
// First, find the mode
|
|
||||||
long maxValue = getCov(1); // ignore doc=0
|
|
||||||
int mode = 1;
|
|
||||||
for (int i = 2; i <= max_depth; i++) {
|
|
||||||
if (getCov(i) > maxValue) {
|
|
||||||
maxValue = getCov(i);
|
|
||||||
mode = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// now, procede to find end of good Gaussian fit
|
|
||||||
long dist = (long)Math.pow(10, 9);
|
|
||||||
while ( Math.abs(getCov(mode) - getCov(1)) < dist )
|
|
||||||
dist = Math.abs(getCov(mode++) - getCov(1));
|
|
||||||
int maxGoodDepth = mode + 1;
|
|
||||||
|
|
||||||
// calculate the mean of the good region
|
|
||||||
long totalGoodSites = 0, totalGoodDepth = 0;
|
|
||||||
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
|
||||||
totalGoodSites += getCov(i);
|
|
||||||
totalGoodDepth += i * getCov(i);
|
|
||||||
}
|
|
||||||
double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites;
|
|
||||||
|
|
||||||
// calculate the variance and standard deviation of the good region
|
|
||||||
double var = 0.0;
|
|
||||||
for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0
|
|
||||||
var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2);
|
|
||||||
}
|
|
||||||
double stdev = Math.sqrt(var / (double)totalGoodSites);
|
|
||||||
out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev);
|
|
||||||
|
|
||||||
for (int i = 1; i <= 5; i++)
|
|
||||||
out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev));
|
|
||||||
|
|
||||||
out.println("\ndepth count freq(percent)");
|
|
||||||
for (int i = 0; i <= max_depth; i++)
|
|
||||||
{
|
|
||||||
out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)num_sites);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer reduceInit()
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer reduce(Integer record, Integer sum)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// END Walker Interface Functions
|
|
||||||
/////////
|
|
||||||
}
|
|
||||||
|
|
@ -13,13 +13,14 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest {
|
||||||
private static String root = "-L 1:10,164,500-10,164,520 -R /broad/1KG/reference/human_b36_both.fasta -T DepthOfCoverage -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam";
|
private static String root = "-L 1:10,164,500-10,164,520 -R /broad/1KG/reference/human_b36_both.fasta -T DepthOfCoverage -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam";
|
||||||
static HashMap<String, String> expectations = new HashMap<String, String>();
|
static HashMap<String, String> expectations = new HashMap<String, String>();
|
||||||
static {
|
static {
|
||||||
expectations.put("-minMAPQ 1", "59c6071105a598e19f460640c35768c6");
|
expectations.put("-minMAPQ 1", "8b73fad5cce4620907d5da2a985219d5");
|
||||||
expectations.put("-minMAPQ 100", "e997fb5d61eaec21518722b0de90af20");
|
expectations.put("-minMAPQ 100", "1a959892d8ad0523dac2fb097eacb3c2");
|
||||||
expectations.put("-minDepth 8", "3e50afef0e751119cd27c324bdfae544");
|
expectations.put("-minDepth 8", "6e8c6b6d78962d110c87ad905fa5b664");
|
||||||
expectations.put("-minDepth 10", "d4c336d9e748347e1082bbc92d2489a3");
|
expectations.put("-minDepth 10", "14399e1237866540af3f1aee149030d0");
|
||||||
expectations.put("-bySample", "160ffa185dbfa8b0d2dc57f60f5b1e48");
|
expectations.put("-bySample", "93358437153b4d65bdff747e33de1d63");
|
||||||
expectations.put("-byRG", "dd3b4d040df7325dad4760ac6fa5252d");
|
expectations.put("-byRG", "777e8427eb4bdad300b23800cb7b0592");
|
||||||
expectations.put("-minMAPQ 1 -bySample -byRG", "bd2a07ef548b86e82ac6cce534225612");
|
expectations.put("-histogram", "96f15e1d9d598d48191e20ee84715d46");
|
||||||
|
expectations.put("-minMAPQ 1 -bySample -byRG -minDepth 8 -histogram", "783b0bc83c54d883efa8383a379ff17b");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
@ -41,7 +42,7 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
"-T DepthOfCoverage -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam -L 1:10,001,890-10,001,895 -o %s",
|
"-T DepthOfCoverage -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam -L 1:10,001,890-10,001,895 -o %s",
|
||||||
1, // just one output file
|
1, // just one output file
|
||||||
Arrays.asList("51203ba5ab928449cd01363af0b91510"));
|
Arrays.asList("a332d1539b29dff615b198818a3d4dd1"));
|
||||||
executeTest("testDepthOfCoverage454", spec);
|
executeTest("testDepthOfCoverage454", spec);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue