From 01cf5cc74172acf0f7cef96ad45c22d8059bf99c Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 3 Dec 2009 17:01:53 +0000 Subject: [PATCH] 1. Merged CoverageHistogram into DepthOfCoverageWalker 2. Fixed bug in histogram calculation for small intervals 3. Better output in DoCWalker 4. Comments added to code git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2245 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/DepthOfCoverageWalker.java | 102 +++++++++++++- .../gatk/walkers/CoverageHistogram.java | 126 ------------------ .../DepthOfCoverageIntegrationTest.java | 17 +-- 3 files changed, 110 insertions(+), 135 deletions(-) delete mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageWalker.java index 4b3ad9ce1..cb1ab1cf0 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageWalker.java @@ -60,20 +60,37 @@ public class DepthOfCoverageWalker extends LocusWalker readGroupNames = new TreeSet(); private TreeSet sampleNames = new TreeSet(); + // keep track of the histogram data + private ExpandingArrayList coverageHist = null; + private int maxDepth = 0; + private int totalLoci = 0; + + // we want to see reads with deletions public boolean includeReadsWithDeletionAtLoci() { return true; } public void initialize() { + // initialize histogram array + if ( printHistogram ) { + coverageHist = new ExpandingArrayList(); + } + + // initialize read group names from BAM header if ( byReadGroup ) { List readGroups = this.getToolkit().getSAMFileHeader().getReadGroups(); for ( SAMReadGroupRecord record : readGroups ) readGroupNames.add(record.getReadGroupId()); } + // initialize sample names from BAM header if ( bySample ) { List readGroups = this.getToolkit().getSAMFileHeader().getReadGroups(); for ( SAMReadGroupRecord record : readGroups ) { @@ -83,6 +100,8 @@ public class DepthOfCoverageWalker extends LocusWalker 0 ) { header.append("\tcoverage_atleast_MQ"); @@ -105,6 +124,8 @@ public class DepthOfCoverageWalker extends LocusWalker 0 ) info.numBadMQReads = nBadMAPQReads; + // if we need to print the histogram, fill in the data + if ( printHistogram ) + incCov(info.totalCoverage); + printDoCInfo(context.getLocation(), info, false); return info; @@ -153,6 +178,9 @@ public class DepthOfCoverageWalker extends LocusWalker> results) { - StringBuilder header = new StringBuilder("\nlocation\ttotal_coverage\taverage_coverage\tcoverage_without_deletions\taverage_coverage_without_deletions"); + // build and print the per-interval header + out.println("\n\nPER_INTERVAL_COVERAGE_SECTION"); + + StringBuilder header = new StringBuilder("location\ttotal_coverage\taverage_coverage\tcoverage_without_deletions\taverage_coverage_without_deletions"); if ( excludeMAPQBelowThis > 0 ) { header.append("\tcoverage_atleast_MQ"); header.append(excludeMAPQBelowThis); @@ -203,8 +234,77 @@ public class DepthOfCoverageWalker extends LocusWalker result : results ) printDoCInfo(result.first, result.second, true); + + // if we need to print the histogram, do so now + if ( printHistogram ) + printHisto(); + } + + private void incCov(int depth) { + int c = coverageHist.expandingGet(depth, 0); + coverageHist.set(depth, c + 1); + if ( depth > maxDepth ) + maxDepth = depth; + totalLoci++; + } + + private int getCov(int depth) { + return coverageHist.get(depth); + } + + private void printHisto() { + + // sanity check + if ( totalLoci == 0 ) + return; + + // Code for calculting std devs adapted from Michael Melgar's python script + + // Find the maximum extent of 'good' data + // First, find the mode + long maxValue = getCov(1); // ignore doc=0 + int mode = 1; + for (int i = 2; i <= maxDepth; i++) { + if ( getCov(i) > maxValue ) { + maxValue = getCov(i); + mode = i; + } + } + + // now, procede to find end of good Gaussian fit + long dist = (long)Math.pow(10, 9); + while ( Math.abs(getCov(mode) - getCov(1)) < dist && mode < maxDepth ) + dist = Math.abs(getCov(mode++) - getCov(1)); + int maxGoodDepth = Math.min(mode + 1, maxDepth); + + // calculate the mean of the good region + long totalGoodSites = 0, totalGoodDepth = 0; + for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0 + totalGoodSites += getCov(i); + totalGoodDepth += i * getCov(i); + } + double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites; + + // calculate the variance and standard deviation of the good region + double var = 0.0; + for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0 + var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2); + } + double stdev = Math.sqrt(var / (double)totalGoodSites); + + // print + out.println("\n\nHISTOGRAM_SECTION"); + out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev); + + for (int i = 1; i <= 5; i++) + out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev)); + + out.println("\ndepth count freq(percent)"); + for (int i = 0; i <= maxDepth; i++) + out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)totalLoci); } private void printDoCInfo(GenomeLoc loc, DoCInfo info, boolean printAverageCoverage) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java deleted file mode 100644 index 0d9a3d2c7..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java +++ /dev/null @@ -1,126 +0,0 @@ - -package org.broadinstitute.sting.playground.gatk.walkers; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.utils.ExpandingArrayList; - -import java.util.*; - -// Plot a histogram of depth of coverage -// j.maguire 6-11-2009 - -@By(DataSource.REFERENCE) -public class CoverageHistogram extends LocusWalker -{ - //@Argument(fullName="start", shortName="start", required=false, doc="start") public Integer START = 0; - - // Private state. - - //long[] coverage_hist; - ExpandingArrayList coverage_hist = new ExpandingArrayList(); - int max_depth = 0; - - long sum_coverage = 0; - long num_sites = 0; - - ///////// - // Walker Interface Functions - public void initialize() - { - //coverage_hist = new long[1000000]; - //Arrays.fill(coverage_hist, 0); - } - - private void incCov(int depth) { - int c = coverage_hist.expandingGet(depth, 0); - coverage_hist.set(depth, c + 1); - } - - private int getCov(int depth) { - return coverage_hist.get(depth); - } - - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) - { - if (ref.getBase() == 'N') { return null; } - int depth = context.getReads().size(); - incCov(depth); - if (depth > max_depth) { max_depth = depth; } - - sum_coverage += depth; - num_sites += 1; - - return 0; - } - - public void onTraversalDone(Integer sum) - { - double mean_coverage = (double)sum_coverage / (double)num_sites; - double mean_good_coverage = (double)sum_coverage / ((double)(num_sites - getCov(0))); - out.printf("# all_sites : mean:%f num_sites:%d%n", mean_coverage, num_sites); - out.printf("# sites with at least 1 read : mean:%f num_sites:%d%n", mean_good_coverage, num_sites - getCov(0)); - - // Code for calculting std devs adapted from Michael Melgar's python script - - // Find the maximum extent of 'good' data - // First, find the mode - long maxValue = getCov(1); // ignore doc=0 - int mode = 1; - for (int i = 2; i <= max_depth; i++) { - if (getCov(i) > maxValue) { - maxValue = getCov(i); - mode = i; - } - } - // now, procede to find end of good Gaussian fit - long dist = (long)Math.pow(10, 9); - while ( Math.abs(getCov(mode) - getCov(1)) < dist ) - dist = Math.abs(getCov(mode++) - getCov(1)); - int maxGoodDepth = mode + 1; - - // calculate the mean of the good region - long totalGoodSites = 0, totalGoodDepth = 0; - for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0 - totalGoodSites += getCov(i); - totalGoodDepth += i * getCov(i); - } - double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites; - - // calculate the variance and standard deviation of the good region - double var = 0.0; - for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0 - var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2); - } - double stdev = Math.sqrt(var / (double)totalGoodSites); - out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev); - - for (int i = 1; i <= 5; i++) - out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev)); - - out.println("\ndepth count freq(percent)"); - for (int i = 0; i <= max_depth; i++) - { - out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)num_sites); - } - - } - - public Integer reduceInit() - { - return 0; - } - - public Integer reduce(Integer record, Integer sum) - { - return 0; - } - - // END Walker Interface Functions - ///////// -} diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageIntegrationTest.java index e56772722..5c7b4edde 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageIntegrationTest.java @@ -13,13 +13,14 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { private static String root = "-L 1:10,164,500-10,164,520 -R /broad/1KG/reference/human_b36_both.fasta -T DepthOfCoverage -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam"; static HashMap expectations = new HashMap(); static { - expectations.put("-minMAPQ 1", "59c6071105a598e19f460640c35768c6"); - expectations.put("-minMAPQ 100", "e997fb5d61eaec21518722b0de90af20"); - expectations.put("-minDepth 8", "3e50afef0e751119cd27c324bdfae544"); - expectations.put("-minDepth 10", "d4c336d9e748347e1082bbc92d2489a3"); - expectations.put("-bySample", "160ffa185dbfa8b0d2dc57f60f5b1e48"); - expectations.put("-byRG", "dd3b4d040df7325dad4760ac6fa5252d"); - expectations.put("-minMAPQ 1 -bySample -byRG", "bd2a07ef548b86e82ac6cce534225612"); + expectations.put("-minMAPQ 1", "8b73fad5cce4620907d5da2a985219d5"); + expectations.put("-minMAPQ 100", "1a959892d8ad0523dac2fb097eacb3c2"); + expectations.put("-minDepth 8", "6e8c6b6d78962d110c87ad905fa5b664"); + expectations.put("-minDepth 10", "14399e1237866540af3f1aee149030d0"); + expectations.put("-bySample", "93358437153b4d65bdff747e33de1d63"); + expectations.put("-byRG", "777e8427eb4bdad300b23800cb7b0592"); + expectations.put("-histogram", "96f15e1d9d598d48191e20ee84715d46"); + expectations.put("-minMAPQ 1 -bySample -byRG -minDepth 8 -histogram", "783b0bc83c54d883efa8383a379ff17b"); } @Test @@ -41,7 +42,7 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T DepthOfCoverage -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam -L 1:10,001,890-10,001,895 -o %s", 1, // just one output file - Arrays.asList("51203ba5ab928449cd01363af0b91510")); + Arrays.asList("a332d1539b29dff615b198818a3d4dd1")); executeTest("testDepthOfCoverage454", spec); } } \ No newline at end of file