/* * Copyright (c) 2009 The Broad Institute * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.gatk.walkers.coverage; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.pileup.*; import net.sf.samtools.SAMReadGroupRecord; import java.util.*; /** * Computes the depth of coverage at all loci in the specified region of the reference. Includes features to control * grouping of results by read group or by sample, or filtering loci with either a small number of overlapping alignments * or with alignments of poor mapping quality. Can optionally include individual base counts at each locus. */ @By(DataSource.REFERENCE) public class DepthOfCoverageWalker extends LocusWalker { @Argument(fullName="suppressLocusPrinting", shortName= "noLocus", doc="Suppress printing", required=false) public boolean suppressLocusPrinting = false; @Argument(fullName="suppressIntervalPrinting", shortName= "noInterval", doc="Suppress printing", required=false) public boolean suppressIntervalPrinting = false; @Argument(fullName="printBaseCounts", shortName ="bases", doc="Print individual base counts (A,C,G,T only)", required=false) protected boolean printBaseCounts = false; @Argument(fullName="minMAPQ", shortName ="minMAPQ", doc="If provided, we will also list read counts with MAPQ >= this value at a locus in coverage",required=false) protected int excludeMAPQBelowThis = -1; @Argument(fullName = "minBaseQualityScore", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) public Integer minBaseQ = -1; @Argument(fullName="minDepth", shortName ="minDepth", doc="If provided, we will also list the percentage of loci with depth >= this value per interval",required=false) protected int minDepthForPercentage = -1; @Argument(fullName="byReadGroup", shortName="byRG", doc="List read depths for each read group") protected boolean byReadGroup = false; @Argument(fullName="bySample", shortName="bySample", doc="List read depths for each sample") protected boolean bySample = false; @Argument(fullName="printHistogram", shortName="histogram", doc="Print a histogram of the coverage") protected boolean printHistogram = false; // keep track of the read group and sample names private TreeSet readGroupNames = new TreeSet(); private TreeSet sampleNames = new TreeSet(); // keep track of the histogram data private ExpandingArrayList coverageHist = null; private long maxDepth = 0; private long totalLoci = 0; // we want to see reads with deletions public boolean includeReadsWithDeletionAtLoci() { return true; } public void initialize() { // initialize histogram array if ( printHistogram ) { coverageHist = new ExpandingArrayList(); } // initialize read group names from BAM header if ( byReadGroup ) { List readGroups = this.getToolkit().getSAMFileHeader().getReadGroups(); for ( SAMReadGroupRecord record : readGroups ) readGroupNames.add(record.getReadGroupId()); } // initialize sample names from BAM header if ( bySample ) { List readGroups = this.getToolkit().getSAMFileHeader().getReadGroups(); for ( SAMReadGroupRecord record : readGroups ) { String sample = record.getSample(); if ( sample != null ) sampleNames.add(sample); } } // build and print the per-locus header if ( !suppressLocusPrinting ) { out.println("\nPER_LOCUS_COVERAGE_SECTION"); printHeaderLine(false); } } public DoCInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { // fill in and print all of the per-locus coverage data, then return it to reduce ReadBackedPileup pileup = context.getPileup().getBaseFilteredPileup(minBaseQ); DoCInfo info = new DoCInfo(); info.totalCoverage = pileup.size(); long nBadMAPQReads = 0, nDeletionReads = 0; for ( PileupElement p : pileup ) { if ( excludeMAPQBelowThis > 0 && p.getRead().getMappingQuality() < excludeMAPQBelowThis ) nBadMAPQReads++; else if ( p.isDeletion() ) nDeletionReads++; if ( printBaseCounts ) { int baseIndex = BaseUtils.simpleBaseToBaseIndex(p.getBase()); if ( baseIndex != -1 ) info.baseCounts[baseIndex]++; } SAMReadGroupRecord readGroup = p.getRead().getReadGroup(); if ( readGroup == null ) continue; if ( byReadGroup ) { String readGroupName = readGroup.getReadGroupId(); long oldDepth = info.depthByReadGroup.get(readGroupName); info.depthByReadGroup.put(readGroupName, oldDepth + 1); } if ( bySample ) { String sample = readGroup.getSample(); if ( sample != null ) { long oldDepth = info.depthBySample.get(sample); info.depthBySample.put(sample, oldDepth + 1); } } } info.numDeletions = nDeletionReads; if ( excludeMAPQBelowThis > 0 ) info.numBadMQReads = nBadMAPQReads; // if we need to print the histogram, fill in the data if ( printHistogram ) incCov(info.totalCoverage); if ( !suppressLocusPrinting ) printDoCInfo(context.getLocation(), info, false); return info; } public boolean isReduceByInterval() { return true; } public DoCInfo reduceInit() { return new DoCInfo(); } public DoCInfo reduce(DoCInfo value, DoCInfo sum) { // combine all of the per-locus data for a given interval sum.totalCoverage += value.totalCoverage; sum.numDeletions += value.numDeletions; sum.numBadMQReads += value.numBadMQReads; if ( value.totalCoverage >= minDepthForPercentage ) { sum.minDepthCoveredLoci++; } if ( printBaseCounts ) { for (int baseIndex = 0; baseIndex < BaseUtils.BASES.length; baseIndex++ ) sum.baseCounts[baseIndex] += value.baseCounts[baseIndex]; } if ( byReadGroup ) { for ( String rg : readGroupNames ) { long oldDepth = sum.depthByReadGroup.get(rg); sum.depthByReadGroup.put(rg, oldDepth + value.depthByReadGroup.get(rg)); } } if ( bySample ) { for ( String sample : sampleNames ) { long oldDepth = sum.depthBySample.get(sample); sum.depthBySample.put(sample, oldDepth + value.depthBySample.get(sample)); } } return sum; } @Override public void onTraversalDone(List> results) { // build and print the per-interval header if ( ! suppressIntervalPrinting ) { out.println("\n\nPER_INTERVAL_COVERAGE_SECTION"); printHeaderLine(true); // print all of the individual per-interval coverage data for ( Pair result : results ) printDoCInfo(result.first, result.second, true); } // if we need to print the histogram, do so now if ( printHistogram ) printHisto(); } private void printHeaderLine(boolean printAverageCoverage) { StringBuilder header = new StringBuilder("location\ttotal_coverage"); if ( printAverageCoverage ) header.append("\taverage_coverage"); header.append("\tcoverage_without_deletions"); if ( printAverageCoverage ) header.append("\taverage_coverage_without_deletions"); if ( excludeMAPQBelowThis > 0 ) { header.append("\tcoverage_atleast_MQ"); header.append(excludeMAPQBelowThis); if ( printAverageCoverage ) { header.append("\taverage_coverage_atleast_MQ"); header.append(excludeMAPQBelowThis); } } if ( printAverageCoverage && minDepthForPercentage >= 0 ) { header.append("\tpercent_loci_covered_atleast_depth"); header.append(minDepthForPercentage); } if ( printBaseCounts ) { header.append("\tA_count\tC_count\tG_count\tT_count"); } if ( byReadGroup ) { for ( String rg : readGroupNames ) { header.append("\tcoverage_for_"); header.append(rg); } } if ( bySample ) { for ( String sample : sampleNames ) { header.append("\tcoverage_for_"); header.append(sample); } } out.println(header.toString()); } private void incCov(long depth) { long c = coverageHist.expandingGet((int)depth, 0L); coverageHist.set((int)depth, c + 1); if ( depth > maxDepth ) maxDepth = depth; totalLoci++; } private long getCov(long depth) { return coverageHist.get((int)depth); } private void printHisto() { // sanity check if ( totalLoci == 0 ) return; // Code for calculting std devs adapted from Michael Melgar's python script // Find the maximum extent of 'good' data // First, find the mode long maxValue = getCov(1); // ignore doc=0 int mode = 1; for (int i = 2; i <= maxDepth; i++) { if ( getCov(i) > maxValue ) { maxValue = getCov(i); mode = i; } } // now, procede to find end of good Gaussian fit long dist = (long)Math.pow(10, 9); while ( Math.abs(getCov(mode) - getCov(1)) < dist && mode < maxDepth ) dist = Math.abs(getCov(mode++) - getCov(1)); long maxGoodDepth = Math.min(mode + 1, maxDepth); // calculate the mean of the good region long totalGoodSites = 0, totalGoodDepth = 0; for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0 totalGoodSites += getCov(i); totalGoodDepth += i * getCov(i); } double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites; // calculate the variance and standard deviation of the good region double var = 0.0; for (int i = 1; i <= maxGoodDepth; i++) { // ignore doc=0 var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2); } double stdev = Math.sqrt(var / (double)totalGoodSites); // print out.println("\n\nHISTOGRAM_SECTION"); out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev); for (int i = 1; i <= 5; i++) out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev)); out.println("\ndepth count freq(percent)"); for (int i = 0; i <= maxDepth; i++) out.printf("%d %d %f\n", i, getCov(i), (100.0*getCov(i)) / (double)totalLoci); } private void printDoCInfo(GenomeLoc loc, DoCInfo info, boolean printAverageCoverage) { double totalBases = (double)(loc.getStop() - loc.getStart() + 1); StringBuilder sb = new StringBuilder(); sb.append(loc); sb.append("\t"); sb.append(info.totalCoverage); sb.append("\t"); if ( printAverageCoverage ) { sb.append(String.format("%.2f", ((double)info.totalCoverage) / totalBases)); sb.append("\t"); } sb.append((info.totalCoverage - info.numDeletions)); if ( printAverageCoverage ) { sb.append("\t"); sb.append(String.format("%.2f", ((double)(info.totalCoverage - info.numDeletions)) / totalBases)); } if ( excludeMAPQBelowThis > 0 ) { sb.append("\t"); sb.append((info.totalCoverage - info.numBadMQReads)); if ( printAverageCoverage ) { sb.append("\t"); sb.append(String.format("%.2f", ((double)(info.totalCoverage - info.numBadMQReads)) / totalBases)); } } if ( printAverageCoverage && minDepthForPercentage >= 0 ) { sb.append("\t"); sb.append(String.format("%.2f", ((double)info.minDepthCoveredLoci) / totalBases)); } if ( printBaseCounts ) { for (int baseIndex = 0; baseIndex < BaseUtils.BASES.length; baseIndex++ ) { sb.append("\t"); sb.append(String.format("%8d", info.baseCounts[baseIndex])); } } if ( byReadGroup ) { for ( String rg : readGroupNames ) { sb.append("\t"); sb.append(String.format("%8d", info.depthByReadGroup.get(rg))); } } if ( bySample ) { for ( String sample : sampleNames ) { sb.append("\t"); sb.append(String.format("%8d", info.depthBySample.get(sample))); } } out.println(sb.toString()); } public class DoCInfo { public long totalCoverage = 0; public long numDeletions = 0; public long numBadMQReads = 0; public long minDepthCoveredLoci = 0; public long[] baseCounts = null; public HashMap depthByReadGroup = null; public HashMap depthBySample = null; public DoCInfo() { if ( printBaseCounts ) { baseCounts = new long[4]; } if ( byReadGroup ) { depthByReadGroup = new HashMap(); for ( String readGroupName : readGroupNames ) depthByReadGroup.put(readGroupName, 0L); } if ( bySample ) { depthBySample = new HashMap(); for ( String sample : sampleNames ) depthBySample.put(sample, 0L); } } @Override public String toString() { // This is an executive summary, included mainly so that integration tests will pass. // TODO: Add a more compelling summary. return String.format("Summary: total coverage = %d; # of deletions = %d; # of bad mapping quality reads = %d; minimum covered depth =%d", totalCoverage, numDeletions, numBadMQReads, minDepthCoveredLoci); } } }