// Archived source: gatk-3.8/archive/java/src/org/broadinstitute/sting/oldepthofcoverage/DepthOfCoverageWalker.java
// (426 lines, 16 KiB, Java, executable file)

/*
* Copyright (c) 2009 The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.coverage;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.*;
import net.sf.samtools.SAMReadGroupRecord;
import java.util.*;
/**
 * Computes the depth of coverage at all loci in the specified region of the reference. Includes features to control
 * grouping of results by read group or by sample, or filtering loci with either a small number of overlapping alignments
 * or with alignments of poor mapping quality. Can optionally include individual base counts at each locus.
 */
@By(DataSource.REFERENCE)
public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoCInfo, DepthOfCoverageWalker.DoCInfo> {

    @Argument(fullName="suppressLocusPrinting", shortName="noLocus", doc="Suppress printing", required=false)
    public boolean suppressLocusPrinting = false;

    @Argument(fullName="suppressIntervalPrinting", shortName="noInterval", doc="Suppress printing", required=false)
    public boolean suppressIntervalPrinting = false;

    @Argument(fullName="printBaseCounts", shortName="bases", doc="Print individual base counts (A,C,G,T only)", required=false)
    protected boolean printBaseCounts = false;

    @Argument(fullName="minMAPQ", shortName="minMAPQ", doc="If provided, we will also list read counts with MAPQ >= this value at a locus in coverage", required=false)
    protected int excludeMAPQBelowThis = -1;

    @Argument(fullName="minBaseQualityScore", shortName="mbq", doc="Minimum base quality required to consider a base for calling", required=false)
    public Integer minBaseQ = -1;

    @Argument(fullName="minDepth", shortName="minDepth", doc="If provided, we will also list the percentage of loci with depth >= this value per interval", required=false)
    protected int minDepthForPercentage = -1;

    @Argument(fullName="byReadGroup", shortName="byRG", doc="List read depths for each read group")
    protected boolean byReadGroup = false;

    @Argument(fullName="bySample", shortName="bySample", doc="List read depths for each sample")
    protected boolean bySample = false;

    @Argument(fullName="printHistogram", shortName="histogram", doc="Print a histogram of the coverage")
    protected boolean printHistogram = false;

    // Read group and sample names taken from the BAM header, kept sorted so output columns are stable.
    private TreeSet<String> readGroupNames = new TreeSet<String>();
    private TreeSet<String> sampleNames = new TreeSet<String>();

    // Histogram data: coverageHist[d] = number of loci observed with exactly depth d.
    private ExpandingArrayList<Long> coverageHist = null;
    private long maxDepth = 0;
    private long totalLoci = 0;

    /** We want the engine to hand us reads spanning a deletion at the locus. */
    public boolean includeReadsWithDeletionAtLoci() { return true; }

    /**
     * Sets up the histogram storage, fixes the per-read-group / per-sample column sets from the
     * BAM header (so every output row has the same columns), and prints the per-locus header.
     */
    public void initialize() {
        if ( printHistogram ) {
            coverageHist = new ExpandingArrayList<Long>();
        }

        if ( byReadGroup ) {
            List<SAMReadGroupRecord> readGroups = this.getToolkit().getSAMFileHeader().getReadGroups();
            for ( SAMReadGroupRecord record : readGroups )
                readGroupNames.add(record.getReadGroupId());
        }

        if ( bySample ) {
            List<SAMReadGroupRecord> readGroups = this.getToolkit().getSAMFileHeader().getReadGroups();
            for ( SAMReadGroupRecord record : readGroups ) {
                // A read group is not required to declare a sample; skip those that don't.
                String sample = record.getSample();
                if ( sample != null )
                    sampleNames.add(sample);
            }
        }

        if ( !suppressLocusPrinting ) {
            out.println("\nPER_LOCUS_COVERAGE_SECTION");
            printHeaderLine(false);
        }
    }

    /**
     * Tallies the coverage statistics for a single locus, optionally prints the per-locus line,
     * and returns the tallies for interval-level reduction.
     *
     * @param tracker reference metadata (unused)
     * @param ref     the reference context (unused)
     * @param context the alignment pileup at this locus
     * @return the per-locus coverage tallies
     */
    public DoCInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        // Apply the base-quality filter up front; with the default minBaseQ of -1 nothing is removed.
        ReadBackedPileup pileup = context.getPileup().getBaseFilteredPileup(minBaseQ);
        DoCInfo info = new DoCInfo();
        info.totalCoverage = pileup.size();

        long nBadMAPQReads = 0, nDeletionReads = 0;
        for ( PileupElement p : pileup ) {
            // A low-MAPQ read is counted as "bad" even if it also spans a deletion.
            if ( excludeMAPQBelowThis > 0 && p.getRead().getMappingQuality() < excludeMAPQBelowThis )
                nBadMAPQReads++;
            else if ( p.isDeletion() )
                nDeletionReads++;

            if ( printBaseCounts ) {
                int baseIndex = BaseUtils.simpleBaseToBaseIndex(p.getBase());
                if ( baseIndex != -1 )  // ignore non-ACGT bases (e.g. N)
                    info.baseCounts[baseIndex]++;
            }

            // Reads without a read group cannot be attributed to a group or sample.
            SAMReadGroupRecord readGroup = p.getRead().getReadGroup();
            if ( readGroup == null )
                continue;

            if ( byReadGroup )
                incrementDepth(info.depthByReadGroup, readGroup.getReadGroupId());

            if ( bySample ) {
                String sample = readGroup.getSample();
                if ( sample != null )
                    incrementDepth(info.depthBySample, sample);
            }
        }

        info.numDeletions = nDeletionReads;
        if ( excludeMAPQBelowThis > 0 )
            info.numBadMQReads = nBadMAPQReads;

        if ( printHistogram )
            incCov(info.totalCoverage);

        if ( !suppressLocusPrinting )
            printDoCInfo(context.getLocation(), info, false);

        return info;
    }

    /**
     * Null-safely increments the depth recorded for the given key.
     * The per-locus maps are pre-populated from the BAM header, but a read whose group is
     * missing from the header would previously hit a NullPointerException on unboxing
     * {@code map.get(key)}; such reads now simply start a fresh counter instead.
     */
    private static void incrementDepth(Map<String, Long> depths, String key) {
        Long oldDepth = depths.get(key);
        depths.put(key, (oldDepth == null ? 0L : oldDepth) + 1);
    }

    public boolean isReduceByInterval() {
        return true;
    }

    public DoCInfo reduceInit() { return new DoCInfo(); }

    /**
     * Folds one locus' tallies into the running per-interval totals.
     *
     * @param value the per-locus tallies from map()
     * @param sum   the running interval totals
     * @return sum, updated in place
     */
    public DoCInfo reduce(DoCInfo value, DoCInfo sum) {
        sum.totalCoverage += value.totalCoverage;
        sum.numDeletions += value.numDeletions;
        sum.numBadMQReads += value.numBadMQReads;

        // NOTE(review): with the default minDepthForPercentage of -1 this condition is always true,
        // so minDepthCoveredLoci counts every locus. The value is only printed when the user set
        // minDepth >= 0, but DoCInfo.toString() reports it unconditionally (integration tests rely
        // on that output), so the behavior is deliberately left unchanged.
        if ( value.totalCoverage >= minDepthForPercentage ) {
            sum.minDepthCoveredLoci++;
        }

        if ( printBaseCounts ) {
            for (int baseIndex = 0; baseIndex < BaseUtils.BASES.length; baseIndex++ )
                sum.baseCounts[baseIndex] += value.baseCounts[baseIndex];
        }

        // Both maps are pre-populated with the header's names in the DoCInfo constructor,
        // so these lookups cannot return null for keys in readGroupNames/sampleNames.
        if ( byReadGroup ) {
            for ( String rg : readGroupNames ) {
                long oldDepth = sum.depthByReadGroup.get(rg);
                sum.depthByReadGroup.put(rg, oldDepth + value.depthByReadGroup.get(rg));
            }
        }

        if ( bySample ) {
            for ( String sample : sampleNames ) {
                long oldDepth = sum.depthBySample.get(sample);
                sum.depthBySample.put(sample, oldDepth + value.depthBySample.get(sample));
            }
        }

        return sum;
    }

    /** Prints the per-interval section (unless suppressed) and, if requested, the coverage histogram. */
    @Override
    public void onTraversalDone(List<Pair<GenomeLoc, DoCInfo>> results) {
        if ( ! suppressIntervalPrinting ) {
            out.println("\n\nPER_INTERVAL_COVERAGE_SECTION");
            printHeaderLine(true);
            for ( Pair<GenomeLoc, DoCInfo> result : results )
                printDoCInfo(result.first, result.second, true);
        }

        if ( printHistogram )
            printHisto();
    }

    /**
     * Prints the tab-separated column-header line; columns must stay in sync with printDoCInfo().
     *
     * @param printAverageCoverage true for the per-interval header (adds the average/percentage columns)
     */
    private void printHeaderLine(boolean printAverageCoverage) {
        StringBuilder header = new StringBuilder("location\ttotal_coverage");
        if ( printAverageCoverage )
            header.append("\taverage_coverage");
        header.append("\tcoverage_without_deletions");
        if ( printAverageCoverage )
            header.append("\taverage_coverage_without_deletions");
        if ( excludeMAPQBelowThis > 0 ) {
            header.append("\tcoverage_atleast_MQ");
            header.append(excludeMAPQBelowThis);
            if ( printAverageCoverage ) {
                header.append("\taverage_coverage_atleast_MQ");
                header.append(excludeMAPQBelowThis);
            }
        }
        if ( printAverageCoverage && minDepthForPercentage >= 0 ) {
            header.append("\tpercent_loci_covered_atleast_depth");
            header.append(minDepthForPercentage);
        }
        if ( printBaseCounts ) {
            header.append("\tA_count\tC_count\tG_count\tT_count");
        }
        if ( byReadGroup ) {
            for ( String rg : readGroupNames ) {
                header.append("\tcoverage_for_");
                header.append(rg);
            }
        }
        if ( bySample ) {
            for ( String sample : sampleNames ) {
                header.append("\tcoverage_for_");
                header.append(sample);
            }
        }
        out.println(header.toString());
    }

    /** Records one locus of the given depth in the coverage histogram. */
    private void incCov(long depth) {
        long c = coverageHist.expandingGet((int)depth, 0L);
        coverageHist.set((int)depth, c + 1);
        if ( depth > maxDepth )
            maxDepth = depth;
        totalLoci++;
    }

    /** Returns the number of loci observed with exactly the given depth. */
    private long getCov(long depth) {
        // Depths that were never observed may be stored as null by ExpandingArrayList
        // (presumably it pads gaps with null -- TODO confirm); treat them as zero
        // rather than risking a NullPointerException on unboxing.
        Long c = coverageHist.get((int)depth);
        return c == null ? 0L : c;
    }

    /**
     * Prints the coverage histogram plus a Gaussian fit of its "good" (modal) region.
     * Code for calculating std devs adapted from Michael Melgar's python script.
     */
    private void printHisto() {
        // sanity check
        if ( totalLoci == 0 )
            return;

        // Find the maximum extent of 'good' data.  First, find the mode (ignoring doc=0).
        long maxValue = getCov(1);
        int mode = 1;
        for (int i = 2; i <= maxDepth; i++) {
            if ( getCov(i) > maxValue ) {
                maxValue = getCov(i);
                mode = i;
            }
        }

        // Now, proceed rightward from the mode to find the end of the good Gaussian fit:
        // stop once the count stops getting closer to the depth-1 count.
        long dist = 1000000000L;  // exact constant; was (long)Math.pow(10, 9)
        while ( Math.abs(getCov(mode) - getCov(1)) < dist && mode < maxDepth )
            dist = Math.abs(getCov(mode++) - getCov(1));
        long maxGoodDepth = Math.min(mode + 1, maxDepth);

        // Calculate the mean of the good region (doc=0 excluded).
        long totalGoodSites = 0, totalGoodDepth = 0;
        for (int i = 1; i <= maxGoodDepth; i++) {
            totalGoodSites += getCov(i);
            totalGoodDepth += i * getCov(i);
        }
        // If every locus had zero coverage there is no region to fit; previously this
        // divided 0/0 and printed NaN statistics.
        if ( totalGoodSites == 0 )
            return;
        double meanGoodDepth = (double)totalGoodDepth / (double)totalGoodSites;

        // Calculate the variance and standard deviation of the good region.
        double var = 0.0;
        for (int i = 1; i <= maxGoodDepth; i++) {
            var += getCov(i) * Math.pow(meanGoodDepth - (double)i, 2);
        }
        double stdev = Math.sqrt(var / (double)totalGoodSites);

        out.println("\n\nHISTOGRAM_SECTION");
        out.printf("# sites within Gaussian fit : mean:%f num_sites:%d std_dev:%f%n", meanGoodDepth, totalGoodSites, stdev);
        for (int i = 1; i <= 5; i++)
            out.printf("# Gaussian mean + %d Std Dev : %f%n", i, (meanGoodDepth + i*stdev));
        out.println("\ndepth count freq(percent)");
        for (int i = 0; i <= maxDepth; i++)
            out.printf("%d %d %f%n", i, getCov(i), (100.0*getCov(i)) / (double)totalLoci);  // %n for consistency with the printfs above
    }

    /**
     * Prints one tab-separated data row; columns must stay in sync with printHeaderLine().
     *
     * @param loc                  the locus or interval the row describes
     * @param info                 the tallies to print
     * @param printAverageCoverage true for per-interval rows (adds the average/percentage columns)
     */
    private void printDoCInfo(GenomeLoc loc, DoCInfo info, boolean printAverageCoverage) {
        double totalBases = (double)(loc.getStop() - loc.getStart() + 1);

        StringBuilder sb = new StringBuilder();
        sb.append(loc);
        sb.append("\t");
        sb.append(info.totalCoverage);
        sb.append("\t");
        if ( printAverageCoverage ) {
            sb.append(String.format("%.2f", ((double)info.totalCoverage) / totalBases));
            sb.append("\t");
        }
        sb.append((info.totalCoverage - info.numDeletions));
        if ( printAverageCoverage ) {
            sb.append("\t");
            sb.append(String.format("%.2f", ((double)(info.totalCoverage - info.numDeletions)) / totalBases));
        }
        if ( excludeMAPQBelowThis > 0 ) {
            sb.append("\t");
            sb.append((info.totalCoverage - info.numBadMQReads));
            if ( printAverageCoverage ) {
                sb.append("\t");
                sb.append(String.format("%.2f", ((double)(info.totalCoverage - info.numBadMQReads)) / totalBases));
            }
        }
        if ( printAverageCoverage && minDepthForPercentage >= 0 ) {
            // NOTE(review): this prints the covered-loci *fraction* (0-1) although the header
            // calls the column a percentage; left as-is to preserve existing output.
            sb.append("\t");
            sb.append(String.format("%.2f", ((double)info.minDepthCoveredLoci) / totalBases));
        }
        if ( printBaseCounts ) {
            for (int baseIndex = 0; baseIndex < BaseUtils.BASES.length; baseIndex++ ) {
                sb.append("\t");
                sb.append(String.format("%8d", info.baseCounts[baseIndex]));
            }
        }
        if ( byReadGroup ) {
            for ( String rg : readGroupNames ) {
                sb.append("\t");
                sb.append(String.format("%8d", info.depthByReadGroup.get(rg)));
            }
        }
        if ( bySample ) {
            for ( String sample : sampleNames ) {
                sb.append("\t");
                sb.append(String.format("%8d", info.depthBySample.get(sample)));
            }
        }

        out.println(sb.toString());
    }

    /**
     * Per-locus (and, after reduction, per-interval) coverage tallies.
     * Intentionally a non-static inner class: its constructor reads the walker's
     * argument flags and the header-derived read-group/sample name sets.
     */
    public class DoCInfo {
        public long totalCoverage = 0;
        public long numDeletions = 0;
        public long numBadMQReads = 0;
        public long minDepthCoveredLoci = 0;

        // Indexed by BaseUtils base index (A,C,G,T); null unless printBaseCounts is set.
        public long[] baseCounts = null;

        // Null unless byReadGroup / bySample is set; pre-populated with zeros for all header names.
        public HashMap<String, Long> depthByReadGroup = null;
        public HashMap<String, Long> depthBySample = null;

        public DoCInfo() {
            if ( printBaseCounts ) {
                baseCounts = new long[4];
            }
            if ( byReadGroup ) {
                depthByReadGroup = new HashMap<String, Long>();
                for ( String readGroupName : readGroupNames )
                    depthByReadGroup.put(readGroupName, 0L);
            }
            if ( bySample ) {
                depthBySample = new HashMap<String, Long>();
                for ( String sample : sampleNames )
                    depthBySample.put(sample, 0L);
            }
        }

        @Override
        public String toString() {
            // This is an executive summary, included mainly so that integration tests will pass.
            // TODO: Add a more compelling summary.
            return String.format("Summary: total coverage = %d; # of deletions = %d; # of bad mapping quality reads = %d; minimum covered depth =%d",
                    totalCoverage,
                    numDeletions,
                    numBadMQReads,
                    minDepthCoveredLoci);
        }
    }
}