gatk-3.8/lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java

155 lines
6.6 KiB
Java

/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.sam;
import java.io.File;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.metrics.MetricsFile;
import edu.mit.broad.picard.util.Histogram;
import edu.mit.broad.picard.util.Log;
import edu.mit.broad.picard.util.RExecutor;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.sam.SAMRecord;
import edu.mit.broad.sam.util.CloseableIterator;
/**
* Command line program to read non-duplicate insert sizes, create a histogram
* and report distribution statistics.
*
* @author Doug Voet
*/
public class CollectInsertSizeMetrics extends CommandLineProgram {
    private static final Log log = Log.getInstance(CollectInsertSizeMetrics.class);

    /** Classpath location of the R script used to plot the insert size histogram. */
    private static final String HISTOGRAM_R_SCRIPT = "edu/mit/broad/picard/sam/insertSizeHistogram.R";

    // Usage and parameters
    @Usage(programVersion="1.0")
    public String USAGE = "Reads a SAM or BAM file and writes a file containing metrics about " +
            "the statistical distribution of insert size (excluding duplicates) " +
            "and generates a histogram plot.\n";

    @Option(shortName="I", doc="SAM or BAM file") public File INPUT;
    @Option(shortName="O", doc="File to write insert size metrics to") public File OUTPUT;
    @Option(shortName="H", doc="File to write insert size histogram chart to") public File HISTOGRAM_FILE;

    /** Required main method implementation. */
    public static void main(String[] argv) {
        System.exit(new CollectInsertSizeMetrics().instanceMain(argv));
    }

    /**
     * Collects insert size metrics from {@link #INPUT}, writes them to {@link #OUTPUT} and,
     * when at least one read pair was counted, runs the R script that plots the histogram
     * into {@link #HISTOGRAM_FILE}.
     *
     * @return 0 on success
     * @throws PicardException if the R script exits with a non-zero return code
     */
    @Override
    protected int doWork() {
        IoUtil.assertFileIsReadable(INPUT);
        IoUtil.assertFileIsWritable(OUTPUT);
        IoUtil.assertFileIsWritable(HISTOGRAM_FILE);

        final SAMFileReader in = new SAMFileReader(INPUT);
        final MetricsFile<InsertSizeMetrics, Integer> file;
        try {
            file = collectMetrics(in.iterator());
        } finally {
            // Close the reader even when iteration fails so the input file handle is not leaked.
            in.close();
        }

        file.write(OUTPUT);

        if (file.getMetrics().get(0).READ_PAIRS == 0) {
            // Nothing to plot; the metrics file has still been written above.
            log.warn("Input file did not contain any records with insert size information.");
        } else {
            int rResult = RExecutor.executeFromClasspath(
                    HISTOGRAM_R_SCRIPT,
                    OUTPUT.getAbsolutePath(),
                    HISTOGRAM_FILE.getAbsolutePath(),
                    INPUT.getName());
            if (rResult != 0) {
                throw new PicardException("R script " + HISTOGRAM_R_SCRIPT + " failed with return code " + rResult);
            }
        }
        return 0;
    }

    /**
     * Does all the work of iterating through the sam file and collecting insert size metrics.
     *
     * Builds a histogram of absolute inferred insert sizes over every record that
     * {@link #skipRecord(SAMRecord)} does not reject, then derives the summary statistics
     * and the WIDTH_OF_xx_PERCENT values from that histogram.
     *
     * @param samIterator iterator over the records to examine; it is consumed but not closed here
     * @return a metrics file holding the histogram and a single InsertSizeMetrics row
     */
    MetricsFile<InsertSizeMetrics, Integer> collectMetrics(CloseableIterator<SAMRecord> samIterator) {
        Histogram<Integer> insertSizeHistogram = new Histogram<Integer>("insert_size", "count");
        while (samIterator.hasNext()) {
            SAMRecord record = samIterator.next();
            if (skipRecord(record)) {
                continue;
            }
            // The sign of the inferred insert size is discarded; only the magnitude is counted.
            int insertSize = Math.abs(record.getInferredInsertSize());
            insertSizeHistogram.increment(insertSize);
        }

        MetricsFile<InsertSizeMetrics, Integer> file = new MetricsFile<InsertSizeMetrics, Integer>();
        file.setHistogram(insertSizeHistogram);

        InsertSizeMetrics metrics = new InsertSizeMetrics();
        metrics.READ_PAIRS = (long) insertSizeHistogram.getCount();
        metrics.MAX_INSERT_SIZE = (int) insertSizeHistogram.getMax();
        metrics.MIN_INSERT_SIZE = (int) insertSizeHistogram.getMin();
        metrics.MEAN_INSERT_SIZE = insertSizeHistogram.getMean();
        metrics.STANDARD_DEVIATION = insertSizeHistogram.getStandardDeviation();
        metrics.MEDIAN_INSERT_SIZE = insertSizeHistogram.getMedian();

        final double total = insertSizeHistogram.getCount();
        final double median = insertSizeHistogram.getMedian();
        // The histogram is not modified below, so its bounds can be read once up front.
        final double min = insertSizeHistogram.getMin();
        final double max = insertSizeHistogram.getMax();

        // Walk outwards from the median one unit per iteration in both directions, accumulating
        // the bin counts, and record the width of the window the first time each coverage
        // threshold is reached. A WIDTH_OF_xx_PERCENT value of 0 is treated as "not yet recorded".
        double covered = 0;
        double low = median;
        double high = median;
        while (low >= min || high <= max) {
            // Bin keys are integers, so the (possibly fractional) positions are truncated.
            Histogram<Integer>.Bin lowBin = insertSizeHistogram.get((int) low);
            if (lowBin != null) covered += lowBin.getValue();

            // On the first iteration low == high; avoid counting the median bin twice.
            if (low != high) {
                Histogram<Integer>.Bin highBin = insertSizeHistogram.get((int) high);
                if (highBin != null) covered += highBin.getValue();
            }

            double percentCovered = covered / total;
            int distance = (int) (high - low) + 1;
            if (percentCovered >= 0.1 && metrics.WIDTH_OF_10_PERCENT == 0) metrics.WIDTH_OF_10_PERCENT = distance;
            if (percentCovered >= 0.2 && metrics.WIDTH_OF_20_PERCENT == 0) metrics.WIDTH_OF_20_PERCENT = distance;
            if (percentCovered >= 0.3 && metrics.WIDTH_OF_30_PERCENT == 0) metrics.WIDTH_OF_30_PERCENT = distance;
            if (percentCovered >= 0.4 && metrics.WIDTH_OF_40_PERCENT == 0) metrics.WIDTH_OF_40_PERCENT = distance;
            if (percentCovered >= 0.5 && metrics.WIDTH_OF_50_PERCENT == 0) metrics.WIDTH_OF_50_PERCENT = distance;
            if (percentCovered >= 0.6 && metrics.WIDTH_OF_60_PERCENT == 0) metrics.WIDTH_OF_60_PERCENT = distance;
            if (percentCovered >= 0.7 && metrics.WIDTH_OF_70_PERCENT == 0) metrics.WIDTH_OF_70_PERCENT = distance;
            if (percentCovered >= 0.8 && metrics.WIDTH_OF_80_PERCENT == 0) metrics.WIDTH_OF_80_PERCENT = distance;
            if (percentCovered >= 0.9 && metrics.WIDTH_OF_90_PERCENT == 0) metrics.WIDTH_OF_90_PERCENT = distance;
            if (percentCovered >= 0.99 && metrics.WIDTH_OF_99_PERCENT == 0) metrics.WIDTH_OF_99_PERCENT = distance;

            --low;
            ++high;
        }

        file.addMetric(metrics);
        return file;
    }

    /**
     * Figures out whether or not the record should be included in the counting of insert sizes.
     *
     * A record is skipped when it is unpaired, its mate is unmapped, it is the first read of
     * its pair, it is not a primary alignment, it is flagged as a duplicate, or its inferred
     * insert size is zero.
     * NOTE(review): skipping first-of-pair reads presumably ensures each pair is counted once
     * (via its second read) - confirm.
     *
     * @param record the record to test
     * @return true if the record must be excluded from the histogram
     */
    private boolean skipRecord(SAMRecord record) {
        return !record.getReadPairedFlag() ||
                record.getMateUnmappedFlag() ||
                record.getFirstOfPairFlag() ||
                record.getNotPrimaryAlignmentFlag() ||
                record.getDuplicateReadFlag() ||
                record.getInferredInsertSize() == 0;
    }
}