Summaries generated for firehose from DoC output have been migrated to their own walker to calculate aggregate coverage statistics in a parallelizable and fast way. This is an initial commit; bug-fixing and testing are upcoming.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2876 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
491b30e8de
commit
173956927b
|
|
@ -1,369 +0,0 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Feb 18, 2010
|
||||
*/
|
||||
|
||||
class AnalyzeDepthCLP extends CommandLineProgram {
|
||||
@Argument(fullName = "depthOfCoverageFile", shortName = "df", doc = "The Depth of Coverage output file", required = true)
|
||||
public File docFile = null;
|
||||
@Argument(fullName = "summaryFile", shortName = "sf", doc = "The summary file to which to output", required = true)
|
||||
public File summaryFile = null;
|
||||
@Argument(fullName = "plotBaseName", shortName = "bn", doc = "The base name for the plot files (e.g. 'foo' yields plots 'foo_DoC_by_sample.pdf'). Please ensure this name contains no spaces.", required = false)
|
||||
public String plotBaseName = "DepthAnalysis";
|
||||
@Argument(fullName = "pathToRScript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||
private String PATH_TO_RESOURCES = "./";
|
||||
|
||||
private boolean containsByLocus = false;
|
||||
private boolean containsByTarget = false;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
// CONSTANT VALUES: SUMMARY STRING FOR NO INFORMATION, R-SCRIPT ARGUMENTS, ETC
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private final String DEFAULT_SUMMARY_STRING = "No Summary Information";
|
||||
private final String PER_LOCUS_R_ARGUMENTS = "PlotInterleavedRows depth_of_coverage\\;proportion_of_bases_above\\;Per_Sample_Depth_of_Coverage\\;"+plotBaseName+"_per_locus";
|
||||
private final String PER_TARGET_R_ARGUMENTS = "PlotInterleavedRows depth_of_coverage\\;proportion_of_targets_with_mean_coverage_above\\;Per_Sample_Average_DoC_Over_Targets\\;"+plotBaseName+"_per_target";
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
// ANALYSIS START: CALCULATE STATISTICS, WRITE IN R-READABLE FORMAT, MAKE PLOTS
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
protected int execute() {
|
||||
List<DepthStatisticsCalculator> depthStats = calculateDepthStatistics(docFile);
|
||||
String perLocusSummary = DEFAULT_SUMMARY_STRING;
|
||||
String perTargetSummary = DEFAULT_SUMMARY_STRING;
|
||||
|
||||
if ( containsByLocus ) {
|
||||
File baseSummaryTable = writeBaseSummaryFile(depthStats);
|
||||
perLocusSummary = generatePerLocusSummary(baseSummaryTable,depthStats);
|
||||
}
|
||||
|
||||
if ( containsByTarget ) {
|
||||
File targetSummaryTable = writeTargetSumamryFile(depthStats);
|
||||
perTargetSummary = generatePerTargetSummary(targetSummaryTable, depthStats);
|
||||
}
|
||||
|
||||
writeSummaryInfoFile(summaryFile,perLocusSummary,perTargetSummary);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
// OPEN AND WRITE FINAL SUMMARY DOC FILE
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private void writeSummaryInfoFile(File sFile, String locusSummary, String targetSummary) {
|
||||
PrintWriter writer;
|
||||
try {
|
||||
writer = new PrintWriter(sFile);
|
||||
writer.printf("%s%n","##Depth of coverage summary file");
|
||||
writer.printf("%s%n","##Well_Covered_Samples_By_Base - % of samples with >80% bases covered to 10x");
|
||||
writer.printf("%s%n","##Well_Covered_Samples_By_Mean - % of samples with mean coverage > 10x");
|
||||
writer.printf("%s%n%n","##Well_Covered_Samples_By_Target - % of samples with >80% targets covered to 10x");
|
||||
if ( containsByLocus )
|
||||
writer.printf("%s%n",locusSummary);
|
||||
if ( containsByTarget )
|
||||
writer.printf("%s",targetSummary);
|
||||
writer.close();
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Error writing final depth of coverage summary file",e);
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
// CALL R-SCRIPTS AND GENERATE OVERALL SUMMARY FILES
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private String generatePerLocusSummary(File rReadablePlotFile, List<DepthStatisticsCalculator> calcs) {
|
||||
String rCommand = PATH_TO_RSCRIPT+" "+PATH_TO_RESOURCES+" "+rReadablePlotFile.getAbsolutePath()+" "+PER_LOCUS_R_ARGUMENTS;
|
||||
try {
|
||||
Process p = Runtime.getRuntime().exec(rCommand);
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Error executing r command for per locus plot generation",e);
|
||||
}
|
||||
|
||||
StringBuilder summary = new StringBuilder();
|
||||
summary.append(String.format("%s%n","PER_LOCUS_SUMMARY"));
|
||||
int numSamples = calcs.size()-2;
|
||||
int numGoodSamples = 0;
|
||||
int numGoodSamplesByMeanCvg = 0;
|
||||
double totalAvgCoverage = -1;
|
||||
double totalStdevCoverage = -1;
|
||||
|
||||
for ( DepthStatisticsCalculator calc : calcs ) {
|
||||
if ( calc.getName().equalsIgnoreCase("total_coverage")) {
|
||||
totalAvgCoverage = calc.getMean();
|
||||
totalStdevCoverage = Math.sqrt(calc.getVar());
|
||||
} else if ( ! calc.getName().equalsIgnoreCase("coverage_without_deletions") ) {
|
||||
if ( calc.getPercentWellCoveredLoci() > 0.8 ) {
|
||||
numGoodSamples++;
|
||||
}
|
||||
|
||||
if ( calc.getMean() > 10 ) {
|
||||
numGoodSamplesByMeanCvg++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
summary.append(String.format("%s\t%f%n","Average_Coverage:",totalAvgCoverage));
|
||||
summary.append(String.format("%s\t%f%n","Stdev_Coverage:",totalStdevCoverage));
|
||||
summary.append(String.format("%s\t%.2f%n","%Well_Covered_Samples_By_Base", ( (double) numGoodSamples*100 )/( (double) numSamples)));
|
||||
summary.append(String.format("%s\t%.2f%n","%Well_Covered_Samples_By_Mean", ( (double) numGoodSamplesByMeanCvg*100) / ( (double) numSamples )));
|
||||
|
||||
return summary.toString();
|
||||
}
|
||||
|
||||
private String generatePerTargetSummary(File rReadablePlotFile, List<DepthStatisticsCalculator> calcs) {
|
||||
String rCommand = PATH_TO_RSCRIPT+" "+PATH_TO_RESOURCES+" "+rReadablePlotFile.getAbsolutePath()+" "+PER_TARGET_R_ARGUMENTS;
|
||||
try {
|
||||
Process p = Runtime.getRuntime().exec(rCommand);
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Error executing r command for per locus plot generation",e);
|
||||
}
|
||||
|
||||
StringBuilder summary = new StringBuilder();
|
||||
summary.append(String.format("%s%n","PER_TARGET_SUMMARY"));
|
||||
int numSamples = calcs.size()-2;
|
||||
int numGoodSamples = 0;
|
||||
|
||||
for ( DepthStatisticsCalculator calc : calcs ) {
|
||||
if ( calc.getName().equalsIgnoreCase("total_coverage")) {
|
||||
// do nothing
|
||||
} else if ( ! calc.getName().equalsIgnoreCase("coverage_without_deletions") ) {
|
||||
if ( calc.getPercentWellCoveredTargets() > 0.8 ) {
|
||||
numGoodSamples++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
summary.append(String.format("%s\t%.2f%n","%Well_Covered_Samples_By_Target", ( (double) numGoodSamples*100) / ( (double) numSamples )));
|
||||
|
||||
return summary.toString();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
// R-READABLE TEMPORARY FILE CREATION
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private File writeBaseSummaryFile(List<DepthStatisticsCalculator> calcs) {
|
||||
File perLocusSummaryFile;
|
||||
|
||||
try {
|
||||
perLocusSummaryFile = File.createTempFile(plotBaseName+"_per_locus_summary",".txt");
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Could not create a temporary file. Please check the permissions of the directory you are running in, and that the base name is not a filepath.",e);
|
||||
}
|
||||
|
||||
PrintWriter locusWriter;
|
||||
|
||||
try {
|
||||
locusWriter = new PrintWriter(perLocusSummaryFile);
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Locus summary temporary file was created but could not be opened.",e);
|
||||
}
|
||||
|
||||
for ( DepthStatisticsCalculator calc : calcs ) {
|
||||
if ( ! calc.getName().equalsIgnoreCase("total_coverage") && ! calc.getName().equalsIgnoreCase("coverage_without_deletions") ) {
|
||||
locusWriter.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f",calc.getName(),calc.getLocusProportions());
|
||||
locusWriter.printf("%s\t%d\t%d\t%d\t%d\t%d\t%d",calc.getName(),calc.getEvalPoints());
|
||||
}
|
||||
}
|
||||
|
||||
locusWriter.close();
|
||||
return perLocusSummaryFile;
|
||||
}
|
||||
|
||||
private File writeTargetSumamryFile(List<DepthStatisticsCalculator> calcs) {
|
||||
File perTargetSummaryFile;
|
||||
|
||||
try {
|
||||
perTargetSummaryFile = File.createTempFile(plotBaseName+"_per_target_summary",".txt");
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Could not create a temporary file. Please check the permissions of the directory you are running in, and that the base name is not a filepath.",e);
|
||||
}
|
||||
|
||||
PrintWriter targetWriter;
|
||||
|
||||
try {
|
||||
targetWriter = new PrintWriter(perTargetSummaryFile);
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Target summary temporary file was created but could not be opened.",e);
|
||||
}
|
||||
|
||||
for ( DepthStatisticsCalculator calc : calcs ) {
|
||||
if ( ! calc.getName().equalsIgnoreCase("total_coverage") && ! calc.getName().equalsIgnoreCase("coverage_without_deletions") ) {
|
||||
targetWriter.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f",calc.getName(),calc.getTargetProportions());
|
||||
targetWriter.printf("%s\t%d\t%d\t%d\t%d\t%d\t%d",calc.getName(),calc.getEvalPoints());
|
||||
}
|
||||
}
|
||||
|
||||
targetWriter.close();
|
||||
return perTargetSummaryFile;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
// READING THE DEPTH OF COVERAGE FILE INTO CALCULATOR OBJECTS
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private List<DepthStatisticsCalculator> calculateDepthStatistics(File docFile) {
|
||||
BufferedReader docReader;
|
||||
|
||||
try {
|
||||
docReader = new BufferedReader( new FileReader(docFile) );
|
||||
} catch ( IOException e) {
|
||||
throw new StingException("The file "+docFile.getAbsolutePath()+" could not be opened...",e);
|
||||
}
|
||||
|
||||
String locusHeader = getDOCSectionHeader(docReader); // this will read to the first section header
|
||||
List<DepthStatisticsCalculator> docCalculators;
|
||||
if ( locusHeader != null && locusHeader.equalsIgnoreCase("PER_LOCUS_COVERAGE_SECTION")) {
|
||||
containsByLocus = true;
|
||||
docCalculators = instantiateDOCCalculators(docReader);
|
||||
updateLocusInfo(docCalculators,docReader);
|
||||
String targetHeader = getDOCSectionHeader(docReader);
|
||||
if ( targetHeader != null && targetHeader.equalsIgnoreCase("PER_TARGET_COVERAGE_SECTION") ) {
|
||||
containsByTarget = true;
|
||||
updateTargetInfo(docCalculators,docReader);
|
||||
} else {
|
||||
containsByTarget = false;
|
||||
}
|
||||
} else if ( locusHeader != null && locusHeader.equalsIgnoreCase("PER_TARGET_COVERAGE_SECTION") ) {
|
||||
containsByTarget = true;
|
||||
containsByLocus = false;
|
||||
docCalculators = instantiateDOCCalculators(docReader);
|
||||
updateTargetInfo(docCalculators,docReader);
|
||||
} else {
|
||||
containsByLocus = false;
|
||||
containsByTarget = false;
|
||||
docCalculators = null;
|
||||
}
|
||||
|
||||
return docCalculators;
|
||||
}
|
||||
|
||||
private List<DepthStatisticsCalculator> instantiateDOCCalculators(BufferedReader reader) {
|
||||
String header;
|
||||
try {
|
||||
header = reader.readLine();
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Unable to read the section header",e);
|
||||
}
|
||||
|
||||
List<DepthStatisticsCalculator> calcs = new ArrayList<DepthStatisticsCalculator>();
|
||||
|
||||
int offset = -1;
|
||||
for ( String entry : header.split("\t") ) {
|
||||
if ( offset > -1 ) {
|
||||
calcs.add(new DepthStatisticsCalculator(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
|
||||
return calcs;
|
||||
}
|
||||
|
||||
private void updateLocusInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
|
||||
|
||||
String docLocLine;
|
||||
try {
|
||||
docLocLine = reader.readLine();
|
||||
while ( ! isEndOfSection(docLocLine) ) {
|
||||
int offset = -1;
|
||||
for ( String entry : docLocLine.split("\t") ) {
|
||||
if ( offset > -1 ) {
|
||||
calcs.get(offset).updateLocus(Integer.parseInt(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
} catch ( IOException e) {
|
||||
throw new StingException("Error reading locus depth of coverage information",e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void updateTargetInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
|
||||
|
||||
String docLocLine;
|
||||
try {
|
||||
docLocLine = reader.readLine();
|
||||
while ( ! isEndOfSection(docLocLine) ) {
|
||||
int offset = -1;
|
||||
int targetSize = 0;
|
||||
for ( String entry : docLocLine.split("\t") ) {
|
||||
if ( offset == -1 ) {
|
||||
targetSize = parseInterval(entry);
|
||||
} else {
|
||||
calcs.get(offset).updateTargets(targetSize,Integer.parseInt(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Error reading target depth of coverage information",e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
// FILE IO METHODS -- DEPEND ON DEPTH OF COVERAGE FILE FORMAT
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private boolean isEndOfSection( String line ) {
|
||||
// sections delimited by empty line
|
||||
return line.equalsIgnoreCase("");
|
||||
}
|
||||
|
||||
private String getDOCSectionHeader(BufferedReader reader) {
|
||||
String header;
|
||||
try {
|
||||
do {
|
||||
header = reader.readLine();
|
||||
} while ( ! isDOCSectionSeparator(header) && header != null);
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Error reading depth of coverage file",e);
|
||||
}
|
||||
|
||||
return header;
|
||||
}
|
||||
|
||||
private boolean isDOCSectionSeparator( String line ) {
|
||||
return line.contains("_COVERAGE_SECTION");
|
||||
}
|
||||
|
||||
private int parseInterval(String interval) {
|
||||
String startstop = interval.split(":")[1];
|
||||
int start = Integer.parseInt(startstop.split("-")[0]);
|
||||
int stop = Integer.parseInt(startstop.split("-")[1]);
|
||||
return stop - start;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
// PROGRAM START -- THE MAIN() METHOD AND WRAPPER CLASS
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
public class AnalyzeDepthOfCoverage {
|
||||
|
||||
public static void main(String[] args) {
|
||||
AnalyzeDepthCLP depthAnalysis = new AnalyzeDepthCLP();
|
||||
CommandLineProgram.start(depthAnalysis,args);
|
||||
System.exit(0);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,132 +0,0 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Feb 12, 2010
|
||||
*/
|
||||
public class DepthStatisticsCalculator extends SummaryStatisticsCalculator {
|
||||
private int numBasesAbove0x; // % of bases with ANY coverage
|
||||
private int numBasesAbove3x; // % of bases with 4x+
|
||||
private int numBasesAbove9x; // % of bases with 10x+
|
||||
private int numBasesAbove24x; // % of bases with 25x+
|
||||
private int numBasesAbove49x; // % of bases with 50x+
|
||||
private int numBasesAbove99x; // % of bases with 100x+
|
||||
|
||||
private int targetsAbove0x;
|
||||
private int targetsAbove3x;
|
||||
private int targetsAbove9x;
|
||||
private int targetsAbove24x;
|
||||
private int targetsAbove49x;
|
||||
private int targetsAbove99x;
|
||||
private int numTargets;
|
||||
|
||||
public static int[] DEPTH_CUTOFFS = {1,4,10,25,50,100};
|
||||
|
||||
public DepthStatisticsCalculator(String name) {
|
||||
super(name);
|
||||
numBasesAbove0x = 0;
|
||||
numBasesAbove3x = 0;
|
||||
numBasesAbove9x = 0;
|
||||
numBasesAbove24x = 0;
|
||||
numBasesAbove49x = 0;
|
||||
numBasesAbove99x = 0;
|
||||
targetsAbove99x = 0;
|
||||
targetsAbove49x = 0;
|
||||
targetsAbove24x = 0;
|
||||
targetsAbove9x = 0;
|
||||
targetsAbove3x = 0;
|
||||
targetsAbove0x = 0;
|
||||
numTargets = 0;
|
||||
}
|
||||
|
||||
public void updateLocus(int depth) {
|
||||
super.update(depth);
|
||||
|
||||
if ( depth > 99 ) {
|
||||
numBasesAbove99x++;
|
||||
}
|
||||
if ( depth > 49 ) {
|
||||
numBasesAbove49x++;
|
||||
}
|
||||
if ( depth > 24 ) {
|
||||
numBasesAbove24x++;
|
||||
}
|
||||
if ( depth > 9 ) {
|
||||
numBasesAbove9x++;
|
||||
}
|
||||
if ( depth > 3 ) {
|
||||
numBasesAbove3x++;
|
||||
}
|
||||
if ( depth > 0 ) {
|
||||
numBasesAbove0x++;
|
||||
}
|
||||
}
|
||||
|
||||
public void updateTargets(int targetLength, int totalCoverage) {
|
||||
double avgCvg = ( (double) totalCoverage ) / ( (double) targetLength);
|
||||
|
||||
if ( avgCvg >= 100 ) {
|
||||
targetsAbove99x ++;
|
||||
}
|
||||
if ( avgCvg >= 50 ) {
|
||||
targetsAbove49x++;
|
||||
}
|
||||
if ( avgCvg >= 25 ) {
|
||||
targetsAbove24x++;
|
||||
}
|
||||
if ( avgCvg >= 10 ) {
|
||||
targetsAbove9x++;
|
||||
}
|
||||
if ( avgCvg >= 4 ) {
|
||||
targetsAbove3x++;
|
||||
}
|
||||
if ( avgCvg >= 1 ) {
|
||||
targetsAbove0x++;
|
||||
}
|
||||
|
||||
numTargets++;
|
||||
}
|
||||
|
||||
public double[] getLocusProportions() {
|
||||
double[] proportions = new double[6];
|
||||
|
||||
proportions[0] = ( (double) numBasesAbove0x )/( (double) getSampleSize() );
|
||||
proportions[2] = ( (double) numBasesAbove9x )/( (double) getSampleSize() );
|
||||
proportions[3] = ( (double) numBasesAbove24x )/( (double) getSampleSize() );
|
||||
proportions[4] = ( (double) numBasesAbove49x )/( (double) getSampleSize() );
|
||||
proportions[5] = ( (double) numBasesAbove99x )/( (double) getSampleSize() );
|
||||
proportions[1] = ( (double) numBasesAbove3x )/( (double) getSampleSize() );
|
||||
|
||||
return proportions;
|
||||
}
|
||||
|
||||
public double[] getTargetProportions() {
|
||||
double[] proportions = new double[6];
|
||||
|
||||
proportions[0] = ( (double) targetsAbove0x )/( (double) numTargets );
|
||||
proportions[2] = ( (double) targetsAbove9x )/( (double) numTargets );
|
||||
proportions[3] = ( (double) targetsAbove24x )/( (double) numTargets );
|
||||
proportions[4] = ( (double) targetsAbove49x )/( (double) numTargets );
|
||||
proportions[5] = ( (double) targetsAbove99x )/( (double) numTargets );
|
||||
proportions[1] = ( (double) targetsAbove3x )/( (double) numTargets );
|
||||
|
||||
return proportions;
|
||||
}
|
||||
|
||||
public double getPercentWellCoveredLoci() {
|
||||
return 10*( (double) numBasesAbove9x )/( (double) getSampleSize() );
|
||||
}
|
||||
|
||||
public double getPercentWellCoveredTargets() {
|
||||
return 10*( (double) targetsAbove9x )/( (double) numTargets );
|
||||
}
|
||||
|
||||
public int[] getEvalPoints() {
|
||||
return DepthStatisticsCalculator.DEPTH_CUTOFFS;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,257 +0,0 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Feb 11, 2010
|
||||
*/
|
||||
|
||||
class FirehoseSummaryCLP extends CommandLineProgram {
|
||||
@Argument(fullName = "depthOfCoverageFile", shortName = "doc", doc="Path to the depth of coverage file", required=true)
|
||||
private File depthOfCoverage = null;
|
||||
// @Argument(fullName = "contaminationFile", shortName = "con", doc="Path to the contamination file", required=true)
|
||||
// private File contamination = null;
|
||||
// @Argument(fullName = "errorRateFile", shortName = "err", doc="Path to the error rate file", required=true)
|
||||
// private File errorRate = null;
|
||||
// @Argument(fullName = "zipFiles", shortName = "zip", doc="List of paths to zip files which contain summary metrics files", required=false)
|
||||
// private String zipFiles = null;
|
||||
|
||||
private static String R_SCRIPT = "plotFirehoseDataQCMetrics.R";
|
||||
private static String SCRIPT_DOC_FLAG = "DOC";
|
||||
|
||||
protected int execute() {
|
||||
// SummaryFileCollection metricsFiles = getFileHandles();
|
||||
List<DepthStatisticsCalculator> depthStats = calculateDepthStatistics(depthOfCoverage);
|
||||
String docSummary = makeDOCPlots(depthStats);
|
||||
return 1;
|
||||
}
|
||||
|
||||
private String makeDOCPlots(List<DepthStatisticsCalculator> calcs) {
|
||||
StringBuilder summaryBuilder = new StringBuilder();
|
||||
int numSamplesWithGoodBaseCoverage=0;
|
||||
int numSamplesWithGoodMeanCoverage = 0;
|
||||
int numSamplesWithGoodTargetCoverage=0;
|
||||
double aggregateMeanCoverage = -1d;
|
||||
double aggregateStdDevCoverage = -1d;
|
||||
double aggregateSkewCoverage = -1d;
|
||||
PrintWriter locusOutput;
|
||||
PrintWriter targetOutput;
|
||||
File locusOutputFile;
|
||||
File targetOutputFile;
|
||||
try {
|
||||
locusOutputFile = File.createTempFile("locus_output","txt");
|
||||
locusOutput = new PrintWriter(locusOutputFile);
|
||||
targetOutputFile = File.createTempFile("target_output","txt");
|
||||
targetOutput = new PrintWriter(targetOutputFile);
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Error creating temporary output files for R-plotting. Perhaps user permissions do not allow creation of files.",e);
|
||||
}
|
||||
|
||||
for ( DepthStatisticsCalculator calc : calcs ) {
|
||||
|
||||
if ( calc.getName().equalsIgnoreCase("total_coverage") || calc.getName().equalsIgnoreCase("coverage_without_deletions") ) {
|
||||
// update summary statistics
|
||||
if ( calc.getName().equalsIgnoreCase("total_coverage") ) {
|
||||
aggregateMeanCoverage = calc.getMean();
|
||||
aggregateStdDevCoverage = Math.sqrt(calc.getVar());
|
||||
aggregateSkewCoverage = calc.getSkew();
|
||||
}
|
||||
} else {
|
||||
// update sample-based summary statistics and print output to file
|
||||
locusOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n",calc.getName(),calc.getLocusProportions());
|
||||
targetOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n", calc.getName(),calc.getTargetProportions());
|
||||
if ( calc.getPercentWellCoveredLoci() > 80 ) {
|
||||
numSamplesWithGoodBaseCoverage++;
|
||||
}
|
||||
if ( calc.getPercentWellCoveredTargets() > 90 ) {
|
||||
numSamplesWithGoodTargetCoverage++;
|
||||
}
|
||||
if ( calc.getMean() > 10 ) {
|
||||
numSamplesWithGoodMeanCoverage++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfBasesCoveredBySample",locusOutputFile.getAbsolutePath());
|
||||
//invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfTargetsCoveredBySample",targetOutputFile.getAbsolutePath());
|
||||
return "temporary";
|
||||
}
|
||||
|
||||
// private SummaryFileCollection getFileHandles() {
|
||||
// if ( zipFiles == null ) {
|
||||
// return null;
|
||||
// }
|
||||
//
|
||||
// SummaryFileCollection summaryFiles = new SummaryFileCollection();
|
||||
// for ( String zipFile : zipFiles.split(",") ) {
|
||||
// summaryFiles.process(zipFile);
|
||||
// }
|
||||
//
|
||||
// return summaryFiles;
|
||||
// }
|
||||
|
||||
private List<DepthStatisticsCalculator> calculateDepthStatistics(File docFile) {
|
||||
BufferedReader docReader;
|
||||
|
||||
try {
|
||||
docReader = new BufferedReader( new FileReader(docFile) );
|
||||
} catch ( IOException e) {
|
||||
throw new StingException("The file "+docFile.getAbsolutePath()+" could not be opened...",e);
|
||||
}
|
||||
|
||||
String locusHeader = getDOCSectionHeader(docReader);
|
||||
List<DepthStatisticsCalculator> docCalculators = instantiateDOCCalculators(locusHeader);
|
||||
|
||||
updateLocusInfo(docCalculators,docReader);
|
||||
|
||||
String targetHeader = getDOCSectionHeader(docReader);
|
||||
|
||||
updateTargetInfo(docCalculators,docReader);
|
||||
|
||||
return docCalculators;
|
||||
}
|
||||
|
||||
private void updateLocusInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
|
||||
|
||||
String docLocLine;
|
||||
try {
|
||||
docLocLine = reader.readLine();
|
||||
while ( ! isEndOfSection(docLocLine) ) {
|
||||
int offset = -1;
|
||||
for ( String entry : docLocLine.split("\t") ) {
|
||||
if ( offset > -1 ) {
|
||||
calcs.get(offset).updateLocus(Integer.parseInt(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
} catch ( IOException e) {
|
||||
throw new StingException("Error reading locus depth of coverage information",e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void updateTargetInfo(List<DepthStatisticsCalculator> calcs, BufferedReader reader) {
|
||||
|
||||
String docLocLine;
|
||||
try {
|
||||
docLocLine = reader.readLine();
|
||||
while ( ! isEndOfSection(docLocLine) ) {
|
||||
int offset = -1;
|
||||
int targetSize = 0;
|
||||
for ( String entry : docLocLine.split("\t") ) {
|
||||
if ( offset == -1 ) {
|
||||
targetSize = parseInterval(entry);
|
||||
} else {
|
||||
calcs.get(offset).updateTargets(targetSize,Integer.parseInt(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Error reading target depth of coverage information",e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private int parseInterval(String interval) {
|
||||
String startstop = interval.split(":")[1];
|
||||
int start = Integer.parseInt(startstop.split("-")[0]);
|
||||
int stop = Integer.parseInt(startstop.split("-")[1]);
|
||||
return stop - start;
|
||||
}
|
||||
|
||||
private boolean isDOCSectionSeparator( String line ) {
|
||||
return line.contains("_COVERAGE_SECTION");
|
||||
}
|
||||
|
||||
private boolean isEndOfSection( String line ) {
|
||||
// sections delimited by empty line
|
||||
return line.equalsIgnoreCase("");
|
||||
}
|
||||
|
||||
private String getDOCSectionHeader(BufferedReader reader) {
|
||||
String header;
|
||||
try {
|
||||
do {
|
||||
header = reader.readLine();
|
||||
} while ( ! isDOCSectionSeparator(header) );
|
||||
|
||||
header = reader.readLine();
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Error reading depth of coverage file",e);
|
||||
}
|
||||
|
||||
return header;
|
||||
}
|
||||
|
||||
private List<DepthStatisticsCalculator> instantiateDOCCalculators(String header) {
|
||||
List<DepthStatisticsCalculator> calcs = new ArrayList<DepthStatisticsCalculator>();
|
||||
|
||||
int offset = -1;
|
||||
for ( String entry : header.split("\t") ) {
|
||||
if ( offset > -1 ) {
|
||||
calcs.add(new DepthStatisticsCalculator(entry));
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
|
||||
return calcs;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public class GenerateFirehoseSummary {
|
||||
|
||||
public static void main(String[] args) {
|
||||
FirehoseSummaryCLP clp = new FirehoseSummaryCLP();
|
||||
CommandLineProgram.start(clp, args);
|
||||
System.exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
//class SummaryFileCollection {
|
||||
//
|
||||
// // container class for files we'll be summarizing
|
||||
//
|
||||
// public Map<String,File> fingerprintSummaryFiles;
|
||||
// public Map<String,File> hybridSelectionMetricsFiles;
|
||||
// public Map<String,File> insertSizeDistributionFiles;
|
||||
// public Map<String,File> alignmentMetricsFiles;
|
||||
//
|
||||
// public SummaryFileCollection() {
|
||||
// fingerprintSummaryFiles = new HashMap<String,File>();
|
||||
// hybridSelectionMetricsFiles = new HashMap<String, File>();
|
||||
// insertSizeDistributionFiles = new HashMap<String,File>();
|
||||
// alignmentMetricsFiles = new HashMap<String,File>();
|
||||
// }
|
||||
//
|
||||
// public void process(String zipFilePath) {
|
||||
// String sampleName = zipFilePath.split("_sequencing_metrics.zip")[0].split("_")[1];
|
||||
// File fingerprintSummaryFile = new File(sampleName+".summary_fingerprint_metrics");
|
||||
// File hybridSelectionFile = new File(sampleName+".hybrid_selection_metrics");
|
||||
// File insertSizeFile = new File(sampleName+".insert_size_metrics");
|
||||
// File alignmentFile = new File(sampleName+".alignment_metrics");
|
||||
//
|
||||
// String command = "unzip "+zipFilePath;
|
||||
// try {
|
||||
// Process p = Runtime.getRuntime().exec(command);
|
||||
// } catch (IOException e) {
|
||||
// throw new RuntimeException("Could not unzip the file "+zipFilePath);
|
||||
// }
|
||||
//
|
||||
// fingerprintSummaryFiles.put(sampleName,fingerprintSummaryFile);
|
||||
// hybridSelectionMetricsFiles.put(sampleName,hybridSelectionFile);
|
||||
// insertSizeDistributionFiles.put(sampleName,insertSizeFile);
|
||||
// alignmentMetricsFiles.put(sampleName,alignmentFile);
|
||||
// }
|
||||
//}
|
||||
|
|
@ -1,69 +0,0 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.firehosesummary;
|
||||
|
||||
/**
|
||||
* This object calculates the first three sample moments of a data set on-the-fly
|
||||
* @Author chartl
|
||||
* @Date Feb 12, 2010
|
||||
*/
|
||||
/**
 * Calculates running (on-the-fly) estimates of the first three sample moments
 * (mean, variance, skew) of a stream of data points.
 *
 * Note: the variance and skew are incremental approximations -- contributions
 * from earlier data points are not re-weighted when the mean estimate changes,
 * so these estimates converge to the true sample moments more slowly than an
 * exact two-pass calculation would (see comments in update()).
 *
 * @Author chartl
 * @Date Feb 12, 2010
 */
public class SummaryStatisticsCalculator {
    // todo -- can median / quantiles be estimated on-the-fly?
    // todo -- can we divine a metric for modality?
    private double mean;    // running mean of the samples fed to it
    private double var;     // running variance estimate of the samples fed to it
    private double skew;    // running skew estimate of the samples fed to it
    private int sampleSize; // number of samples given to the calculator
    private String name;    // name to associate the calculator with, if any

    public SummaryStatisticsCalculator() {
        this("SUMMARY"); // delegate rather than duplicate the field initialization
    }

    public SummaryStatisticsCalculator(String name) {
        mean = 0;
        var = 0;
        skew = 0;
        sampleSize = 0;
        this.name = name;
    }

    /**
     * Folds a new data point into the running mean/variance/skew estimates.
     *
     * @param datum the new observation
     */
    public void update(double datum) {
        mean = (sampleSize*mean + datum)/(sampleSize + 1);
        var = (sampleSize*var + Math.pow( mean - datum , 2 ) )/( sampleSize + 1);
        // note: the variance is not re-calculated over all data points each time the mean changes
        // this means the convergence will be slower than x^-(1/2); but it will still converge with the mean
        // whole exome is ~3.3 million datapoints, which will be plenty for this to converge to within a very
        // small proportion of the true sample variance

        // BUGFIX: sample skew is normalized by var^(3/2) (the cube of the standard deviation).
        // The previous exponent "2/3" used integer division, evaluating to var^0 == 1, so the
        // skew was never standardized at all. Guard against var == 0 (first sample, or all
        // samples identical so far) to avoid a 0/0 -> NaN that would poison later updates.
        double sdCubed = Math.pow(var, 1.5);
        if ( sdCubed > 0 ) {
            skew = ( skew*sdCubed*sampleSize + Math.pow( datum - mean, 3) ) / ( ( sampleSize + 1 )*sdCubed );
        } else {
            skew = 0; // no spread yet: skew is undefined, report 0
        }
        // again, new mean/variance estimates are not propagated over all previous data points, so convergence is a bit slower
        // but that's the price one pays for on-the-fly calculation
        sampleSize++;
    }

    /** Convenience overload for integer data. */
    public void update(int datum) {
        update( (double) datum);
    }

    /** @return the running mean of all data seen so far */
    public double getMean() {
        return mean;
    }

    /** @return the running (approximate) variance of all data seen so far */
    public double getVar() {
        return var;
    }

    /** @return the running (approximate) standardized skew of all data seen so far */
    public double getSkew() {
        return skew;
    }

    /** @return the name this calculator was constructed with */
    public String getName() {
        return name;
    }

    /** @return the number of data points folded in so far */
    public int getSampleSize() {
        return sampleSize;
    }
}
|
||||
|
|
@ -0,0 +1,437 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* A parallelizable walker designed to quickly aggregate relevant coverage statistics across samples in the input
|
||||
* file. Assesses the mean and median granular coverages of each sample, and generates part of a cumulative
|
||||
* distribution of % bases and % targets covered for certain depths. The granularity of DOC can be set by command
|
||||
* line arguments.
|
||||
*
|
||||
* // todo -- allow for user to set linear binning (default is logarithmic)
|
||||
* // todo -- add per target (e.g. regional) aggregation
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Feb 22, 2010
|
||||
*/
|
||||
@By(DataSource.REFERENCE)
|
||||
public class CoverageStatistics extends LocusWalker<Map<String,Integer>, DepthOfCoverageStats> implements TreeReducible<DepthOfCoverageStats> {
|
||||
@Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
|
||||
int start = 1;
|
||||
@Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
|
||||
int stop = 500;
|
||||
@Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false)
|
||||
int nBins = 20;
|
||||
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false)
|
||||
byte minMappingQuality = 50;
|
||||
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
|
||||
byte minBaseQuality = 20;
|
||||
@Argument(fullName = "perLocusStatisticsFile", shortName = "locusFile", doc = "File to output per-locus statistics to; if unprovided these will not be calculated")
|
||||
File perLocusStatisticsFile = null;
|
||||
@Argument(fullName = "perSampleStatisticsFile", shortName = "sampleFile", doc = "File to output per-sample statistics to; if unprovided will go to standard (-o) output")
|
||||
File perSampleStatisticsFile = null;
|
||||
@Argument(fullName = "summaryStatisticsFile", shortName = "summaryFile", doc = "File to output summary (mean, median) statistics to; if unprovided will go to standard (-o) output")
|
||||
File summaryStatisticsFile = null;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// STANDARD WALKER METHODS
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
public DepthOfCoverageStats reduceInit() {
|
||||
List<Set<String>> samplesByReaders = getToolkit().getSamplesByReaders();
|
||||
DepthOfCoverageStats stats = new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins));
|
||||
|
||||
for ( Set<String> sampleSet : samplesByReaders ) {
|
||||
for ( String sample : sampleSet ) {
|
||||
stats.addSample(sample);
|
||||
}
|
||||
}
|
||||
|
||||
if ( perLocusStatisticsFile != null ) {
|
||||
stats.initializeLocusCounts();
|
||||
}
|
||||
|
||||
return stats;
|
||||
}
|
||||
|
||||
public Map<String,Integer> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
Map<String,StratifiedAlignmentContext> contextsBySample =
|
||||
StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
|
||||
HashMap<String,Integer> depthBySample = new HashMap<String,Integer>();
|
||||
|
||||
for ( String sample : contextsBySample.keySet() ) {
|
||||
AlignmentContext sampleContext = contextsBySample.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
int properDepth = 0;
|
||||
for ( PileupElement e : sampleContext.getBasePileup() ) {
|
||||
if ( e.getQual() >= minBaseQuality && e.getMappingQual() >= minMappingQuality ) {
|
||||
properDepth++;
|
||||
}
|
||||
}
|
||||
|
||||
depthBySample.put(sample,properDepth);
|
||||
}
|
||||
|
||||
return depthBySample;
|
||||
}
|
||||
|
||||
public DepthOfCoverageStats reduce(Map<String,Integer> thisMap, DepthOfCoverageStats prevReduce) {
|
||||
prevReduce.update(thisMap);
|
||||
return prevReduce;
|
||||
}
|
||||
|
||||
public DepthOfCoverageStats treeReduce(DepthOfCoverageStats left, DepthOfCoverageStats right) {
|
||||
left.merge(right);
|
||||
return left;
|
||||
}
|
||||
|
||||
public void onTraversalDone(DepthOfCoverageStats coverageProfiles) {
|
||||
printSummary(out,summaryStatisticsFile,coverageProfiles);
|
||||
printPerSample(out,perSampleStatisticsFile,coverageProfiles);
|
||||
printPerLocus(perLocusStatisticsFile,coverageProfiles);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// HELPER OUTPUT METHODS
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private void printPerSample(PrintStream out, File optionalFile, DepthOfCoverageStats stats) {
|
||||
PrintStream output = getCorrectStream(out,optionalFile);
|
||||
int[] leftEnds = stats.getEndpoints();
|
||||
StringBuilder hBuilder = new StringBuilder();
|
||||
hBuilder.append("\t");
|
||||
hBuilder.append(String.format("[0,%d)\t",leftEnds[0]));
|
||||
for ( int i = 1; i < leftEnds.length; i++ )
|
||||
hBuilder.append(String.format("[%d,%d)\t",leftEnds[i-1],leftEnds[i]));
|
||||
hBuilder.append(String.format("[%d,inf)%n",leftEnds[leftEnds.length-1]));
|
||||
output.print(hBuilder.toString());
|
||||
Map<String,int[]> histograms = stats.getHistograms();
|
||||
for ( String s : histograms.keySet() ) {
|
||||
StringBuilder sBuilder = new StringBuilder();
|
||||
sBuilder.append(String.format("%s",s));
|
||||
for ( int count : histograms.get(s) ) {
|
||||
sBuilder.append(String.format("\t%d",count));
|
||||
}
|
||||
sBuilder.append(String.format("%n"));
|
||||
output.print(sBuilder.toString());
|
||||
}
|
||||
}
|
||||
|
||||
private void printPerLocus(File locusFile, DepthOfCoverageStats stats) {
|
||||
PrintStream output = getCorrectStream(null,locusFile);
|
||||
if ( output == null ) {
|
||||
return;
|
||||
}
|
||||
|
||||
int[] endpoints = stats.getEndpoints();
|
||||
int samples = stats.getHistograms().size();
|
||||
|
||||
int[][] baseCoverageCumDist = stats.getLocusCounts();
|
||||
|
||||
// rows - # of samples
|
||||
// columns - depth of coverage
|
||||
|
||||
StringBuilder header = new StringBuilder();
|
||||
for ( int d : endpoints ) {
|
||||
header.append(String.format("\t%d",d));
|
||||
}
|
||||
header.append(String.format("%n"));
|
||||
|
||||
output.print(header);
|
||||
|
||||
for ( int row = samples; row > 0; row ++ ) {
|
||||
output.printf("%s_%d\t","NSamples",row);
|
||||
for ( int depthBin = 0; depthBin < baseCoverageCumDist[0].length; depthBin ++ ) {
|
||||
output.printf("%d\t",baseCoverageCumDist[row][depthBin]);
|
||||
}
|
||||
output.printf("%n");
|
||||
}
|
||||
}
|
||||
|
||||
private PrintStream getCorrectStream(PrintStream out, File optionalFile) {
|
||||
PrintStream output;
|
||||
if ( optionalFile == null ) {
|
||||
output = out;
|
||||
} else {
|
||||
try {
|
||||
output = new PrintStream(optionalFile);
|
||||
} catch ( IOException e ) {
|
||||
logger.warn("Error opening the output file "+optionalFile.getAbsolutePath()+". Defaulting to stdout");
|
||||
output = out;
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
private void printSummary(PrintStream out, File optionalFile, DepthOfCoverageStats stats) {
|
||||
PrintStream output = getCorrectStream(out,optionalFile);
|
||||
|
||||
output.printf("%s\t%s\t%s\t%s\t%s%n","sample_id","mean","granular_third_quartile","granular_median","granular_first_quartile");
|
||||
|
||||
Map<String,int[]> histograms = stats.getHistograms();
|
||||
Map<String,Double> means = stats.getMeans();
|
||||
int[] leftEnds = stats.getEndpoints();
|
||||
|
||||
for ( String s : histograms.keySet() ) {
|
||||
int median = getQuantile(histograms.get(s),0.5);
|
||||
int q1 = getQuantile(histograms.get(s),0.25);
|
||||
int q3 = getQuantile(histograms.get(s),0.75);
|
||||
output.printf("%s\t%.2f\t%d\t%d\t%d%n",s,means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]);
|
||||
}
|
||||
|
||||
output.printf("%s\t%.2f\t%s\t%s\t%s%n","Total",means.get(DepthOfCoverageStats.ALL_SAMPLES),"N/A","N/A","N/A");
|
||||
}
|
||||
|
||||
private int getQuantile(int[] histogram, double prop) {
|
||||
int total = 0;
|
||||
|
||||
for ( int i = 0; i < histogram.length; i ++ ) {
|
||||
total += histogram[i];
|
||||
}
|
||||
|
||||
int counts = 0;
|
||||
int bin = -1;
|
||||
while ( counts < prop*total ) {
|
||||
counts += histogram[bin+1];
|
||||
bin++;
|
||||
}
|
||||
|
||||
return bin;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Accumulates aggregate depth-of-coverage statistics across a locus traversal: a granular
 * (binned) coverage histogram and a running mean coverage per sample, plus (optionally)
 * counts of loci with at least N samples covered at each depth bin. Instances built over
 * disjoint locus sets can be combined with merge() for tree-reduction.
 */
class DepthOfCoverageStats {
    /** Pseudo-sample key under which the combined (all-sample) mean depth is tracked. */
    public static final String ALL_SAMPLES = "ALL_COMBINED_SAMPLES";
    // STANDARD (constantly updated) DATA
    private Map<String,int[]> granularHistogramBySample; // holds the counts per each bin
    private Map<String,Double> meanCoverages; // holds mean coverage per sample
    private int[] binLeftEndpoints; // describes the left endpoint for each bin
    private int[][] locusCoverageCounts; // holds counts of number of bases with >=X samples at >=Y coverage
    private boolean tabulateLocusCounts = false;
    private int nLoci; // number of loci seen
    // TEMPORARY DATA (not worth re-instantiating every time)
    private int[] locusHistogram; // holds a histogram for each locus; reset after each update() call
    private int totalDepth; // holds the total depth of coverage for each locus; reset after each update() call

    ////////////////////////////////////////////////////////////////////////////////////
    // STATIC METHODS
    ////////////////////////////////////////////////////////////////////////////////////

    /**
     * Computes strictly-increasing, logarithmically-spaced left endpoints for the depth bins.
     * The returned array has bins+1 entries; entry 0 is always 0 (the "below start" bin).
     *
     * @throws IllegalArgumentException if lower < 1 or bins > upper - lower
     */
    public static int[] calculateBinEndpoints(int lower, int upper, int bins) {
        if ( bins > upper - lower || lower < 1 ) {
            throw new IllegalArgumentException("Illegal argument to calculateBinEndpoints; "+
                    "lower bound must be at least 1, and number of bins may not exceed stop - start");
        }

        int[] binLeftEndpoints = new int[bins+1];
        binLeftEndpoints[0] = 0; // depth < start

        int length = upper - lower;
        double scale = Math.log10((double) length)/bins;

        // BUGFIX: the loop previously stopped at b < bins, leaving the final endpoint
        // (index bins) at 0 and breaking the strict monotonicity the binning relies on.
        for ( int b = 1; b <= bins ; b++ ) {
            int leftEnd = (int) Math.floor(Math.pow(10.0,(b-1.0)*scale));

            // logarithmic spacing can collapse adjacent small bins onto the same integer;
            // bump until strictly greater than the previous endpoint
            while ( leftEnd <= binLeftEndpoints[b-1] ) {
                leftEnd++;
            }

            binLeftEndpoints[b] = leftEnd;
        }

        return binLeftEndpoints;
    }

    ////////////////////////////////////////////////////////////////////////////////////
    // INITIALIZATION METHODS
    ////////////////////////////////////////////////////////////////////////////////////

    public DepthOfCoverageStats(int[] leftEndpoints) {
        this.binLeftEndpoints = leftEndpoints;
        granularHistogramBySample = new HashMap<String,int[]>();
        meanCoverages = new HashMap<String,Double>();
        // BUGFIX: update() and merge() read the combined mean via
        // meanCoverages.get(ALL_SAMPLES); without seeding that key here the first
        // read auto-unboxes null and throws a NullPointerException.
        meanCoverages.put(ALL_SAMPLES, 0.0);
        nLoci = 0;
        totalDepth = 0;
    }

    /** Registers a sample, giving it a zeroed histogram and mean; no-op if already present. */
    public void addSample(String sample) {
        if ( granularHistogramBySample.containsKey(sample) ) {
            return;
        }

        int[] binCounts = new int[this.binLeftEndpoints.length];
        for ( int b = 0; b < binCounts.length; b ++ ) {
            binCounts[b] = 0;
        }

        granularHistogramBySample.put(sample,binCounts);
        meanCoverages.put(sample,0.0);
    }

    /**
     * Turns on per-locus tabulation (loci with >= N samples at >= each depth bin).
     * Must be called after all samples have been added, since the matrix is sized
     * by the current sample count.
     */
    public void initializeLocusCounts() {
        locusCoverageCounts = new int[granularHistogramBySample.size()][binLeftEndpoints.length];
        locusHistogram = new int[binLeftEndpoints.length];
        for ( int b = 0; b < binLeftEndpoints.length; b ++ ) {
            for ( int a = 0; a < granularHistogramBySample.size(); a ++ ) {
                locusCoverageCounts[a][b] = 0;
            }
            locusHistogram[b] = 0;
        }

        tabulateLocusCounts = true;
    }

    ////////////////////////////////////////////////////////////////////////////////////
    // UPDATE METHODS
    ////////////////////////////////////////////////////////////////////////////////////

    /**
     * Folds one locus into the statistics. Samples absent from the map are counted at
     * depth 0; samples in the map that were never addSample()'d are ignored.
     *
     * @param depthBySample per-sample filtered depth at this locus
     */
    public void update(Map<String,Integer> depthBySample) {
        int b;
        for ( String sample : granularHistogramBySample.keySet() ) {
            if ( depthBySample.containsKey(sample) ) {
                b = updateSample(sample,depthBySample.get(sample));
                totalDepth += depthBySample.get(sample);
            } else {
                b = updateSample(sample,0);
            }

            if ( tabulateLocusCounts ) {
                // accumulate a cumulative per-locus sample count for every bin at or below b
                for ( int i = 0; i <= b; i ++ ) {
                    locusHistogram[i]++;
                }
            }
        }

        // fold this locus' total depth into the running all-sample mean
        double meanDepth = meanCoverages.get(DepthOfCoverageStats.ALL_SAMPLES);
        double newMean = ( meanDepth*nLoci + (double) totalDepth )/( nLoci + 1 );
        meanCoverages.put(DepthOfCoverageStats.ALL_SAMPLES,newMean);
        updateLocusCounts(locusHistogram);

        nLoci++;
        totalDepth = 0;
    }

    /**
     * Updates one sample's running mean and granular histogram with its depth at this locus.
     *
     * @return the index of the first bin whose left endpoint exceeds the depth
     *         (or the last bin index if the depth exceeds every endpoint)
     */
    private int updateSample(String sample, int depth) {
        double mean = meanCoverages.get(sample);
        double newMean = ( nLoci*mean + (double) depth )/(nLoci + 1.0);
        meanCoverages.put(sample,newMean);

        int[] granularBins = granularHistogramBySample.get(sample);
        for ( int b = 1; b < granularBins.length; b ++ ) {
            if ( depth < binLeftEndpoints[b] ) {
                granularBins[b-1]++;
                return b ;
            }
        }

        granularBins[granularBins.length-1]++; // greater than all left-endpoints
        return granularBins.length-1;
    }

    /**
     * Merges another (disjoint-loci) statistics object into this one: histograms are summed,
     * means are combined as loci-weighted averages, and locus counts are summed when both
     * sides tabulate them.
     */
    public void merge(DepthOfCoverageStats newStats) {
        this.mergeSamples(newStats);
        if ( this.tabulateLocusCounts && newStats.tabulateLocusCounts ) {
            this.mergeLocusCounts(newStats.getLocusCounts());
        }

        double totalMean = (meanCoverages.get(DepthOfCoverageStats.ALL_SAMPLES)*nLoci +
                newStats.getMeans().get(DepthOfCoverageStats.ALL_SAMPLES)*newStats.getTotalLoci()) /
                ( nLoci + newStats.getTotalLoci());

        meanCoverages.put(DepthOfCoverageStats.ALL_SAMPLES,totalMean);
        nLoci += newStats.getTotalLoci();
    }

    // Sums per-sample histograms and loci-weights the per-sample means.
    // NOTE(review): assumes both objects were initialized with the same sample set
    // (true for tree-reduce, where both sides come from reduceInit()).
    private void mergeSamples(DepthOfCoverageStats otherStats) {
        Map<String,int[]> otherHistogram = otherStats.getHistograms();
        Map<String,Double> otherMeans = otherStats.getMeans();
        for ( String s : granularHistogramBySample.keySet() ) {
            int[] internalCounts = granularHistogramBySample.get(s);
            int[] externalCounts = otherHistogram.get(s);
            for ( int b = 0; b < internalCounts.length; b++ ) {
                internalCounts[b] += externalCounts[b];
            }

            double internalMean = meanCoverages.get(s);
            double externalMean = otherMeans.get(s);
            double newMean = ( internalMean*nLoci + externalMean*otherStats.getTotalLoci())/(nLoci+otherStats.getTotalLoci());

            meanCoverages.put(s,newMean);
        }
    }

    // Element-wise sum of the per-locus count matrices.
    private void mergeLocusCounts( int[][] otherCounts ) {
        for ( int a = 0; a < locusCoverageCounts.length; a ++ ) {
            for ( int b = 0; b < locusCoverageCounts[0].length; b ++ ) {
                locusCoverageCounts[a][b] += otherCounts[a][b];
            }
        }
    }

    /*
     * Update locus counts -- takes an array in which the number of samples
     * with depth ABOVE [i] is held. So if the bin left endpoints were 2, 5, 10
     * then we'd have an array that represented:
     * [# samples with depth 0 - inf], [# samples with depth 2 - inf],
     * [# samples with depth 5 - inf], [# samples with depth 10-inf];
     *
     * @argument cumulativeSamplesByDepthBin - see above; its entries are zeroed
     * as a side effect, in advance of the next update() call
     */
    private void updateLocusCounts(int[] cumulativeSamplesByDepthBin) {
        if ( tabulateLocusCounts ) {
            for ( int bin = 0; bin < cumulativeSamplesByDepthBin.length; bin ++ ) {
                int numSamples = cumulativeSamplesByDepthBin[bin];
                for ( int i = 0; i < numSamples; i ++ ) {
                    locusCoverageCounts[i][bin]++;
                }

                cumulativeSamplesByDepthBin[bin] = 0; // reset counts in advance of next update()
            }
        }
    }

    ////////////////////////////////////////////////////////////////////////////////////
    // ACCESSOR METHODS
    ////////////////////////////////////////////////////////////////////////////////////

    /** @return the live per-sample granular histograms (not a copy) */
    public Map<String,int[]> getHistograms() {
        return granularHistogramBySample;
    }

    /** @return the per-locus count matrix, or null if initializeLocusCounts() was never called */
    public int[][] getLocusCounts() {
        return locusCoverageCounts;
    }

    /** @return the bin left endpoints this object was constructed with */
    public int[] getEndpoints() {
        return binLeftEndpoints;
    }

    /** @return the live map of per-sample means, keyed by sample name plus ALL_SAMPLES */
    public Map<String,Double> getMeans() {
        return meanCoverages;
    }

    /** @return the number of loci folded in so far */
    public int getTotalLoci() {
        return nLoci;
    }

}
|
||||
Loading…
Reference in New Issue