diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/AnalyzeDepthOfCoverage.java b/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/AnalyzeDepthOfCoverage.java deleted file mode 100644 index aadcad894..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/AnalyzeDepthOfCoverage.java +++ /dev/null @@ -1,369 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.firehosesummary; - -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.*; -import java.util.ArrayList; -import java.util.List; - -/** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl - * - * @Author chartl - * @Date Feb 18, 2010 - */ - -class AnalyzeDepthCLP extends CommandLineProgram { - @Argument(fullName = "depthOfCoverageFile", shortName = "df", doc = "The Depth of Coverage output file", required = true) - public File docFile = null; - @Argument(fullName = "summaryFile", shortName = "sf", doc = "The summary file to which to output", required = true) - public File summaryFile = null; - @Argument(fullName = "plotBaseName", shortName = "bn", doc = "The base name for the plot files (e.g. 'foo' yields plots 'foo_DoC_by_sample.pdf'). Please ensure this name contains no spaces.", required = false) - public String plotBaseName = "DepthAnalysis"; - @Argument(fullName = "pathToRScript", doc = "The path to your implementation of Rscript. 
For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) - private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript"; - @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) - private String PATH_TO_RESOURCES = "./"; - - private boolean containsByLocus = false; - private boolean containsByTarget = false; - - /////////////////////////////////////////////////////////////////////////////////// - // CONSTANT VALUES: SUMMARY STRING FOR NO INFORMATION, R-SCRIPT ARGUMENTS, ETC - /////////////////////////////////////////////////////////////////////////////////// - - private final String DEFAULT_SUMMARY_STRING = "No Summary Information"; - private final String PER_LOCUS_R_ARGUMENTS = "PlotInterleavedRows depth_of_coverage\\;proportion_of_bases_above\\;Per_Sample_Depth_of_Coverage\\;"+plotBaseName+"_per_locus"; - private final String PER_TARGET_R_ARGUMENTS = "PlotInterleavedRows depth_of_coverage\\;proportion_of_targets_with_mean_coverage_above\\;Per_Sample_Average_DoC_Over_Targets\\;"+plotBaseName+"_per_target"; - - /////////////////////////////////////////////////////////////////////////////////// - // ANALYSIS START: CALCULATE STATISTICS, WRITE IN R-READABLE FORMAT, MAKE PLOTS - /////////////////////////////////////////////////////////////////////////////////// - - protected int execute() { - List depthStats = calculateDepthStatistics(docFile); - String perLocusSummary = DEFAULT_SUMMARY_STRING; - String perTargetSummary = DEFAULT_SUMMARY_STRING; - - if ( containsByLocus ) { - File baseSummaryTable = writeBaseSummaryFile(depthStats); - perLocusSummary = generatePerLocusSummary(baseSummaryTable,depthStats); - } - - if ( containsByTarget ) { - File targetSummaryTable = writeTargetSumamryFile(depthStats); - perTargetSummary = generatePerTargetSummary(targetSummaryTable, depthStats); - } - - 
writeSummaryInfoFile(summaryFile,perLocusSummary,perTargetSummary); - - return 1; - } - - /////////////////////////////////////////////////////////////////////////////////// - // OPEN AND WRITE FINAL SUMMARY DOC FILE - /////////////////////////////////////////////////////////////////////////////////// - - private void writeSummaryInfoFile(File sFile, String locusSummary, String targetSummary) { - PrintWriter writer; - try { - writer = new PrintWriter(sFile); - writer.printf("%s%n","##Depth of coverage summary file"); - writer.printf("%s%n","##Well_Covered_Samples_By_Base - % of samples with >80% bases covered to 10x"); - writer.printf("%s%n","##Well_Covered_Samples_By_Mean - % of samples with mean coverage > 10x"); - writer.printf("%s%n%n","##Well_Covered_Samples_By_Target - % of samples with >80% targets covered to 10x"); - if ( containsByLocus ) - writer.printf("%s%n",locusSummary); - if ( containsByTarget ) - writer.printf("%s",targetSummary); - writer.close(); - } catch (IOException e) { - throw new StingException("Error writing final depth of coverage summary file",e); - } - } - - /////////////////////////////////////////////////////////////////////////////////// - // CALL R-SCRIPTS AND GENERATE OVERALL SUMMARY FILES - /////////////////////////////////////////////////////////////////////////////////// - - private String generatePerLocusSummary(File rReadablePlotFile, List calcs) { - String rCommand = PATH_TO_RSCRIPT+" "+PATH_TO_RESOURCES+" "+rReadablePlotFile.getAbsolutePath()+" "+PER_LOCUS_R_ARGUMENTS; - try { - Process p = Runtime.getRuntime().exec(rCommand); - } catch ( IOException e ) { - throw new StingException("Error executing r command for per locus plot generation",e); - } - - StringBuilder summary = new StringBuilder(); - summary.append(String.format("%s%n","PER_LOCUS_SUMMARY")); - int numSamples = calcs.size()-2; - int numGoodSamples = 0; - int numGoodSamplesByMeanCvg = 0; - double totalAvgCoverage = -1; - double totalStdevCoverage = -1; - - for ( 
DepthStatisticsCalculator calc : calcs ) { - if ( calc.getName().equalsIgnoreCase("total_coverage")) { - totalAvgCoverage = calc.getMean(); - totalStdevCoverage = Math.sqrt(calc.getVar()); - } else if ( ! calc.getName().equalsIgnoreCase("coverage_without_deletions") ) { - if ( calc.getPercentWellCoveredLoci() > 0.8 ) { - numGoodSamples++; - } - - if ( calc.getMean() > 10 ) { - numGoodSamplesByMeanCvg++; - } - } - } - - summary.append(String.format("%s\t%f%n","Average_Coverage:",totalAvgCoverage)); - summary.append(String.format("%s\t%f%n","Stdev_Coverage:",totalStdevCoverage)); - summary.append(String.format("%s\t%.2f%n","%Well_Covered_Samples_By_Base", ( (double) numGoodSamples*100 )/( (double) numSamples))); - summary.append(String.format("%s\t%.2f%n","%Well_Covered_Samples_By_Mean", ( (double) numGoodSamplesByMeanCvg*100) / ( (double) numSamples ))); - - return summary.toString(); - } - - private String generatePerTargetSummary(File rReadablePlotFile, List calcs) { - String rCommand = PATH_TO_RSCRIPT+" "+PATH_TO_RESOURCES+" "+rReadablePlotFile.getAbsolutePath()+" "+PER_TARGET_R_ARGUMENTS; - try { - Process p = Runtime.getRuntime().exec(rCommand); - } catch ( IOException e ) { - throw new StingException("Error executing r command for per locus plot generation",e); - } - - StringBuilder summary = new StringBuilder(); - summary.append(String.format("%s%n","PER_TARGET_SUMMARY")); - int numSamples = calcs.size()-2; - int numGoodSamples = 0; - - for ( DepthStatisticsCalculator calc : calcs ) { - if ( calc.getName().equalsIgnoreCase("total_coverage")) { - // do nothing - } else if ( ! 
calc.getName().equalsIgnoreCase("coverage_without_deletions") ) { - if ( calc.getPercentWellCoveredTargets() > 0.8 ) { - numGoodSamples++; - } - } - } - - summary.append(String.format("%s\t%.2f%n","%Well_Covered_Samples_By_Target", ( (double) numGoodSamples*100) / ( (double) numSamples ))); - - return summary.toString(); - } - - /////////////////////////////////////////////////////////////////////////////////// - // R-READABLE TEMPORARY FILE CREATION - /////////////////////////////////////////////////////////////////////////////////// - - private File writeBaseSummaryFile(List calcs) { - File perLocusSummaryFile; - - try { - perLocusSummaryFile = File.createTempFile(plotBaseName+"_per_locus_summary",".txt"); - } catch ( IOException e ) { - throw new StingException("Could not create a temporary file. Please check the permissions of the directory you are running in, and that the base name is not a filepath.",e); - } - - PrintWriter locusWriter; - - try { - locusWriter = new PrintWriter(perLocusSummaryFile); - } catch ( IOException e ) { - throw new StingException("Locus summary temporary file was created but could not be opened.",e); - } - - for ( DepthStatisticsCalculator calc : calcs ) { - if ( ! calc.getName().equalsIgnoreCase("total_coverage") && ! calc.getName().equalsIgnoreCase("coverage_without_deletions") ) { - locusWriter.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f",calc.getName(),calc.getLocusProportions()); - locusWriter.printf("%s\t%d\t%d\t%d\t%d\t%d\t%d",calc.getName(),calc.getEvalPoints()); - } - } - - locusWriter.close(); - return perLocusSummaryFile; - } - - private File writeTargetSumamryFile(List calcs) { - File perTargetSummaryFile; - - try { - perTargetSummaryFile = File.createTempFile(plotBaseName+"_per_target_summary",".txt"); - } catch ( IOException e ) { - throw new StingException("Could not create a temporary file. 
Please check the permissions of the directory you are running in, and that the base name is not a filepath.",e); - } - - PrintWriter targetWriter; - - try { - targetWriter = new PrintWriter(perTargetSummaryFile); - } catch ( IOException e ) { - throw new StingException("Target summary temporary file was created but could not be opened.",e); - } - - for ( DepthStatisticsCalculator calc : calcs ) { - if ( ! calc.getName().equalsIgnoreCase("total_coverage") && ! calc.getName().equalsIgnoreCase("coverage_without_deletions") ) { - targetWriter.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f",calc.getName(),calc.getTargetProportions()); - targetWriter.printf("%s\t%d\t%d\t%d\t%d\t%d\t%d",calc.getName(),calc.getEvalPoints()); - } - } - - targetWriter.close(); - return perTargetSummaryFile; - } - - /////////////////////////////////////////////////////////////////////////////////// - // READING THE DEPTH OF COVERAGE FILE INTO CALCULATOR OBJECTS - /////////////////////////////////////////////////////////////////////////////////// - - private List calculateDepthStatistics(File docFile) { - BufferedReader docReader; - - try { - docReader = new BufferedReader( new FileReader(docFile) ); - } catch ( IOException e) { - throw new StingException("The file "+docFile.getAbsolutePath()+" could not be opened...",e); - } - - String locusHeader = getDOCSectionHeader(docReader); // this will read to the first section header - List docCalculators; - if ( locusHeader != null && locusHeader.equalsIgnoreCase("PER_LOCUS_COVERAGE_SECTION")) { - containsByLocus = true; - docCalculators = instantiateDOCCalculators(docReader); - updateLocusInfo(docCalculators,docReader); - String targetHeader = getDOCSectionHeader(docReader); - if ( targetHeader != null && targetHeader.equalsIgnoreCase("PER_TARGET_COVERAGE_SECTION") ) { - containsByTarget = true; - updateTargetInfo(docCalculators,docReader); - } else { - containsByTarget = false; - } - } else if ( locusHeader != null && 
locusHeader.equalsIgnoreCase("PER_TARGET_COVERAGE_SECTION") ) { - containsByTarget = true; - containsByLocus = false; - docCalculators = instantiateDOCCalculators(docReader); - updateTargetInfo(docCalculators,docReader); - } else { - containsByLocus = false; - containsByTarget = false; - docCalculators = null; - } - - return docCalculators; - } - - private List instantiateDOCCalculators(BufferedReader reader) { - String header; - try { - header = reader.readLine(); - } catch (IOException e) { - throw new StingException("Unable to read the section header",e); - } - - List calcs = new ArrayList(); - - int offset = -1; - for ( String entry : header.split("\t") ) { - if ( offset > -1 ) { - calcs.add(new DepthStatisticsCalculator(entry)); - } - offset++; - } - - return calcs; - } - - private void updateLocusInfo(List calcs, BufferedReader reader) { - - String docLocLine; - try { - docLocLine = reader.readLine(); - while ( ! isEndOfSection(docLocLine) ) { - int offset = -1; - for ( String entry : docLocLine.split("\t") ) { - if ( offset > -1 ) { - calcs.get(offset).updateLocus(Integer.parseInt(entry)); - } - offset++; - } - } - } catch ( IOException e) { - throw new StingException("Error reading locus depth of coverage information",e); - } - - } - - private void updateTargetInfo(List calcs, BufferedReader reader) { - - String docLocLine; - try { - docLocLine = reader.readLine(); - while ( ! 
isEndOfSection(docLocLine) ) { - int offset = -1; - int targetSize = 0; - for ( String entry : docLocLine.split("\t") ) { - if ( offset == -1 ) { - targetSize = parseInterval(entry); - } else { - calcs.get(offset).updateTargets(targetSize,Integer.parseInt(entry)); - } - offset++; - } - } - } catch ( IOException e ) { - throw new StingException("Error reading target depth of coverage information",e); - } - - } - - /////////////////////////////////////////////////////////////////////////////////// - // FILE IO METHODS -- DEPEND ON DEPTH OF COVERAGE FILE FORMAT - /////////////////////////////////////////////////////////////////////////////////// - - private boolean isEndOfSection( String line ) { - // sections delimited by empty line - return line.equalsIgnoreCase(""); - } - - private String getDOCSectionHeader(BufferedReader reader) { - String header; - try { - do { - header = reader.readLine(); - } while ( ! isDOCSectionSeparator(header) && header != null); - - } catch (IOException e) { - throw new StingException("Error reading depth of coverage file",e); - } - - return header; - } - - private boolean isDOCSectionSeparator( String line ) { - return line.contains("_COVERAGE_SECTION"); - } - - private int parseInterval(String interval) { - String startstop = interval.split(":")[1]; - int start = Integer.parseInt(startstop.split("-")[0]); - int stop = Integer.parseInt(startstop.split("-")[1]); - return stop - start; - } - -} - -/////////////////////////////////////////////////////////////////////////////////// -// PROGRAM START -- THE MAIN() METHOD AND WRAPPER CLASS -/////////////////////////////////////////////////////////////////////////////////// - -public class AnalyzeDepthOfCoverage { - - public static void main(String[] args) { - AnalyzeDepthCLP depthAnalysis = new AnalyzeDepthCLP(); - CommandLineProgram.start(depthAnalysis,args); - System.exit(0); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/DepthStatisticsCalculator.java 
b/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/DepthStatisticsCalculator.java deleted file mode 100644 index 097af88ce..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/DepthStatisticsCalculator.java +++ /dev/null @@ -1,132 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.firehosesummary; - -import java.util.ArrayList; -import java.util.Arrays; - -/** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl - * - * @Author chartl - * @Date Feb 12, 2010 - */ -public class DepthStatisticsCalculator extends SummaryStatisticsCalculator { - private int numBasesAbove0x; // % of bases with ANY coverage - private int numBasesAbove3x; // % of bases with 4x+ - private int numBasesAbove9x; // % of bases with 10x+ - private int numBasesAbove24x; // % of bases with 25x+ - private int numBasesAbove49x; // % of bases with 50x+ - private int numBasesAbove99x; // % of bases with 100x+ - - private int targetsAbove0x; - private int targetsAbove3x; - private int targetsAbove9x; - private int targetsAbove24x; - private int targetsAbove49x; - private int targetsAbove99x; - private int numTargets; - - public static int[] DEPTH_CUTOFFS = {1,4,10,25,50,100}; - - public DepthStatisticsCalculator(String name) { - super(name); - numBasesAbove0x = 0; - numBasesAbove3x = 0; - numBasesAbove9x = 0; - numBasesAbove24x = 0; - numBasesAbove49x = 0; - numBasesAbove99x = 0; - targetsAbove99x = 0; - targetsAbove49x = 0; - targetsAbove24x = 0; - targetsAbove9x = 0; - targetsAbove3x = 0; - targetsAbove0x = 0; - numTargets = 0; - } - - public void updateLocus(int depth) { - super.update(depth); - - if ( depth > 99 ) { - numBasesAbove99x++; - } - if ( depth > 49 ) { - numBasesAbove49x++; - } - if ( depth > 24 ) { - numBasesAbove24x++; - } - if ( depth > 9 ) { - numBasesAbove9x++; - } - if ( depth > 3 ) { - numBasesAbove3x++; - } - if ( depth > 0 ) { - numBasesAbove0x++; - } - } - - public void updateTargets(int targetLength, int totalCoverage) { - 
double avgCvg = ( (double) totalCoverage ) / ( (double) targetLength); - - if ( avgCvg >= 100 ) { - targetsAbove99x ++; - } - if ( avgCvg >= 50 ) { - targetsAbove49x++; - } - if ( avgCvg >= 25 ) { - targetsAbove24x++; - } - if ( avgCvg >= 10 ) { - targetsAbove9x++; - } - if ( avgCvg >= 4 ) { - targetsAbove3x++; - } - if ( avgCvg >= 1 ) { - targetsAbove0x++; - } - - numTargets++; - } - - public double[] getLocusProportions() { - double[] proportions = new double[6]; - - proportions[0] = ( (double) numBasesAbove0x )/( (double) getSampleSize() ); - proportions[2] = ( (double) numBasesAbove9x )/( (double) getSampleSize() ); - proportions[3] = ( (double) numBasesAbove24x )/( (double) getSampleSize() ); - proportions[4] = ( (double) numBasesAbove49x )/( (double) getSampleSize() ); - proportions[5] = ( (double) numBasesAbove99x )/( (double) getSampleSize() ); - proportions[1] = ( (double) numBasesAbove3x )/( (double) getSampleSize() ); - - return proportions; - } - - public double[] getTargetProportions() { - double[] proportions = new double[6]; - - proportions[0] = ( (double) targetsAbove0x )/( (double) numTargets ); - proportions[2] = ( (double) targetsAbove9x )/( (double) numTargets ); - proportions[3] = ( (double) targetsAbove24x )/( (double) numTargets ); - proportions[4] = ( (double) targetsAbove49x )/( (double) numTargets ); - proportions[5] = ( (double) targetsAbove99x )/( (double) numTargets ); - proportions[1] = ( (double) targetsAbove3x )/( (double) numTargets ); - - return proportions; - } - - public double getPercentWellCoveredLoci() { - return 10*( (double) numBasesAbove9x )/( (double) getSampleSize() ); - } - - public double getPercentWellCoveredTargets() { - return 10*( (double) targetsAbove9x )/( (double) numTargets ); - } - - public int[] getEvalPoints() { - return DepthStatisticsCalculator.DEPTH_CUTOFFS; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/GenerateFirehoseSummary.java 
b/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/GenerateFirehoseSummary.java deleted file mode 100644 index 1ae1d6ff3..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/GenerateFirehoseSummary.java +++ /dev/null @@ -1,257 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.firehosesummary; - -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; - -import java.io.*; -import java.util.*; - -/** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl - * - * @Author chartl - * @Date Feb 11, 2010 - */ - -class FirehoseSummaryCLP extends CommandLineProgram { - @Argument(fullName = "depthOfCoverageFile", shortName = "doc", doc="Path to the depth of coverage file", required=true) - private File depthOfCoverage = null; -// @Argument(fullName = "contaminationFile", shortName = "con", doc="Path to the contamination file", required=true) -// private File contamination = null; -// @Argument(fullName = "errorRateFile", shortName = "err", doc="Path to the error rate file", required=true) -// private File errorRate = null; -// @Argument(fullName = "zipFiles", shortName = "zip", doc="List of paths to zip files which contain summary metrics files", required=false) -// private String zipFiles = null; - - private static String R_SCRIPT = "plotFirehoseDataQCMetrics.R"; - private static String SCRIPT_DOC_FLAG = "DOC"; - - protected int execute() { -// SummaryFileCollection metricsFiles = getFileHandles(); - List depthStats = calculateDepthStatistics(depthOfCoverage); - String docSummary = makeDOCPlots(depthStats); - return 1; - } - - private String makeDOCPlots(List calcs) { - StringBuilder summaryBuilder = new StringBuilder(); - int numSamplesWithGoodBaseCoverage=0; - int numSamplesWithGoodMeanCoverage = 0; - int numSamplesWithGoodTargetCoverage=0; - double aggregateMeanCoverage = -1d; - double 
aggregateStdDevCoverage = -1d; - double aggregateSkewCoverage = -1d; - PrintWriter locusOutput; - PrintWriter targetOutput; - File locusOutputFile; - File targetOutputFile; - try { - locusOutputFile = File.createTempFile("locus_output","txt"); - locusOutput = new PrintWriter(locusOutputFile); - targetOutputFile = File.createTempFile("target_output","txt"); - targetOutput = new PrintWriter(targetOutputFile); - } catch ( IOException e ) { - throw new StingException("Error creating temporary output files for R-plotting. Perhaps user permissions do not allow creation of files.",e); - } - - for ( DepthStatisticsCalculator calc : calcs ) { - - if ( calc.getName().equalsIgnoreCase("total_coverage") || calc.getName().equalsIgnoreCase("coverage_without_deletions") ) { - // update summary statistics - if ( calc.getName().equalsIgnoreCase("total_coverage") ) { - aggregateMeanCoverage = calc.getMean(); - aggregateStdDevCoverage = Math.sqrt(calc.getVar()); - aggregateSkewCoverage = calc.getSkew(); - } - } else { - // update sample-based summary statistics and print output to file - locusOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n",calc.getName(),calc.getLocusProportions()); - targetOutput.printf("%s\t%f\t%f\t%f\t%f\t%f\t%f%n", calc.getName(),calc.getTargetProportions()); - if ( calc.getPercentWellCoveredLoci() > 80 ) { - numSamplesWithGoodBaseCoverage++; - } - if ( calc.getPercentWellCoveredTargets() > 90 ) { - numSamplesWithGoodTargetCoverage++; - } - if ( calc.getMean() > 10 ) { - numSamplesWithGoodMeanCoverage++; - } - } - } - - //invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfBasesCoveredBySample",locusOutputFile.getAbsolutePath()); - //invokeRScript(R_SCRIPT,SCRIPT_DOC_FLAG,"PercentOfTargetsCoveredBySample",targetOutputFile.getAbsolutePath()); - return "temporary"; - } - -// private SummaryFileCollection getFileHandles() { -// if ( zipFiles == null ) { -// return null; -// } -// -// SummaryFileCollection summaryFiles = new SummaryFileCollection(); -// for ( String 
zipFile : zipFiles.split(",") ) { -// summaryFiles.process(zipFile); -// } -// -// return summaryFiles; -// } - - private List calculateDepthStatistics(File docFile) { - BufferedReader docReader; - - try { - docReader = new BufferedReader( new FileReader(docFile) ); - } catch ( IOException e) { - throw new StingException("The file "+docFile.getAbsolutePath()+" could not be opened...",e); - } - - String locusHeader = getDOCSectionHeader(docReader); - List docCalculators = instantiateDOCCalculators(locusHeader); - - updateLocusInfo(docCalculators,docReader); - - String targetHeader = getDOCSectionHeader(docReader); - - updateTargetInfo(docCalculators,docReader); - - return docCalculators; - } - - private void updateLocusInfo(List calcs, BufferedReader reader) { - - String docLocLine; - try { - docLocLine = reader.readLine(); - while ( ! isEndOfSection(docLocLine) ) { - int offset = -1; - for ( String entry : docLocLine.split("\t") ) { - if ( offset > -1 ) { - calcs.get(offset).updateLocus(Integer.parseInt(entry)); - } - offset++; - } - } - } catch ( IOException e) { - throw new StingException("Error reading locus depth of coverage information",e); - } - - } - - private void updateTargetInfo(List calcs, BufferedReader reader) { - - String docLocLine; - try { - docLocLine = reader.readLine(); - while ( ! 
isEndOfSection(docLocLine) ) { - int offset = -1; - int targetSize = 0; - for ( String entry : docLocLine.split("\t") ) { - if ( offset == -1 ) { - targetSize = parseInterval(entry); - } else { - calcs.get(offset).updateTargets(targetSize,Integer.parseInt(entry)); - } - offset++; - } - } - } catch ( IOException e ) { - throw new StingException("Error reading target depth of coverage information",e); - } - - } - - private int parseInterval(String interval) { - String startstop = interval.split(":")[1]; - int start = Integer.parseInt(startstop.split("-")[0]); - int stop = Integer.parseInt(startstop.split("-")[1]); - return stop - start; - } - - private boolean isDOCSectionSeparator( String line ) { - return line.contains("_COVERAGE_SECTION"); - } - - private boolean isEndOfSection( String line ) { - // sections delimited by empty line - return line.equalsIgnoreCase(""); - } - - private String getDOCSectionHeader(BufferedReader reader) { - String header; - try { - do { - header = reader.readLine(); - } while ( ! 
isDOCSectionSeparator(header) ); - - header = reader.readLine(); - - } catch (IOException e) { - throw new StingException("Error reading depth of coverage file",e); - } - - return header; - } - - private List instantiateDOCCalculators(String header) { - List calcs = new ArrayList(); - - int offset = -1; - for ( String entry : header.split("\t") ) { - if ( offset > -1 ) { - calcs.add(new DepthStatisticsCalculator(entry)); - } - offset++; - } - - return calcs; - } - -} - -public class GenerateFirehoseSummary { - - public static void main(String[] args) { - FirehoseSummaryCLP clp = new FirehoseSummaryCLP(); - CommandLineProgram.start(clp, args); - System.exit(0); - } -} - -//class SummaryFileCollection { -// -// // container class for files we'll be summarizing -// -// public Map fingerprintSummaryFiles; -// public Map hybridSelectionMetricsFiles; -// public Map insertSizeDistributionFiles; -// public Map alignmentMetricsFiles; -// -// public SummaryFileCollection() { -// fingerprintSummaryFiles = new HashMap(); -// hybridSelectionMetricsFiles = new HashMap(); -// insertSizeDistributionFiles = new HashMap(); -// alignmentMetricsFiles = new HashMap(); -// } -// -// public void process(String zipFilePath) { -// String sampleName = zipFilePath.split("_sequencing_metrics.zip")[0].split("_")[1]; -// File fingerprintSummaryFile = new File(sampleName+".summary_fingerprint_metrics"); -// File hybridSelectionFile = new File(sampleName+".hybrid_selection_metrics"); -// File insertSizeFile = new File(sampleName+".insert_size_metrics"); -// File alignmentFile = new File(sampleName+".alignment_metrics"); -// -// String command = "unzip "+zipFilePath; -// try { -// Process p = Runtime.getRuntime().exec(command); -// } catch (IOException e) { -// throw new RuntimeException("Could not unzip the file "+zipFilePath); -// } -// -// fingerprintSummaryFiles.put(sampleName,fingerprintSummaryFile); -// hybridSelectionMetricsFiles.put(sampleName,hybridSelectionFile); -// 
insertSizeDistributionFiles.put(sampleName,insertSizeFile); -// alignmentMetricsFiles.put(sampleName,alignmentFile); -// } -//} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/SummaryStatisticsCalculator.java b/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/SummaryStatisticsCalculator.java deleted file mode 100644 index af8d1dcb0..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/firehosesummary/SummaryStatisticsCalculator.java +++ /dev/null @@ -1,69 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.firehosesummary; - -/** - * This object calculates the first three sample moments of a data set on-the-fly - * @Author chartl - * @Date Feb 12, 2010 - */ -public class SummaryStatisticsCalculator { - // todo -- can median / quantiles be estimated on-the-fly? - // todo -- can we divine a metric for modality? - private double mean; // mean of the samples fed to it - private double var; // variance of the samples fed to it - private double skew; // skew of the samples fed to it - private int sampleSize; // number of samples given to calculator - private String name; // name to associate calculator with, if any - - public SummaryStatisticsCalculator() { - mean = 0; - var = 0; - skew = 0; - sampleSize = 0; - name = "SUMMARY"; - } - - public SummaryStatisticsCalculator(String name) { - mean = 0; - var = 0; - skew = 0; - sampleSize = 0; - this.name = name; - } - - public void update(double datum) { - mean = (sampleSize*mean + datum)/(sampleSize + 1); - var = (sampleSize*var + Math.pow( mean - datum , 2 ) )/( sampleSize + 1); - // note: the variance is not re-calculated over all data points each time the mean changes - // this means the convergence will be slower than x^-(1/2); but it will still converge with the mean - // whole exome is ~3.3 million datapoints, which will be plenty for this to converge to within a very - // small proportion of the true sample variance - skew = ( 
skew*Math.pow(var,2/3)*sampleSize+ Math.pow( datum - mean, 3) ) / ( ( sampleSize + 1 )* Math.pow(var,2/3) ); - // again, new mean/variance estimates are not propagated over all previous data points, so convergence is a bit slower - // but that's the price one pays for on-the-fly calculation - sampleSize++; - } - - public void update(int datum) { - update( (double) datum); - } - - public double getMean() { - return mean; - } - - public double getVar() { - return var; - } - - public double getSkew() { - return skew; - } - - public String getName() { - return name; - } - - public int getSampleSize() { - return sampleSize; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatistics.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatistics.java new file mode 100644 index 000000000..87a53084a --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CoverageStatistics.java @@ -0,0 +1,437 @@ +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.By; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.pileup.PileupElement; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.io.PrintWriter; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * A parallelizable walker designed to quickly aggregate relevant 
coverage statistics across samples in the input
 * file. Assesses the mean and median granular coverages of each sample, and generates part of a cumulative
 * distribution of % bases and % targets covered for certain depths. The granularity of DOC can be set by command
 * line arguments.
 *
 * // todo -- allow for user to set linear binning (default is logarithmic)
 * // todo -- add per target (e.g. regional) aggregation
 *
 * @Author chartl
 * @Date Feb 22, 2010
 */
@By(DataSource.REFERENCE)
public class CoverageStatistics extends LocusWalker<Map<String, Integer>, DepthOfCoverageStats> implements TreeReducible<DepthOfCoverageStats> {
    @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
    int start = 1;
    @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
    int stop = 500;
    @Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false)
    int nBins = 20;
    @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false)
    byte minMappingQuality = 50;
    @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
    byte minBaseQuality = 20;
    // NOTE(review): these three arguments document themselves as optional ("if unprovided ..."),
    // so required = false is made explicit; the original annotations omitted it, which would
    // have made the files mandatory.
    @Argument(fullName = "perLocusStatisticsFile", shortName = "locusFile", doc = "File to output per-locus statistics to; if unprovided these will not be calculated", required = false)
    File perLocusStatisticsFile = null;
    @Argument(fullName = "perSampleStatisticsFile", shortName = "sampleFile", doc = "File to output per-sample statistics to; if unprovided will go to standard (-o) output", required = false)
    File perSampleStatisticsFile = null;
    @Argument(fullName = "summaryStatisticsFile", shortName = "summaryFile", doc = "File to output summary (mean, median) statistics to; if unprovided will go to standard (-o) output", required = false)
    File summaryStatisticsFile = null;

    ////////////////////////////////////////////////////////////////////////////////////
    // STANDARD WALKER METHODS
    ////////////////////////////////////////////////////////////////////////////////////

    /**
     * Creates the shared statistics object: one granular histogram per sample named in
     * any input reader, plus the per-locus count table when a locus output file was requested.
     */
    public DepthOfCoverageStats reduceInit() {
        List<Set<String>> samplesByReaders = getToolkit().getSamplesByReaders();
        DepthOfCoverageStats stats = new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start, stop, nBins));

        for ( Set<String> sampleSet : samplesByReaders ) {
            for ( String sample : sampleSet ) {
                stats.addSample(sample);
            }
        }

        if ( perLocusStatisticsFile != null ) {
            stats.initializeLocusCounts();
        }

        return stats;
    }

    /**
     * Computes the depth of coverage at this locus for each sample, counting only pileup
     * elements that pass both the base-quality and mapping-quality thresholds.
     *
     * @return map from sample name to its filtered depth at this locus; samples with no
     *         reads here are simply absent from the map (treated as depth 0 by reduce)
     */
    public Map<String, Integer> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        Map<String, StratifiedAlignmentContext> contextsBySample =
                StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
        HashMap<String, Integer> depthBySample = new HashMap<String, Integer>();

        for ( Map.Entry<String, StratifiedAlignmentContext> sampleEntry : contextsBySample.entrySet() ) {
            AlignmentContext sampleContext = sampleEntry.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
            int properDepth = 0;
            for ( PileupElement e : sampleContext.getBasePileup() ) {
                if ( e.getQual() >= minBaseQuality && e.getMappingQual() >= minMappingQuality ) {
                    properDepth++;
                }
            }

            depthBySample.put(sampleEntry.getKey(), properDepth);
        }

        return depthBySample;
    }

    /** Folds this locus' per-sample depths into the running statistics. */
    public DepthOfCoverageStats reduce(Map<String, Integer> thisMap, DepthOfCoverageStats prevReduce) {
        prevReduce.update(thisMap);
        return prevReduce;
    }

    /** Merges two partial statistics objects (parallel traversal support). */
    public DepthOfCoverageStats treeReduce(DepthOfCoverageStats left, DepthOfCoverageStats right) {
        left.merge(right);
        return left;
    }

    /** Emits the summary, per-sample, and (optionally) per-locus tables. */
    public void onTraversalDone(DepthOfCoverageStats coverageProfiles) {
        printSummary(out, summaryStatisticsFile, coverageProfiles);
        printPerSample(out, perSampleStatisticsFile, coverageProfiles);
        printPerLocus(perLocusStatisticsFile, coverageProfiles);
    }

    ////////////////////////////////////////////////////////////////////////////////////
    // HELPER OUTPUT METHODS
    ////////////////////////////////////////////////////////////////////////////////////

    /**
     * Writes one histogram row per sample: counts of loci whose depth fell into each
     * granular bin [endpoint[i-1], endpoint[i]), with a final open-ended [last, inf) bin.
     */
    private void printPerSample(PrintStream out, File optionalFile, DepthOfCoverageStats stats) {
        PrintStream output = getCorrectStream(out, optionalFile);
        int[] leftEnds = stats.getEndpoints();
        StringBuilder hBuilder = new StringBuilder();
        hBuilder.append("\t");
        // bugfix: the original also printed a leading "[0,leftEnds[0])" column; since
        // leftEnds[0] == 0 that was the degenerate interval [0,0) and made the header one
        // column wider than the histogram rows. The i=1 column [0, leftEnds[1]) covers it.
        for ( int i = 1; i < leftEnds.length; i++ )
            hBuilder.append(String.format("[%d,%d)\t", leftEnds[i - 1], leftEnds[i]));
        hBuilder.append(String.format("[%d,inf)%n", leftEnds[leftEnds.length - 1]));
        output.print(hBuilder.toString());
        Map<String, int[]> histograms = stats.getHistograms();
        for ( String s : histograms.keySet() ) {
            StringBuilder sBuilder = new StringBuilder();
            sBuilder.append(String.format("%s", s));
            for ( int count : histograms.get(s) ) {
                sBuilder.append(String.format("\t%d", count));
            }
            sBuilder.append(String.format("%n"));
            output.print(sBuilder.toString());
        }
    }

    /**
     * Writes the per-locus cumulative table: row "NSamples_r" holds, for each depth bin,
     * the number of loci at which at least r samples reached that depth. Does nothing
     * unless a locus output file was provided.
     */
    private void printPerLocus(File locusFile, DepthOfCoverageStats stats) {
        PrintStream output = getCorrectStream(null, locusFile);
        if ( output == null ) {
            return;
        }

        int[] endpoints = stats.getEndpoints();
        int samples = stats.getHistograms().size();

        int[][] baseCoverageCumDist = stats.getLocusCounts();

        // rows - # of samples
        // columns - depth of coverage

        StringBuilder header = new StringBuilder();
        for ( int d : endpoints ) {
            header.append(String.format("\t%d", d));
        }
        header.append(String.format("%n"));

        output.print(header);

        // bugfix: the original loop incremented row ("row ++"), which never terminates and
        // immediately indexes baseCoverageCumDist[samples] out of bounds. Table row r-1
        // holds the counts for ">= r samples" (see updateLocusCounts), hence [row - 1].
        for ( int row = samples; row > 0; row-- ) {
            output.printf("%s_%d\t", "NSamples", row);
            for ( int depthBin = 0; depthBin < baseCoverageCumDist[0].length; depthBin++ ) {
                output.printf("%d\t", baseCoverageCumDist[row - 1][depthBin]);
            }
            output.printf("%n");
        }
    }

    /**
     * Chooses the output stream: the optional file when given (falling back to the default
     * stream with a warning if it cannot be opened), otherwise the default stream, which
     * may be null (printPerLocus uses that to mean "not requested").
     */
    private PrintStream getCorrectStream(PrintStream out, File optionalFile) {
        PrintStream output;
        if ( optionalFile == null ) {
            output = out;
        } else {
            try {
                output = new PrintStream(optionalFile);
            } catch ( IOException e ) {
                logger.warn("Error opening the output file " + optionalFile.getAbsolutePath() + ". Defaulting to stdout");
                output = out;
            }
        }

        return output;
    }

    /**
     * Writes one line per sample with its mean depth and the left endpoints of the granular
     * bins containing the first, second (median), and third depth quartiles, plus a combined
     * "Total" mean line.
     */
    private void printSummary(PrintStream out, File optionalFile, DepthOfCoverageStats stats) {
        PrintStream output = getCorrectStream(out, optionalFile);

        output.printf("%s\t%s\t%s\t%s\t%s%n", "sample_id", "mean", "granular_third_quartile", "granular_median", "granular_first_quartile");

        Map<String, int[]> histograms = stats.getHistograms();
        Map<String, Double> means = stats.getMeans();
        int[] leftEnds = stats.getEndpoints();

        for ( String s : histograms.keySet() ) {
            int median = getQuantile(histograms.get(s), 0.5);
            int q1 = getQuantile(histograms.get(s), 0.25);
            int q3 = getQuantile(histograms.get(s), 0.75);
            output.printf("%s\t%.2f\t%d\t%d\t%d%n", s, means.get(s), leftEnds[q3], leftEnds[median], leftEnds[q1]);
        }

        output.printf("%s\t%.2f\t%s\t%s\t%s%n", "Total", means.get(DepthOfCoverageStats.ALL_SAMPLES), "N/A", "N/A", "N/A");
    }

    /**
     * Returns the index of the histogram bin containing the given quantile
     * (e.g. 0.5 for the median) of the binned distribution.
     */
    private int getQuantile(int[] histogram, double prop) {
        int total = 0;

        for ( int i = 0; i < histogram.length; i++ ) {
            total += histogram[i];
        }

        // bugfix: an all-zero histogram (no loci seen) used to fall through and return -1,
        // which printSummary then used as an array index. Report the lowest bin instead.
        if ( total == 0 ) {
            return 0;
        }

        int counts = 0;
        int bin = -1;
        while ( counts < prop * total ) {
            counts += histogram[bin + 1];
            bin++;
        }

        return bin;
    }
}

/**
 * Accumulator for depth-of-coverage statistics: per-sample granular depth histograms,
 * running per-sample and combined mean depths, and (optionally) a per-locus table of
 * how many loci had at least N samples covered to at least each binned depth.
 */
class DepthOfCoverageStats {
    public static String ALL_SAMPLES = "ALL_COMBINED_SAMPLES";
    // STANDARD (constantly updated) DATA
    private Map<String, int[]> granularHistogramBySample; // holds the counts per each bin
    private Map<String, Double> meanCoverages;            // holds mean coverage per sample
    private int[] binLeftEndpoints;                       // describes the left endpoint for each bin
    private int[][] locusCoverageCounts;                  // holds counts of number of bases with >=X samples at >=Y coverage
    private boolean tabulateLocusCounts = false;
    private int nLoci;                                    // number of loci seen
    // TEMPORARY DATA (not worth re-instantiating every time)
    private int[] locusHistogram;                         // holds a histogram for each locus; reset after each update() call
    private int totalDepth;                               // holds the total depth of coverage for each locus; reset after each update() call

    ////////////////////////////////////////////////////////////////////////////////////
    // STATIC METHODS
    ////////////////////////////////////////////////////////////////////////////////////

    /**
     * Builds bins+1 strictly increasing left bin endpoints, spaced approximately
     * logarithmically over [lower, upper). Endpoint 0 is always 0 (the "below start" bin).
     *
     * @throws IllegalArgumentException if lower < 1 or bins > upper - lower
     */
    public static int[] calculateBinEndpoints(int lower, int upper, int bins) {
        if ( bins > upper - lower || lower < 1 ) {
            throw new IllegalArgumentException("Illegal argument to calculateBinEndpoints; " +
                    "lower bound must be at least 1, and number of bins may not exceed stop - start");
        }

        int[] binLeftEndpoints = new int[bins + 1];
        binLeftEndpoints[0] = 0; // depth < start

        int length = upper - lower;
        double scale = Math.log10((double) length) / bins;

        // bugfix: the original looped b < bins over an array of length bins+1, leaving the
        // last endpoint at 0, which made the next-to-last histogram bucket unreachable in
        // updateSample (the comparison "depth < 0" is never true).
        for ( int b = 1; b <= bins; b++ ) {
            int leftEnd = (int) Math.floor(Math.pow(10.0, (b - 1.0) * scale));

            // force strict monotonicity when the log spacing rounds to a repeat
            while ( leftEnd <= binLeftEndpoints[b - 1] ) {
                leftEnd++;
            }

            binLeftEndpoints[b] = leftEnd;
        }

        return binLeftEndpoints;
    }

    ////////////////////////////////////////////////////////////////////////////////////
    // INITIALIZATION METHODS
    ////////////////////////////////////////////////////////////////////////////////////

    public DepthOfCoverageStats(int[] leftEndpoints) {
        this.binLeftEndpoints = leftEndpoints;
        granularHistogramBySample = new HashMap<String, int[]>();
        meanCoverages = new HashMap<String, Double>();
        // bugfix: the combined mean must exist before the first update()/merge() call;
        // the original never seeded it, so update() NPE'd unboxing a null Double.
        meanCoverages.put(ALL_SAMPLES, 0.0);
        nLoci = 0;
        totalDepth = 0;
    }

    /** Registers a sample with an empty histogram and zero mean; repeat calls are no-ops. */
    public void addSample(String sample) {
        if ( granularHistogramBySample.containsKey(sample) ) {
            return;
        }

        int[] binCounts = new int[this.binLeftEndpoints.length];
        for ( int b = 0; b < binCounts.length; b++ ) {
            binCounts[b] = 0;
        }

        granularHistogramBySample.put(sample, binCounts);
        meanCoverages.put(sample, 0.0);
    }

    /**
     * Allocates the samples-by-depth-bin locus count table and the scratch histogram,
     * and switches per-locus tabulation on. Call after all samples have been added.
     */
    public void initializeLocusCounts() {
        locusCoverageCounts = new int[granularHistogramBySample.size()][binLeftEndpoints.length];
        locusHistogram = new int[binLeftEndpoints.length];
        for ( int b = 0; b < binLeftEndpoints.length; b++ ) {
            for ( int a = 0; a < granularHistogramBySample.size(); a++ ) {
                locusCoverageCounts[a][b] = 0;
            }
            locusHistogram[b] = 0;
        }

        tabulateLocusCounts = true;
    }

    ////////////////////////////////////////////////////////////////////////////////////
    // UPDATE METHODS
    ////////////////////////////////////////////////////////////////////////////////////

    /**
     * Folds one locus' per-sample depths into the histograms and means; samples absent
     * from the map are counted at depth 0. Also updates the per-locus table when enabled.
     */
    public void update(Map<String, Integer> depthBySample) {
        int b;
        for ( String sample : granularHistogramBySample.keySet() ) {
            if ( depthBySample.containsKey(sample) ) {
                b = updateSample(sample, depthBySample.get(sample));
                totalDepth += depthBySample.get(sample);
            } else {
                b = updateSample(sample, 0);
            }

            if ( tabulateLocusCounts ) {
                // this sample reached bin b, so it also reached every lower bin
                for ( int i = 0; i <= b; i++ ) {
                    locusHistogram[i]++;
                }
            }
        }

        // incremental mean of total depth across all samples
        double meanDepth = meanCoverages.get(DepthOfCoverageStats.ALL_SAMPLES);
        double newMean = (meanDepth * nLoci + (double) totalDepth) / (nLoci + 1);
        meanCoverages.put(DepthOfCoverageStats.ALL_SAMPLES, newMean);
        updateLocusCounts(locusHistogram);

        nLoci++;
        totalDepth = 0;
    }

    /**
     * Updates one sample's running mean and histogram with this locus' depth.
     *
     * @return the index of the histogram bin the depth fell into
     */
    private int updateSample(String sample, int depth) {
        double mean = meanCoverages.get(sample);
        double newMean = (nLoci * mean + (double) depth) / (nLoci + 1.0);
        meanCoverages.put(sample, newMean);

        int[] granularBins = granularHistogramBySample.get(sample);
        for ( int b = 1; b < granularBins.length; b++ ) {
            if ( depth < binLeftEndpoints[b] ) {
                granularBins[b - 1]++;
                return b;
            }
        }

        granularBins[granularBins.length - 1]++; // greater than all left-endpoints
        return granularBins.length - 1;
    }

    /** Merges another partial statistics object into this one (tree-reduce support). */
    public void merge(DepthOfCoverageStats newStats) {
        this.mergeSamples(newStats);
        if ( this.tabulateLocusCounts && newStats.tabulateLocusCounts ) {
            this.mergeLocusCounts(newStats.getLocusCounts());
        }

        // loci-weighted average of the two combined means
        double totalMean = (meanCoverages.get(DepthOfCoverageStats.ALL_SAMPLES) * nLoci +
                newStats.getMeans().get(DepthOfCoverageStats.ALL_SAMPLES) * newStats.getTotalLoci()) /
                (nLoci + newStats.getTotalLoci());

        meanCoverages.put(DepthOfCoverageStats.ALL_SAMPLES, totalMean);
        nLoci += newStats.getTotalLoci();
    }

    /** Adds the other object's per-sample histograms and loci-weighted means into ours. */
    private void mergeSamples(DepthOfCoverageStats otherStats) {
        Map<String, int[]> otherHistogram = otherStats.getHistograms();
        Map<String, Double> otherMeans = otherStats.getMeans();
        for ( String s : granularHistogramBySample.keySet() ) {
            int[] internalCounts = granularHistogramBySample.get(s);
            int[] externalCounts = otherHistogram.get(s);
            for ( int b = 0; b < internalCounts.length; b++ ) {
                internalCounts[b] += externalCounts[b];
            }

            double internalMean = meanCoverages.get(s);
            double externalMean = otherMeans.get(s);
            double newMean = (internalMean * nLoci + externalMean * otherStats.getTotalLoci()) / (nLoci + otherStats.getTotalLoci());

            meanCoverages.put(s, newMean);
        }
    }

    /** Element-wise addition of another per-locus count table into ours. */
    private void mergeLocusCounts(int[][] otherCounts) {
        for ( int a = 0; a < locusCoverageCounts.length; a++ ) {
            for ( int b = 0; b < locusCoverageCounts[0].length; b++ ) {
                locusCoverageCounts[a][b] += otherCounts[a][b];
            }
        }
    }

    /*
     * Update locus counts -- takes an array in which the number of samples
     * with depth ABOVE [i] is held. So if the bin left endpoints were 2, 5, 10
     * then we'd have an array that represented:
     * [# samples with depth 0 - inf], [# samples with depth 2 - inf],
     * [# samples with depth 5 - inf], [# samples with depth 10-inf];
     *
     * this is
     * @argument cumulativeSamplesByDepthBin - see above
     */
    private void updateLocusCounts(int[] cumulativeSamplesByDepthBin) {
        if ( tabulateLocusCounts ) {
            for ( int bin = 0; bin < cumulativeSamplesByDepthBin.length; bin++ ) {
                int numSamples = cumulativeSamplesByDepthBin[bin];
                // row i of the table counts loci with at least i+1 samples at this depth bin
                for ( int i = 0; i < numSamples; i++ ) {
                    locusCoverageCounts[i][bin]++;
                }

                cumulativeSamplesByDepthBin[bin] = 0; // reset counts in advance of next update()
            }
        }
    }

    ////////////////////////////////////////////////////////////////////////////////////
    // ACCESSOR METHODS
    ////////////////////////////////////////////////////////////////////////////////////

    public Map<String, int[]> getHistograms() {
        return granularHistogramBySample;
    }

    public int[][] getLocusCounts() {
        return locusCoverageCounts;
    }

    public int[] getEndpoints() {
        return binLeftEndpoints;
    }

    public Map<String, Double> getMeans() {
        return meanCoverages;
    }

    public int getTotalLoci() {
        return nLoci;
    }

}