From a003148d5044b2b849712b82e0598187461e7153 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 16 Jul 2012 16:11:56 -0400 Subject: [PATCH] Move AnalyzeCovariates over too. --- .../AnalysisDataManager.java | 113 ------ .../analyzecovariates/AnalyzeCovariates.java | 383 ------------------ .../sting/analyzecovariates/package-info.java | 4 - 3 files changed, 500 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java delete mode 100755 public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java delete mode 100644 public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java deleted file mode 100755 index 1230c86be..000000000 --- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.analyzecovariates; - -import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum; -import org.broadinstitute.sting.utils.collections.NestedHashMap; - -import java.util.ArrayList; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Dec 1, 2009 - * - * The difference between this AnalysisDataManager and the RecalDataManager used by the Recalibration walkers is that here the collapsed data tables are indexed - * by only read group and the given covariate, while in the recalibrator the collapsed tables are indexed by read group, reported quality, and the given covariate. - */ - -public class AnalysisDataManager { - - private NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed - private ArrayList dataCollapsedByCovariate; // Tables where everything except read group and given covariate has been collapsed - - AnalysisDataManager() { - } - - AnalysisDataManager( final int numCovariates ) { - dataCollapsedReadGroup = new NestedHashMap(); - dataCollapsedByCovariate = new ArrayList(); - for( int iii = 0; iii < numCovariates - 1; iii++ ) { // readGroup isn't counted here, its table is separate - dataCollapsedByCovariate.add( new NestedHashMap() ); - } - } - - /** - * Add the given mapping to all of the collapsed hash tables - * @param key The list of comparables that is the key for this mapping - * @param fullDatum The RecalDatum which is the data for this mapping - * @param IGNORE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table - */ - public final void addToAllTables( final Object[] key, final RecalDatum fullDatum, final int IGNORE_QSCORES_LESS_THAN ) { - - final int qscore = Integer.parseInt( key[1].toString() ); - RecalDatum collapsedDatum; - final Object[] readGroupCollapsedKey = new Object[1]; - final Object[] covariateCollapsedKey = new Object[2]; - - if( !(qscore < IGNORE_QSCORES_LESS_THAN) ) { - // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed - readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group - collapsedDatum = (RecalDatum)dataCollapsedReadGroup.get( readGroupCollapsedKey ); - if( collapsedDatum == null ) { - dataCollapsedReadGroup.put( new RecalDatum(fullDatum), readGroupCollapsedKey ); - } else { - collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported - } - } - - // Create dataCollapsedByCovariate's, the tables where everything except read group and given covariate has been collapsed - for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { - if( iii == 0 || !(qscore < IGNORE_QSCORES_LESS_THAN) ) { // use all data for the plot versus reported quality, but not for the other plots versus cycle and etc. - covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ... - Object theCovariateElement = key[iii + 1]; // and the given covariate - if( theCovariateElement != null ) { - covariateCollapsedKey[1] = theCovariateElement; - collapsedDatum = (RecalDatum)dataCollapsedByCovariate.get(iii).get( covariateCollapsedKey ); - if( collapsedDatum == null ) { - dataCollapsedByCovariate.get(iii).put( new RecalDatum(fullDatum), covariateCollapsedKey ); - } else { - collapsedDatum.combine( fullDatum ); - } - } - } - } - } - - /** - * Get the appropriate collapsed table out of the set of all the tables held by this Object - * @param covariate Which covariate indexes the desired collapsed HashMap - * @return The desired collapsed HashMap - */ - public final NestedHashMap getCollapsedTable( final int covariate ) { - if( covariate == 0) { - return dataCollapsedReadGroup; // Table where everything except read group has been collapsed - } else { - return dataCollapsedByCovariate.get( covariate - 1 ); // Table where everything except read group, quality score, and given covariate has been collapsed - } - } - -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java deleted file mode 100755 index 59a3d8cdb..000000000 --- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java +++ /dev/null @@ -1,383 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.analyzecovariates; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate; -import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum; -import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.R.RScriptExecutor; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.io.Resource; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.*; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Map; -import java.util.regex.Pattern; - -/** - * Call R scripts to plot residual error versus the various covariates. - * - *

- * After counting covariates in either the initial BAM File or again in the recalibrated BAM File, an analysis tool is available which - * reads the .csv file and outputs several PDF (and .dat) files for each read group in the given BAM. These PDF files graphically - * show the various metrics and characteristics of the reported quality scores (often in relation to the empirical qualities). - * In order to show that any biases in the reported quality scores have been generally fixed through recalibration one should run - * CountCovariates again on a bam file produced by TableRecalibration. In this way users can compare the analysis plots generated - * by pre-recalibration and post-recalibration .csv files. Our usual chain of commands that we use to generate plots of residual - * error is: CountCovariates, TableRecalibrate, samtools index on the recalibrated bam file, CountCovariates again on the recalibrated - * bam file, and then AnalyzeCovariates on both the before and after recal_data.csv files to see the improvement in recalibration. - * - *

- * The color coding along with the RMSE is included in the plots to give some indication of the number of observations that went into - * each of the quality score estimates. It is defined as follows for N, the number of observations: - * - *

    - *
  • light blue means N < 1,000
  • - *
  • cornflower blue means 1,000 <= N < 10,000
  • - *
  • dark blue means N >= 10,000
  • - *
  • The pink dots indicate points whose quality scores are special codes used by the aligner and which are mathematically - * meaningless and so aren't included in any of the numerical calculations.
  • - *
- * - *

- * NOTE: Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). - * See http://www.r-project.org for more info on how to download and install R. - * - *

- * See the GATK wiki for a tutorial and example recalibration accuracy plots. - * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration - * - *

Input

- *

- * The recalibration table file in CSV format that was generated by the CountCovariates walker. - *

- * - *

Examples

- *
- * java -Xmx4g -jar AnalyzeCovariates.jar \
- *   -recalFile /path/to/recal.table.csv  \
- *   -outputDir /path/to/output_dir/  \
- *   -ignoreQ 5
- * 
- * - */ - -@DocumentedGATKFeature( - groupName = "AnalyzeCovariates", - summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator") -public class AnalyzeCovariates extends CommandLineProgram { - final private static Logger logger = Logger.getLogger(AnalyzeCovariates.class); - - private static final String PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE = "plot_residualError_QualityScoreCovariate.R"; - private static final String PLOT_RESDIUAL_ERROR_OTHER_COVARIATE = "plot_residualError_OtherCovariate.R"; - private static final String PLOT_INDEL_QUALITY_RSCRIPT = "plot_indelQuality.R"; - - ///////////////////////////// - // Command Line Arguments - ///////////////////////////// - /** - * After the header, data records occur one per line until the end of the file. The first several items on a line are the - * values of the individual covariates and will change depending on which covariates were specified at runtime. The last - * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, - * and the raw empirical quality score calculated by phred-scaling the mismatch rate. - */ - @Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false) - private String RECAL_FILE = "output.recal_data.csv"; - @Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false) - private File OUTPUT_DIR = new File("analyzeCovariates"); - @Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false) - private int IGNORE_QSCORES_LESS_THAN = 5; - @Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false) - private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups - - /** - * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation - * by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later. - */ - @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 50") - private int MAX_QUALITY_SCORE = 50; - - /** - * This argument is useful for comparing before/after plots and you want the axes to match each other. - */ - @Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots") - private int MAX_HISTOGRAM_VALUE = 0; - - @Hidden - @Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting") - private boolean DO_INDEL_QUALITY = false; - - ///////////////////////////// - // Private Member Variables - ///////////////////////////// - private AnalysisDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps - private ArrayList requestedCovariates; // List of covariates to be used in this calculation - private final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - private final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); - private final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); - protected static final String EOF_MARKER = "EOF"; - - protected int execute() { - - // create the output directory where all the data tables and plots will go - if (!OUTPUT_DIR.exists() && !OUTPUT_DIR.mkdirs()) - throw new UserException.BadArgumentValue("--output_dir/-outDir", "Unable to create output directory: " + OUTPUT_DIR); - - if (!RScriptExecutor.RSCRIPT_EXISTS) - Utils.warnUser(logger, "Rscript not found in environment path. Plots will not be generated."); - - // initialize all the data from the csv file and allocate the list of covariates - logger.info("Reading in input csv file..."); - initializeData(); - logger.info("...Done!"); - - // output data tables for Rscript to read in - logger.info("Writing out intermediate tables for R..."); - writeDataTables(); - logger.info("...Done!"); - - // perform the analysis using Rscript and output the plots - logger.info("Calling analysis R scripts and writing out figures..."); - callRScripts(); - logger.info("...Done!"); - - return 0; - } - - private void initializeData() { - - // Get a list of all available covariates - Collection> classes = new PluginManager(Covariate.class).getPlugins(); - - int lineNumber = 0; - boolean foundAllCovariates = false; - - // Read in the covariates that were used from the input file - requestedCovariates = new ArrayList(); - - try { - for ( final String line : new XReadLines(new File( RECAL_FILE )) ) { - lineNumber++; - if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() || line.equals(EOF_MARKER) ) { - ; // Skip over the comment lines, (which start with '#') - } - else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data - if( foundAllCovariates ) { - throw new RuntimeException( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); - } else { // Found the covariate list in input file, loop through all of them and instantiate them - String[] vals = line.split(","); - for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical - boolean foundClass = false; - for( Class covClass : classes ) { - if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { - foundClass = true; - try { - Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - } - } - - if( !foundClass ) { - throw new RuntimeException( "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); - } - } - - } - - } else { // Found a line of data - if( !foundAllCovariates ) { - - foundAllCovariates = true; - - // At this point all the covariates should have been found and initialized - if( requestedCovariates.size() < 2 ) { - throw new RuntimeException( "Malformed input recalibration file. Covariate names can't be found in file: " + RECAL_FILE ); - } - - // Initialize any covariate member variables using the shared argument collection - for( Covariate cov : requestedCovariates ) { - cov.initialize( new RecalibrationArgumentCollection() ); - } - - // Initialize the data hashMaps - dataManager = new AnalysisDataManager( requestedCovariates.size() ); - - } - addCSVData(line); // Parse the line and add the data to the HashMap - } - } - - } catch ( FileNotFoundException e ) { - throw new RuntimeException("Can not find input file: " + RECAL_FILE); - } catch ( NumberFormatException e ) { - throw new RuntimeException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); - } - } - - private void addCSVData(String line) { - String[] vals = line.split(","); - - // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly - if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical - throw new RuntimeException("Malformed input recalibration file. Found data line with too many fields: " + line + - " --Perhaps the read group string contains a comma and isn't being parsed correctly."); - } - - Object[] key = new Object[requestedCovariates.size()]; - Covariate cov; - int iii; - for( iii = 0; iii < requestedCovariates.size(); iii++ ) { - cov = requestedCovariates.get( iii ); - key[iii] = cov.getValue( vals[iii] ); - } - // Create a new datum using the number of observations, number of mismatches, and reported quality score - final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); - // Add that datum to all the collapsed tables which will be used in the sequential calculation - dataManager.addToAllTables( key, datum, IGNORE_QSCORES_LESS_THAN ); - } - - private void writeDataTables() { - - int numReadGroups = 0; - - // for each read group - for( final Object readGroupKey : dataManager.getCollapsedTable(0).data.keySet() ) { - - if( NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS ) { - final String readGroup = readGroupKey.toString(); - final RecalDatum readGroupDatum = (RecalDatum) dataManager.getCollapsedTable(0).data.get(readGroupKey); - logger.info(String.format( - "Writing out data tables for read group: %s\twith %s observations\tand aggregate residual error = %.3f", - readGroup, readGroupDatum.getNumObservations(), - readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE) - readGroupDatum.getEstimatedQReported())); - - // for each covariate - for( int iii = 1; iii < requestedCovariates.size(); iii++ ) { - Covariate cov = requestedCovariates.get(iii); - - // Create a PrintStream - File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat"); - PrintStream output; - try { - output = new PrintStream(FileUtils.openOutputStream(outputFile)); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(outputFile, e); - } - - try { - // Output the header - output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases"); - - for( final Object covariateKey : ((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).keySet() ) { - output.print( covariateKey.toString() + "\t" ); // Covariate - final RecalDatum thisDatum = (RecalDatum)((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).get(covariateKey); - output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported - output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)) + "\t" ); // Qempirical - output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches - output.println( thisDatum.getNumObservations() ); // nBases - } - } finally { - // Close the PrintStream - IOUtils.closeQuietly(output); - } - } - } else { - break; - } - - } - } - - private void callRScripts() { - int numReadGroups = 0; - - // for each read group - for( Object readGroupKey : dataManager.getCollapsedTable(0).data.keySet() ) { - if(++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS || NUM_READ_GROUPS_TO_PROCESS == -1) { - - String readGroup = readGroupKey.toString(); - logger.info("Analyzing read group: " + readGroup); - - // for each covariate - for( int iii = 1; iii < requestedCovariates.size(); iii++ ) { - final Covariate cov = requestedCovariates.get(iii); - final File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat"); - if (DO_INDEL_QUALITY) { - RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(new Resource(PLOT_INDEL_QUALITY_RSCRIPT, AnalyzeCovariates.class)); - // The second argument is the name of the covariate in order to make the plots look nice - executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]); - executor.exec(); - } else { - if( iii == 1 ) { - // Analyze reported quality - RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE, AnalyzeCovariates.class)); - // The second argument is the Q scores that should be turned pink in the plot because they were ignored - executor.addArgs(outputFile, IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE); - executor.exec(); - } else { // Analyze all other covariates - RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_OTHER_COVARIATE, AnalyzeCovariates.class)); - // The second argument is the name of the covariate in order to make the plots look nice - executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]); - executor.exec(); - } - } - } - } else { // at the maximum number of read groups so break out - break; - } - } - } - - public static void main(String args[]) { - try { - AnalyzeCovariates clp = new AnalyzeCovariates(); - start(clp, args); - System.exit(CommandLineProgram.result); - } catch (Exception e) { - exitSystemWithError(e); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java deleted file mode 100644 index 9350e4a66..000000000 --- a/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java +++ /dev/null @@ -1,4 +0,0 @@ -/** - * Package to plot residual accuracy versus error covariates for the base quality score recalibrator. - */ -package org.broadinstitute.sting.analyzecovariates; \ No newline at end of file