2010-01-16 05:52:34 +08:00
/ *
2010-04-20 07:00:08 +08:00
* Copyright ( c ) 2010 The Broad Institute
2010-04-20 23:26:32 +08:00
*
2010-01-16 05:52:34 +08:00
* Permission is hereby granted , free of charge , to any person
* obtaining a copy of this software and associated documentation
2010-04-20 23:26:32 +08:00
* files ( the "Software" ) , to deal in the Software without
2010-01-16 05:52:34 +08:00
* restriction , including without limitation the rights to use ,
* copy , modify , merge , publish , distribute , sublicense , and / or sell
* copies of the Software , and to permit persons to whom the
* Software is furnished to do so , subject to the following
* conditions :
2010-04-20 23:26:32 +08:00
*
2010-01-16 05:52:34 +08:00
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software .
*
2010-04-20 23:26:32 +08:00
* THE SOFTWARE IS PROVIDED "AS IS" , WITHOUT WARRANTY OF ANY KIND ,
2010-01-16 05:52:34 +08:00
* EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY ,
* WHETHER IN AN ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING
2010-04-20 07:00:08 +08:00
* FROM , OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE .
2010-01-16 05:52:34 +08:00
* /
2010-04-20 07:00:08 +08:00
package org.broadinstitute.sting.analyzecovariates ;
2011-07-18 08:29:58 +08:00
import org.broadinstitute.sting.commandline.Argument ;
import org.broadinstitute.sting.commandline.CommandLineProgram ;
2010-08-10 00:42:48 +08:00
import org.broadinstitute.sting.commandline.Input ;
2011-07-18 08:29:58 +08:00
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate ;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum ;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection ;
2011-08-28 00:54:13 +08:00
import org.broadinstitute.sting.utils.R.RScriptExecutor ;
2010-11-13 04:14:28 +08:00
import org.broadinstitute.sting.utils.classloader.PluginManager ;
2010-09-10 23:25:30 +08:00
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException ;
2011-08-18 22:57:48 +08:00
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature ;
2010-04-20 07:00:08 +08:00
import org.broadinstitute.sting.utils.text.XReadLines ;
2011-07-18 08:29:58 +08:00
import java.io.* ;
2010-04-20 07:00:08 +08:00
import java.util.ArrayList ;
2011-08-28 00:54:13 +08:00
import java.util.Arrays ;
2010-11-13 04:14:28 +08:00
import java.util.Collection ;
2010-04-20 07:00:08 +08:00
import java.util.Map ;
import java.util.regex.Pattern ;
2009-12-03 00:47:35 +08:00
/ * *
2011-08-18 22:57:48 +08:00
* Call R scripts to plot residual error versus the various covariates .
2009-12-12 05:57:18 +08:00
*
2011-08-18 22:57:48 +08:00
* < p >
* After counting covariates in either the initial BAM File or again in the recalibrated BAM File , an analysis tool is available which
* reads the . csv file and outputs several PDF ( and . dat ) files for each read group in the given BAM . These PDF files graphically
* show the various metrics and characteristics of the reported quality scores ( often in relation to the empirical qualities ) .
* In order to show that any biases in the reported quality scores have been generally fixed through recalibration one should run
* CountCovariates again on a bam file produced by TableRecalibration . In this way users can compare the analysis plots generated
* by pre - recalibration and post - recalibration . csv files . Our usual chain of commands that we use to generate plots of residual
* error is : CountCovariates , TableRecalibrate , samtools index on the recalibrated bam file , CountCovariates again on the recalibrated
* bam file , and then AnalyzeCovariates on both the before and after recal_data . csv files to see the improvement in recalibration .
*
* < p >
* The color coding along with the RMSE is included in the plots to give some indication of the number of observations that went into
* each of the quality score estimates . It is defined as follows for N , the number of observations :
*
* < ul >
* < li > light blue means N < 1 , 000 < / li >
* < li > cornflower blue means 1 , 000 < = N < 10 , 000 < / li >
* < li > dark blue means N > = 10 , 000 < / li >
* < li > The pink dots indicate points whose quality scores are special codes used by the aligner and which are mathematically
* meaningless and so aren ' t included in any of the numerical calculations . < / li >
* < / ul >
*
* < p >
* NOTE : For those running this tool externally from the Broad , it is crucial to note that both the - Rscript and - resources options
* must be changed from the default . - Rscript needs to point to your installation of Rscript ( this is the scripting version of R ,
* not the interactive version ) while - resources needs to point to the folder holding the R scripts that are used . For those using
* this tool as part of the Binary Distribution the - resources should point to the resources folder that is part of the tarball .
* For those using this tool by building from the git repository the - resources should point to the R / subdirectory of the Sting checkout .
*
* < p >
* See the GATK wiki for a tutorial and example recalibration accuracy plots .
* http : //www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
*
* < h2 > Input < / h2 >
* < p >
* The recalibration table file in CSV format that was generated by the CountCovariates walker .
* < / p >
*
* < h2 > Examples < / h2 >
* < pre >
* java - Xmx4g - jar AnalyzeCovariates . jar \
* - recalFile / path / to / recal . table . csv \
* - outputDir / path / to / output_dir / \
* - resources resources / \
* - ignoreQ 5
* < / pre >
2011-08-28 00:54:13 +08:00
*
2009-12-03 00:47:35 +08:00
* /
2011-08-18 22:57:48 +08:00
@DocumentedGATKFeature (
2011-08-28 00:54:13 +08:00
groupName = "AnalyzeCovariates" ,
summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator" )
2011-03-24 22:03:51 +08:00
public class AnalyzeCovariates extends CommandLineProgram {
2009-12-03 00:47:35 +08:00
/////////////////////////////
// Command Line Arguments
/////////////////////////////
2011-08-18 22:57:48 +08:00
/ * *
* After the header , data records occur one per line until the end of the file . The first several items on a line are the
* values of the individual covariates and will change depending on which covariates were specified at runtime . The last
* three items are the data - that is , number of observations for this combination of covariates , number of reference mismatches ,
* and the raw empirical quality score calculated by phred - scaling the mismatch rate .
* /
2010-08-10 00:42:48 +08:00
@Input ( fullName = "recal_file" , shortName = "recalFile" , doc = "The input recal csv file to analyze" , required = false )
2009-12-12 05:57:18 +08:00
private String RECAL_FILE = "output.recal_data.csv" ;
@Argument ( fullName = "output_dir" , shortName = "outputDir" , doc = "The directory in which to output all the plots and intermediate data files" , required = false )
private String OUTPUT_DIR = "analyzeCovariates/" ;
2010-06-19 01:37:11 +08:00
@Argument ( fullName = "path_to_Rscript" , shortName = "Rscript" , doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript" , required = false )
private String PATH_TO_RSCRIPT = "Rscript" ;
2009-12-12 05:57:18 +08:00
@Argument ( fullName = "path_to_resources" , shortName = "resources" , doc = "Path to resources folder holding the Sting R scripts." , required = false )
2011-07-06 22:01:14 +08:00
private String PATH_TO_RESOURCES = "public/R/" ;
2009-12-12 05:57:18 +08:00
@Argument ( fullName = "ignoreQ" , shortName = "ignoreQ" , doc = "Ignore bases with reported quality less than this number." , required = false )
private int IGNORE_QSCORES_LESS_THAN = 5 ;
2011-08-28 00:54:13 +08:00
@Argument ( fullName = "numRG" , shortName = "numRG" , doc = "Only process N read groups. Default value: -1 (process all read groups)" , required = false )
2009-12-12 05:57:18 +08:00
private int NUM_READ_GROUPS_TO_PROCESS = - 1 ; // -1 means process all read groups
2011-08-18 22:57:48 +08:00
/ * *
* Combinations of covariates in which there are zero mismatches technically have infinite quality . We get around this situation
* by capping at the specified value . We ' ve found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later .
* /
2010-11-25 05:34:14 +08:00
@Argument ( fullName = "max_quality_score" , shortName = "maxQ" , required = false , doc = "The integer value at which to cap the quality scores, default is 50" )
2010-11-25 05:32:26 +08:00
private int MAX_QUALITY_SCORE = 50 ;
2011-08-18 22:57:48 +08:00
/ * *
* This argument is useful for comparing before / after plots and you want the axes to match each other .
* /
2010-02-01 07:13:57 +08:00
@Argument ( fullName = "max_histogram_value" , shortName = "maxHist" , required = false , doc = "If supplied, this value will be the max value of the histogram plots" )
private int MAX_HISTOGRAM_VALUE = 0 ;
2011-08-18 22:57:48 +08:00
@Argument ( fullName = "do_indel_quality" , shortName = "indels" , required = false , doc = "If supplied, do indel quality plotting" )
2011-01-22 22:09:23 +08:00
private boolean DO_INDEL_QUALITY = false ;
2010-01-07 21:50:30 +08:00
2009-12-03 00:47:35 +08:00
/////////////////////////////
// Private Member Variables
/////////////////////////////
2009-12-12 05:57:18 +08:00
private AnalysisDataManager dataManager ; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private ArrayList < Covariate > requestedCovariates ; // List of covariates to be used in this calculation
private final Pattern COMMENT_PATTERN = Pattern . compile ( "^#.*" ) ;
private final Pattern OLD_RECALIBRATOR_HEADER = Pattern . compile ( "^rg,.*" ) ;
private final Pattern COVARIATE_PATTERN = Pattern . compile ( "^ReadGroup,QualityScore,.*" ) ;
2010-07-14 00:09:48 +08:00
protected static final String EOF_MARKER = "EOF" ;
2009-12-03 00:47:35 +08:00
2009-12-12 05:57:18 +08:00
protected int execute ( ) {
2009-12-03 00:47:35 +08:00
// create the output directory where all the data tables and plots will go
try {
Process p = Runtime . getRuntime ( ) . exec ( "mkdir " + OUTPUT_DIR ) ;
} catch ( IOException e ) {
2010-03-17 03:45:02 +08:00
System . out . println ( "Couldn't create directory: " + OUTPUT_DIR ) ;
System . out . println ( "User is responsible for making sure the output directory exists." ) ;
2009-12-03 00:47:35 +08:00
}
if ( ! OUTPUT_DIR . endsWith ( "/" ) ) { OUTPUT_DIR = OUTPUT_DIR + "/" ; }
if ( ! PATH_TO_RESOURCES . endsWith ( "/" ) ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/" ; }
// initialize all the data from the csv file and allocate the list of covariates
System . out . println ( "Reading in input csv file..." ) ;
initializeData ( ) ;
System . out . println ( "...Done!" ) ;
// output data tables for Rscript to read in
System . out . println ( "Writing out intermediate tables for R..." ) ;
writeDataTables ( ) ;
System . out . println ( "...Done!" ) ;
// perform the analysis using Rscript and output the plots
System . out . println ( "Calling analysis R scripts and writing out figures..." ) ;
callRScripts ( ) ;
System . out . println ( "...Done!" ) ;
2011-04-01 04:50:38 +08:00
return 0 ;
2009-12-03 00:47:35 +08:00
}
2009-12-12 05:57:18 +08:00
private void initializeData ( ) {
2009-12-03 00:47:35 +08:00
// Get a list of all available covariates
2010-11-13 04:14:28 +08:00
Collection < Class < ? extends Covariate > > classes = new PluginManager < Covariate > ( Covariate . class ) . getPlugins ( ) ;
2009-12-03 00:47:35 +08:00
int lineNumber = 0 ;
boolean foundAllCovariates = false ;
// Read in the covariates that were used from the input file
requestedCovariates = new ArrayList < Covariate > ( ) ;
try {
2010-04-20 07:00:08 +08:00
for ( String line : new XReadLines ( new File ( RECAL_FILE ) ) ) {
2009-12-03 00:47:35 +08:00
lineNumber + + ;
2010-07-14 00:09:48 +08:00
if ( COMMENT_PATTERN . matcher ( line ) . matches ( ) | | OLD_RECALIBRATOR_HEADER . matcher ( line ) . matches ( ) | | line . equals ( EOF_MARKER ) ) {
2009-12-03 00:47:35 +08:00
; // Skip over the comment lines, (which start with '#')
}
else if ( COVARIATE_PATTERN . matcher ( line ) . matches ( ) ) { // The line string is either specifying a covariate or is giving csv data
if ( foundAllCovariates ) {
throw new RuntimeException ( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ) ;
} else { // Found the covariate list in input file, loop through all of them and instantiate them
String [ ] vals = line . split ( "," ) ;
for ( int iii = 0 ; iii < vals . length - 3 ; iii + + ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
boolean foundClass = false ;
for ( Class < ? > covClass : classes ) {
if ( ( vals [ iii ] + "Covariate" ) . equalsIgnoreCase ( covClass . getSimpleName ( ) ) ) {
foundClass = true ;
try {
Covariate covariate = ( Covariate ) covClass . newInstance ( ) ;
requestedCovariates . add ( covariate ) ;
2010-09-10 23:25:30 +08:00
} catch ( Exception e ) {
throw new DynamicClassResolutionException ( covClass , e ) ;
2009-12-03 00:47:35 +08:00
}
}
}
if ( ! foundClass ) {
throw new RuntimeException ( "Malformed input recalibration file. The requested covariate type (" + ( vals [ iii ] + "Covariate" ) + ") isn't a valid covariate option." ) ;
}
}
}
} else { // Found a line of data
if ( ! foundAllCovariates ) {
foundAllCovariates = true ;
// At this point all the covariates should have been found and initialized
if ( requestedCovariates . size ( ) < 2 ) {
throw new RuntimeException ( "Malformed input recalibration file. Covariate names can't be found in file: " + RECAL_FILE ) ;
}
// Initialize any covariate member variables using the shared argument collection
for ( Covariate cov : requestedCovariates ) {
cov . initialize ( new RecalibrationArgumentCollection ( ) ) ;
}
// Initialize the data hashMaps
dataManager = new AnalysisDataManager ( requestedCovariates . size ( ) ) ;
}
addCSVData ( line ) ; // Parse the line and add the data to the HashMap
}
}
} catch ( FileNotFoundException e ) {
throw new RuntimeException ( "Can not find input file: " + RECAL_FILE ) ;
} catch ( NumberFormatException e ) {
throw new RuntimeException ( "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker." ) ;
}
}
2009-12-12 05:57:18 +08:00
private void addCSVData ( String line ) {
2009-12-03 00:47:35 +08:00
String [ ] vals = line . split ( "," ) ;
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
if ( vals . length ! = requestedCovariates . size ( ) + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
throw new RuntimeException ( "Malformed input recalibration file. Found data line with too many fields: " + line +
" --Perhaps the read group string contains a comma and isn't being parsed correctly." ) ;
}
2009-12-31 02:10:10 +08:00
Object [ ] key = new Object [ requestedCovariates . size ( ) ] ;
2009-12-03 00:47:35 +08:00
Covariate cov ;
int iii ;
for ( iii = 0 ; iii < requestedCovariates . size ( ) ; iii + + ) {
cov = requestedCovariates . get ( iii ) ;
2009-12-31 02:10:10 +08:00
key [ iii ] = cov . getValue ( vals [ iii ] ) ;
2009-12-03 00:47:35 +08:00
}
// Create a new datum using the number of observations, number of mismatches, and reported quality score
2009-12-30 05:01:32 +08:00
RecalDatum datum = new RecalDatum ( Long . parseLong ( vals [ iii ] ) , Long . parseLong ( vals [ iii + 1 ] ) , Double . parseDouble ( vals [ 1 ] ) , 0.0 ) ;
2009-12-03 00:47:35 +08:00
// Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager . addToAllTables ( key , datum , IGNORE_QSCORES_LESS_THAN ) ;
}
2009-12-12 05:57:18 +08:00
private void writeDataTables ( ) {
2009-12-03 00:47:35 +08:00
int numReadGroups = 0 ;
// for each read group
2009-12-31 02:10:10 +08:00
for ( Object readGroupKey : dataManager . getCollapsedTable ( 0 ) . data . keySet ( ) ) {
2009-12-03 00:47:35 +08:00
if ( NUM_READ_GROUPS_TO_PROCESS = = - 1 | | + + numReadGroups < = NUM_READ_GROUPS_TO_PROCESS ) {
2009-12-31 02:10:10 +08:00
String readGroup = readGroupKey . toString ( ) ;
RecalDatum readGroupDatum = ( RecalDatum ) dataManager . getCollapsedTable ( 0 ) . data . get ( readGroupKey ) ;
2009-12-03 00:47:35 +08:00
System . out . print ( "Writing out data tables for read group: " + readGroup + "\twith " + readGroupDatum . getNumObservations ( ) + " observations" ) ;
2010-01-07 21:50:30 +08:00
System . out . println ( "\tand aggregate residual error = " + String . format ( "%.3f" , readGroupDatum . empiricalQualDouble ( 0 , MAX_QUALITY_SCORE ) - readGroupDatum . getEstimatedQReported ( ) ) ) ;
2009-12-03 00:47:35 +08:00
// for each covariate
for ( int iii = 1 ; iii < requestedCovariates . size ( ) ; iii + + ) {
Covariate cov = requestedCovariates . get ( iii ) ;
// Create a PrintStream
PrintStream output = null ;
try {
output = new PrintStream ( new FileOutputStream ( OUTPUT_DIR + readGroup + "." + cov . getClass ( ) . getSimpleName ( ) + ".dat" ) ) ;
} catch ( FileNotFoundException e ) {
System . err . println ( "Can't create file: " + OUTPUT_DIR + readGroup + "." + cov . getClass ( ) . getSimpleName ( ) + ".dat" ) ;
System . exit ( - 1 ) ;
}
// Output the header
output . println ( "Covariate\tQreported\tQempirical\tnMismatches\tnBases" ) ;
2009-12-31 02:10:10 +08:00
for ( Object covariateKey : ( ( Map ) dataManager . getCollapsedTable ( iii ) . data . get ( readGroupKey ) ) . keySet ( ) ) {
2010-01-07 21:50:30 +08:00
output . print ( covariateKey . toString ( ) + "\t" ) ; // Covariate
2009-12-31 02:10:10 +08:00
RecalDatum thisDatum = ( RecalDatum ) ( ( Map ) dataManager . getCollapsedTable ( iii ) . data . get ( readGroupKey ) ) . get ( covariateKey ) ;
2010-01-07 21:50:30 +08:00
output . print ( String . format ( "%.3f" , thisDatum . getEstimatedQReported ( ) ) + "\t" ) ; // Qreported
output . print ( String . format ( "%.3f" , thisDatum . empiricalQualDouble ( 0 , MAX_QUALITY_SCORE ) ) + "\t" ) ; // Qempirical
output . print ( thisDatum . getNumMismatches ( ) + "\t" ) ; // nMismatches
output . println ( thisDatum . getNumObservations ( ) ) ; // nBases
2009-12-03 00:47:35 +08:00
}
// Close the PrintStream
output . close ( ) ;
}
} else {
break ;
}
}
}
2009-12-12 05:57:18 +08:00
private void callRScripts ( ) {
2011-08-28 00:54:13 +08:00
RScriptExecutor . RScriptArgumentCollection argumentCollection =
new RScriptExecutor . RScriptArgumentCollection ( PATH_TO_RSCRIPT , Arrays . asList ( PATH_TO_RESOURCES ) ) ;
RScriptExecutor executor = new RScriptExecutor ( argumentCollection , true ) ;
2009-12-03 00:47:35 +08:00
int numReadGroups = 0 ;
2011-08-28 00:54:13 +08:00
2009-12-03 00:47:35 +08:00
// for each read group
2009-12-31 02:10:10 +08:00
for ( Object readGroupKey : dataManager . getCollapsedTable ( 0 ) . data . keySet ( ) ) {
2010-01-06 22:56:29 +08:00
if ( + + numReadGroups < = NUM_READ_GROUPS_TO_PROCESS | | NUM_READ_GROUPS_TO_PROCESS = = - 1 ) {
2009-12-03 00:47:35 +08:00
2009-12-31 02:10:10 +08:00
String readGroup = readGroupKey . toString ( ) ;
2009-12-03 00:47:35 +08:00
System . out . println ( "Analyzing read group: " + readGroup ) ;
// for each covariate
for ( int iii = 1 ; iii < requestedCovariates . size ( ) ; iii + + ) {
Covariate cov = requestedCovariates . get ( iii ) ;
2011-08-28 00:54:13 +08:00
final String outputFilename = OUTPUT_DIR + readGroup + "." + cov . getClass ( ) . getSimpleName ( ) + ".dat" ;
if ( DO_INDEL_QUALITY ) {
executor . callRScripts ( "plot_indelQuality.R" , outputFilename ,
cov . getClass ( ) . getSimpleName ( ) . split ( "Covariate" ) [ 0 ] ) ; // The third argument is the name of the covariate in order to make the plots look nice
} else {
2009-12-03 00:47:35 +08:00
if ( iii = = 1 ) {
2011-08-28 00:54:13 +08:00
// Analyze reported quality
executor . callRScripts ( "plot_residualError_QualityScoreCovariate.R" , outputFilename ,
IGNORE_QSCORES_LESS_THAN , MAX_QUALITY_SCORE , MAX_HISTOGRAM_VALUE ) ; // The third argument is the Q scores that should be turned pink in the plot because they were ignored
} else { // Analyze all other covariates
executor . callRScripts ( "plot_residualError_OtherCovariate.R" , outputFilename ,
cov . getClass ( ) . getSimpleName ( ) . split ( "Covariate" ) [ 0 ] ) ; // The third argument is the name of the covariate in order to make the plots look nice
2009-12-03 00:47:35 +08:00
}
}
}
2010-01-06 22:56:29 +08:00
} else { // at the maximum number of read groups so break out
2009-12-03 00:47:35 +08:00
break ;
}
}
}
2009-12-12 05:57:18 +08:00
2011-03-24 22:03:51 +08:00
public static void main ( String args [ ] ) {
try {
AnalyzeCovariates clp = new AnalyzeCovariates ( ) ;
start ( clp , args ) ;
System . exit ( CommandLineProgram . result ) ;
} catch ( Exception e ) {
exitSystemWithError ( e ) ;
}
2009-12-12 05:57:18 +08:00
}
2009-12-29 05:26:00 +08:00
}