2009-12-03 00:47:35 +08:00
package org.broadinstitute.sting.analyzecovariates ;
import org.broadinstitute.sting.gatk.walkers.recalibration.* ;
import org.broadinstitute.sting.utils.PackageUtils ;
import org.broadinstitute.sting.utils.xReadLines ;
import java.util.ArrayList ;
import java.util.List ;
import java.util.regex.Pattern ;
import java.io.* ;
/ * *
* Created by IntelliJ IDEA .
* User : rpoplin
* Date : Dec 1 , 2009
* /
public class AnalyzeCovariates {
/////////////////////////////
// Command Line Arguments
/////////////////////////////
private static String RECAL_FILE = "output.recal_data.csv" ;
private static String OUTPUT_DIR = "analyzeCovariates/" ;
private static String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript" ;
private static String PATH_TO_RESOURCES = "R/" ;
private static int IGNORE_QSCORES_LESS_THAN = 5 ;
private static int NUM_READ_GROUPS_TO_PROCESS = - 1 ; // -1 means process all read groups
/////////////////////////////
// Private Member Variables
/////////////////////////////
private static AnalysisDataManager dataManager ; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private static ArrayList < Covariate > requestedCovariates ; // List of covariates to be used in this calculation
private static final Pattern COMMENT_PATTERN = Pattern . compile ( "^#.*" ) ;
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern . compile ( "^rg,.*" ) ;
private static final Pattern COVARIATE_PATTERN = Pattern . compile ( "^ReadGroup,QualityScore,.*" ) ;
public static void main ( String [ ] args ) {
// parse command line arguments
parseArguments ( args ) ;
// create the output directory where all the data tables and plots will go
try {
Process p = Runtime . getRuntime ( ) . exec ( "mkdir " + OUTPUT_DIR ) ;
} catch ( IOException e ) {
throw new RuntimeException ( "Couldn't create directory: " + OUTPUT_DIR ) ;
}
if ( ! OUTPUT_DIR . endsWith ( "/" ) ) { OUTPUT_DIR = OUTPUT_DIR + "/" ; }
if ( ! PATH_TO_RESOURCES . endsWith ( "/" ) ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/" ; }
// initialize all the data from the csv file and allocate the list of covariates
System . out . println ( "Reading in input csv file..." ) ;
initializeData ( ) ;
System . out . println ( "...Done!" ) ;
// output data tables for Rscript to read in
System . out . println ( "Writing out intermediate tables for R..." ) ;
writeDataTables ( ) ;
System . out . println ( "...Done!" ) ;
// perform the analysis using Rscript and output the plots
System . out . println ( "Calling analysis R scripts and writing out figures..." ) ;
callRScripts ( ) ;
System . out . println ( "...Done!" ) ;
}
private static void parseArguments ( String [ ] args ) {
int iii = 0 ;
String arg ;
try {
while ( iii < args . length & & args [ iii ] . startsWith ( "-" ) ) {
arg = args [ iii + + ] ;
if ( arg . equals ( "-recalFile" ) ) {
RECAL_FILE = args [ iii + + ] ;
} else if ( arg . equals ( "-Rscript" ) ) {
PATH_TO_RSCRIPT = args [ iii + + ] ;
} else if ( arg . equals ( "-resources" ) ) {
PATH_TO_RESOURCES = args [ iii + + ] ;
} else if ( arg . equals ( "-ignoreQ" ) ) {
IGNORE_QSCORES_LESS_THAN = Integer . parseInt ( args [ iii + + ] ) ;
} else if ( arg . equals ( "-numRG" ) ) {
NUM_READ_GROUPS_TO_PROCESS = Integer . parseInt ( args [ iii + + ] ) ;
} else if ( arg . equals ( "-outputDir" ) ) {
OUTPUT_DIR = args [ iii + + ] ;
} else {
iii = - 1 ;
break ;
}
}
if ( iii ! = args . length ) {
throw new RuntimeException ( "Exception" ) ;
}
} catch ( Exception e ) {
System . out . println ( "Usage: [-option param] \n" ) ;
System . out . println ( " Available options:" ) ;
System . out . println ( "\t-recalFile <path>\tPath to input recal csv file. Default value: output.recal_data.csv" ) ;
System . out . println ( "\t-Rscript <path>\t\tPath to your implementation of Rscript. Default value: /broad/tools/apps/R-2.6.0/bin/Rscript" ) ;
System . out . println ( "\t-resources <path>\tPath to resources folder holding the Sting R scripts. Default value: R/" ) ;
System . out . println ( "\t-outputDir <path>\tWhere to put the output plots. Default value: analyzeCovariates/" ) ;
System . out . println ( "\t-ignoreQ <int>\t\tIgnore bases with reported quality less than this number. Default value: 5" ) ;
System . out . println ( "\t-numRG <int>\t\tOnly process N read groups. Default value: -1 (process all read groups)" ) ;
System . exit ( - 1 ) ;
}
}
private static void initializeData ( ) {
// Get a list of all available covariates
List < Class < ? extends Covariate > > classes = PackageUtils . getClassesImplementingInterface ( Covariate . class ) ;
int lineNumber = 0 ;
boolean foundAllCovariates = false ;
int estimatedCapacity = 1 ; // Capacity is multiplicitive so this starts at one
// Read in the covariates that were used from the input file
requestedCovariates = new ArrayList < Covariate > ( ) ;
try {
for ( String line : new xReadLines ( new File ( RECAL_FILE ) ) ) {
lineNumber + + ;
if ( COMMENT_PATTERN . matcher ( line ) . matches ( ) | | OLD_RECALIBRATOR_HEADER . matcher ( line ) . matches ( ) ) {
; // Skip over the comment lines, (which start with '#')
}
else if ( COVARIATE_PATTERN . matcher ( line ) . matches ( ) ) { // The line string is either specifying a covariate or is giving csv data
if ( foundAllCovariates ) {
throw new RuntimeException ( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ) ;
} else { // Found the covariate list in input file, loop through all of them and instantiate them
String [ ] vals = line . split ( "," ) ;
for ( int iii = 0 ; iii < vals . length - 3 ; iii + + ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
boolean foundClass = false ;
for ( Class < ? > covClass : classes ) {
if ( ( vals [ iii ] + "Covariate" ) . equalsIgnoreCase ( covClass . getSimpleName ( ) ) ) {
foundClass = true ;
try {
Covariate covariate = ( Covariate ) covClass . newInstance ( ) ;
requestedCovariates . add ( covariate ) ;
estimatedCapacity * = covariate . estimatedNumberOfBins ( ) ;
} catch ( InstantiationException e ) {
throw new RuntimeException ( String . format ( "Can not instantiate covariate class '%s': must be concrete class." , covClass . getSimpleName ( ) ) ) ;
} catch ( IllegalAccessException e ) {
throw new RuntimeException ( String . format ( "Can not instantiate covariate class '%s': must have no-arg constructor." , covClass . getSimpleName ( ) ) ) ;
}
}
}
if ( ! foundClass ) {
throw new RuntimeException ( "Malformed input recalibration file. The requested covariate type (" + ( vals [ iii ] + "Covariate" ) + ") isn't a valid covariate option." ) ;
}
}
}
} else { // Found a line of data
if ( ! foundAllCovariates ) {
foundAllCovariates = true ;
// At this point all the covariates should have been found and initialized
if ( requestedCovariates . size ( ) < 2 ) {
throw new RuntimeException ( "Malformed input recalibration file. Covariate names can't be found in file: " + RECAL_FILE ) ;
}
// Don't want to crash with out of heap space exception
if ( estimatedCapacity > 300 * 40 * 200 | | estimatedCapacity < 0 ) { // Could be negative if overflowed
estimatedCapacity = 300 * 40 * 200 ;
}
// Initialize any covariate member variables using the shared argument collection
for ( Covariate cov : requestedCovariates ) {
cov . initialize ( new RecalibrationArgumentCollection ( ) ) ;
}
// Initialize the data hashMaps
dataManager = new AnalysisDataManager ( requestedCovariates . size ( ) ) ;
}
addCSVData ( line ) ; // Parse the line and add the data to the HashMap
}
}
} catch ( FileNotFoundException e ) {
throw new RuntimeException ( "Can not find input file: " + RECAL_FILE ) ;
} catch ( NumberFormatException e ) {
throw new RuntimeException ( "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker." ) ;
}
}
private static void addCSVData ( String line ) {
String [ ] vals = line . split ( "," ) ;
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
if ( vals . length ! = requestedCovariates . size ( ) + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
throw new RuntimeException ( "Malformed input recalibration file. Found data line with too many fields: " + line +
" --Perhaps the read group string contains a comma and isn't being parsed correctly." ) ;
}
ArrayList < Comparable > key = new ArrayList < Comparable > ( ) ;
Covariate cov ;
int iii ;
for ( iii = 0 ; iii < requestedCovariates . size ( ) ; iii + + ) {
cov = requestedCovariates . get ( iii ) ;
key . add ( cov . getValue ( vals [ iii ] ) ) ;
}
// Create a new datum using the number of observations, number of mismatches, and reported quality score
RecalDatum datum = new RecalDatum ( Long . parseLong ( vals [ iii ] ) , Long . parseLong ( vals [ iii + 1 ] ) , Double . parseDouble ( vals [ 1 ] ) ) ;
// Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager . addToAllTables ( key , datum , IGNORE_QSCORES_LESS_THAN ) ;
}
private static void writeDataTables ( ) {
int numReadGroups = 0 ;
// for each read group
NHashMap < RecalDatum > readGroupTable = dataManager . getCollapsedTable ( 0 ) ;
for ( List < ? extends Comparable > readGroupKey : readGroupTable . keySet ( ) ) {
if ( NUM_READ_GROUPS_TO_PROCESS = = - 1 | | + + numReadGroups < = NUM_READ_GROUPS_TO_PROCESS ) {
String readGroup = readGroupKey . get ( 0 ) . toString ( ) ;
RecalDatum readGroupDatum = readGroupTable . get ( readGroupKey ) ;
System . out . print ( "Writing out data tables for read group: " + readGroup + "\twith " + readGroupDatum . getNumObservations ( ) + " observations" ) ;
System . out . println ( "\tand aggregate residual error = " + String . format ( "%.3f" , readGroupDatum . empiricalQualDouble ( 0 ) - readGroupDatum . getEstimatedQReported ( ) ) ) ;
// for each covariate
for ( int iii = 1 ; iii < requestedCovariates . size ( ) ; iii + + ) {
Covariate cov = requestedCovariates . get ( iii ) ;
// Create a PrintStream
PrintStream output = null ;
try {
output = new PrintStream ( new FileOutputStream ( OUTPUT_DIR + readGroup + "." + cov . getClass ( ) . getSimpleName ( ) + ".dat" ) ) ;
} catch ( FileNotFoundException e ) {
System . err . println ( "Can't create file: " + OUTPUT_DIR + readGroup + "." + cov . getClass ( ) . getSimpleName ( ) + ".dat" ) ;
System . exit ( - 1 ) ;
}
// Output the header
output . println ( "Covariate\tQreported\tQempirical\tnMismatches\tnBases" ) ;
// Loop through the covariate table looking for keys with matching read groups
// BUGBUG: hopefully rewrite this to be more efficient
for ( List < ? extends Comparable > covariateKey : dataManager . getCollapsedTable ( iii ) . keySet ( ) ) {
if ( covariateKey . get ( 0 ) . toString ( ) . equals ( readGroup ) ) {
output . print ( covariateKey . get ( 1 ) . toString ( ) + "\t" ) ; // Covariate
RecalDatum thisDatum = dataManager . getCollapsedTable ( iii ) . get ( covariateKey ) ;
output . print ( String . format ( "%.3f" , thisDatum . getEstimatedQReported ( ) ) + "\t" ) ; // Qreported
output . print ( String . format ( "%.3f" , thisDatum . empiricalQualDouble ( 0 ) ) + "\t" ) ; // Qempirical
output . print ( thisDatum . getNumMismatches ( ) + "\t" ) ; // nMismatches
output . println ( thisDatum . getNumObservations ( ) ) ; // nBases
}
}
// Close the PrintStream
output . close ( ) ;
}
} else {
break ;
}
}
}
private static void callRScripts ( ) {
int numReadGroups = 0 ;
// for each read group
for ( List < ? extends Comparable > readGroupList : dataManager . getCollapsedTable ( 0 ) . keySet ( ) ) {
if ( NUM_READ_GROUPS_TO_PROCESS = = - 1 | | + + numReadGroups < = NUM_READ_GROUPS_TO_PROCESS ) {
String readGroup = readGroupList . get ( 0 ) . toString ( ) ;
System . out . println ( "Analyzing read group: " + readGroup ) ;
// for each covariate
for ( int iii = 1 ; iii < requestedCovariates . size ( ) ; iii + + ) {
Covariate cov = requestedCovariates . get ( iii ) ;
try {
if ( iii = = 1 ) {
// Analyze reported quality
Process p = Runtime . getRuntime ( ) . exec ( PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov . getClass ( ) . getSimpleName ( ) + ".dat" + " " +
IGNORE_QSCORES_LESS_THAN ) ; // The third argument is the Q scores that should be turned pink in the plot because they were ignored
} else { // Analyze all other covariates
Process p = Runtime . getRuntime ( ) . exec ( PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov . getClass ( ) . getSimpleName ( ) + ".dat" + " " +
2009-12-03 07:15:52 +08:00
cov . getClass ( ) . getSimpleName ( ) . split ( "Covariate" ) [ 0 ] ) ; // The third argument is the name of the covariate in order to make the plots look nice
2009-12-03 00:47:35 +08:00
}
} catch ( IOException e ) {
e . printStackTrace ( ) ;
System . exit ( - 1 ) ;
}
}
} else {
break ;
}
}
}
}