From 67179e2412233cf48f70629b7c7c2858ee8499e4 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Wed, 2 Dec 2009 16:47:35 +0000 Subject: [PATCH] Initial checkin of AnalyzeCovariates.java which replaces analyzeRecalQuals_1KG.py and is updated to use the new Covariates system. It creates similar plots of residual error for each covariate that was used in the calculation. There is also an option to filter out base qualities below a given threshold. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2215 348d0f76-0448-11de-a6fe-93d51630548a --- R/plot_residualError_OtherCovariate.R | 55 ++++ R/plot_residualError_QualityScoreCovariate.R | 55 ++++ {R => archive/R}/plot_q_emp_stated_hst.R | 0 {R => archive/R}/plot_qual_diff_v_cycle.R | 0 {R => archive/R}/plot_qual_diff_v_dinuc.R | 0 .../python}/analyzeRecalQuals.py | 0 .../python}/analyzeRecalQuals_1KG.py | 0 build.xml | 9 +- .../AnalysisDataManager.java | 83 +++++ .../analyzecovariates/AnalyzeCovariates.java | 307 ++++++++++++++++++ .../recalibration/RecalDataManager.java | 1 + .../TableRecalibrationWalker.java | 8 +- 12 files changed, 510 insertions(+), 8 deletions(-) create mode 100644 R/plot_residualError_OtherCovariate.R create mode 100644 R/plot_residualError_QualityScoreCovariate.R rename {R => archive/R}/plot_q_emp_stated_hst.R (100%) rename {R => archive/R}/plot_qual_diff_v_cycle.R (100%) rename {R => archive/R}/plot_qual_diff_v_dinuc.R (100%) rename {python => archive/python}/analyzeRecalQuals.py (100%) rename {python => archive/python}/analyzeRecalQuals_1KG.py (100%) create mode 100755 java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java create mode 100755 java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java diff --git a/R/plot_residualError_OtherCovariate.R b/R/plot_residualError_OtherCovariate.R new file mode 100644 index 000000000..7d7cd2df9 --- /dev/null +++ b/R/plot_residualError_OtherCovariate.R @@ -0,0 +1,55 @@ +#!/broad/tools/apps/R-2.6.0/bin/Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] +covariateName = args[2] + +#X11(width=7, height=14) +#outfile = paste(input, ".qual_diff_v_cycle.png", sep="") +#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680) +outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +c <- read.table(input, header=T) +c <- c[sort.list(c[,1]),] + +d.good <- c[c$nBases >= 1000,] +d.1000 <- c[c$nBases < 1000,] +rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) ) +rmseAll = sqrt(sum((c$Qempirical-c$Qreported)^2 * c$nBases) / sum(c$nBases) ) +theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3)) +if( length(d.good$nBases) == length(c$nBases) ) { + theTitle = paste("RMSE = ", round(rmseAll,digits=3)) + } +if( is.numeric(c$Covariate) ) { + plot(d.good$Covariate, d.good$Qempirical-d.good$Qreported, type="p", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=16, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="p", col="cornflowerblue", pch=16) +} else { + plot(c$Covariate, c$Qempirical-c$Qreported, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10)) + points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="l", col="cornflowerblue") +} +dev.off() +#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16) + +# +# Plot histogram of the covariate +# +e = d.good +f = d.1000 +outfile = paste(input, ".", covariateName,"_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0) +if( is.numeric(c$Covariate) ) { + plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=4, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n") + points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=4, col="cornflowerblue") + axis(2,axTicks(2), format(axTicks(2), scientific=F)) +} else { + hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0) + plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=4, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n",xaxt="n") + axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)]) + axis(2,axTicks(2), format(axTicks(2), scientific=F)) +} +dev.off() \ No newline at end of file diff --git a/R/plot_residualError_QualityScoreCovariate.R b/R/plot_residualError_QualityScoreCovariate.R new file mode 100644 index 000000000..278b57efd --- /dev/null +++ b/R/plot_residualError_QualityScoreCovariate.R @@ -0,0 +1,55 @@ +#!/broad/tools/apps/R-2.6.0/bin/Rscript + +args <- commandArgs(TRUE) + +input = args[1] +Qcutoff = as.numeric(args[2]) + +t=read.table(input, header=T) +#t=read.csv(input) +#par(mfrow=c(2,1), cex=1.2) + +#outfile = paste(input, ".quality_emp_v_stated.png", sep="") +#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446) +outfile = paste(input, ".quality_emp_v_stated.pdf", sep="") +pdf(outfile, height=7, width=7) +d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,] +d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,] +d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,] +f <- t[t$Qreported < Qcutoff,] +e <- rbind(d.good, d.1000, d.10000) +rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) ) +rmseAll = sqrt(sum((e$Qempirical-e$Qreported)^2 * e$nBases) / sum(e$nBases) ) +theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3)) +if (length(t$nBases) == length(d.good$nBases) ) { + theTitle = paste("RMSE = ", round(rmseAll,digits=3)); + } +plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score") +points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16) +points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16) +points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16) +abline(0,1, lty=2) +dev.off() + +#outfile = paste(input, ".quality_emp_hist.png", sep="") +#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446) +outfile = paste(input, ".quality_emp_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0) +plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,40), main="Empirical quality score histogram", xlab="Empirical quality score", ylab="Count",yaxt="n") +points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() + +# +# Plot Q reported histogram +# +outfile = paste(input, ".quality_rep_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0) +plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,40), main="Reported quality score histogram", xlab="Reported quality score", ylab="Count",yaxt="n") +points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() diff --git a/R/plot_q_emp_stated_hst.R b/archive/R/plot_q_emp_stated_hst.R similarity index 100% rename from R/plot_q_emp_stated_hst.R rename to archive/R/plot_q_emp_stated_hst.R diff --git a/R/plot_qual_diff_v_cycle.R b/archive/R/plot_qual_diff_v_cycle.R similarity index 100% rename from R/plot_qual_diff_v_cycle.R rename to archive/R/plot_qual_diff_v_cycle.R diff --git a/R/plot_qual_diff_v_dinuc.R b/archive/R/plot_qual_diff_v_dinuc.R similarity index 100% rename from R/plot_qual_diff_v_dinuc.R rename to archive/R/plot_qual_diff_v_dinuc.R diff --git a/python/analyzeRecalQuals.py b/archive/python/analyzeRecalQuals.py similarity index 100% rename from python/analyzeRecalQuals.py rename to archive/python/analyzeRecalQuals.py diff --git a/python/analyzeRecalQuals_1KG.py b/archive/python/analyzeRecalQuals_1KG.py similarity index 100% rename from python/analyzeRecalQuals_1KG.py rename to archive/python/analyzeRecalQuals_1KG.py diff --git a/build.xml b/build.xml index ef2121a3f..9c9a9c10d 100644 --- a/build.xml +++ b/build.xml @@ -114,12 +114,13 @@ - + - + + - + @@ -150,7 +151,7 @@ - + diff --git a/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java b/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java new file mode 100755 index 000000000..3c5e12ebd --- /dev/null +++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java @@ -0,0 +1,83 @@ +package org.broadinstitute.sting.analyzecovariates; + +import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum; +import org.broadinstitute.sting.gatk.walkers.recalibration.NHashMap; + +import java.util.ArrayList; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Dec 1, 2009 + */ + +public class AnalysisDataManager { + + private NHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed + private ArrayList> dataCollapsedByCovariate; // Tables where everything except read group and given covariate has been collapsed + + AnalysisDataManager() { + } + + AnalysisDataManager( final int numCovariates ) { + dataCollapsedReadGroup = new NHashMap(); + dataCollapsedByCovariate = new ArrayList>(); + for( int iii = 0; iii < numCovariates - 1; iii++ ) { // readGroup isn't counted here, its table is separate + dataCollapsedByCovariate.add( new NHashMap() ); + } + } + + /** + * Add the given mapping to all of the collapsed hash tables + * @param key The list of comparables that is the key for this mapping + * @param fullDatum The RecalDatum which is the data for this mapping + */ + public final void addToAllTables( final List key, final RecalDatum fullDatum, final int IGNORE_QSCORES_LESS_THAN ) { + + int qscore = Integer.parseInt( key.get(1).toString() ); + ArrayList newKey; + RecalDatum collapsedDatum; + + if( !(qscore < IGNORE_QSCORES_LESS_THAN) ) { + // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed + newKey = new ArrayList(); + newKey.add( key.get(0) ); // Make a new key with just the read group + collapsedDatum = dataCollapsedReadGroup.get( newKey ); + if( collapsedDatum == null ) { + dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) ); + } else { + collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported + } + } + + // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed + for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { + if( iii == 0 || !(qscore < IGNORE_QSCORES_LESS_THAN) ) { // use all data for the plot versus reported quality, but not for the other plots versus cycle etc. + newKey = new ArrayList(); + newKey.add( key.get(0) ); // Make a new key with the read group ... + newKey.add( key.get(iii + 1) ); // and the given covariate + collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); + if( collapsedDatum == null ) { + dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) ); + } else { + collapsedDatum.combine( fullDatum ); + } + } + } + } + + /** + * Get the appropriate collapsed table out of the set of all the tables held by this Object + * @param covariate Which covariate indexes the desired collapsed HashMap + * @return The desired collapsed HashMap + */ + public final NHashMap getCollapsedTable( final int covariate ) { + if( covariate == 0) { + return dataCollapsedReadGroup; // Table where everything except read group has been collapsed + } else { + return dataCollapsedByCovariate.get( covariate - 1 ); // Table where everything except read group, quality score, and given covariate has been collapsed + } + } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java new file mode 100755 index 000000000..b6eeebc0c --- /dev/null +++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java @@ -0,0 +1,307 @@ +package org.broadinstitute.sting.analyzecovariates; + +import org.broadinstitute.sting.gatk.walkers.recalibration.*; +import org.broadinstitute.sting.utils.PackageUtils; +import org.broadinstitute.sting.utils.xReadLines; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; +import java.io.*; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Dec 1, 2009 + */ + +public class AnalyzeCovariates { + + ///////////////////////////// + // Command Line Arguments + ///////////////////////////// + private static String RECAL_FILE = "output.recal_data.csv"; + private static String OUTPUT_DIR = "analyzeCovariates/"; + private static String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript"; + private static String PATH_TO_RESOURCES = "R/"; + private static int IGNORE_QSCORES_LESS_THAN = 5; + private static int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + private static AnalysisDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps + private static ArrayList requestedCovariates; // List of covariates to be used in this calculation + private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); + private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); + + public static void main(String[] args) { + + // parse command line arguments + parseArguments( args ); + // create the output directory where all the data tables and plots will go + try { + Process p = Runtime.getRuntime().exec("mkdir " + OUTPUT_DIR); + } catch (IOException e) { + throw new RuntimeException("Couldn't create directory: " + OUTPUT_DIR); + } + if( !OUTPUT_DIR.endsWith("/") ) { OUTPUT_DIR = OUTPUT_DIR + "/"; } + if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; } + + // initialize all the data from the csv file and allocate the list of covariates + System.out.println("Reading in input csv file..."); + initializeData(); + System.out.println("...Done!"); + + // output data tables for Rscript to read in + System.out.println("Writing out intermediate tables for R..."); + writeDataTables(); + System.out.println("...Done!"); + + // perform the analysis using Rscript and output the plots + System.out.println("Calling analysis R scripts and writing out figures..."); + callRScripts(); + System.out.println("...Done!"); + + } + + private static void parseArguments( String[] args ) { + int iii = 0; + String arg; + + try { + while( iii < args.length && args[iii].startsWith("-") ) { + arg = args[iii++]; + + if( arg.equals( "-recalFile" ) ) { + RECAL_FILE = args[iii++]; + } else if( arg.equals( "-Rscript" ) ) { + PATH_TO_RSCRIPT = args[iii++]; + } else if( arg.equals( "-resources" ) ) { + PATH_TO_RESOURCES = args[iii++]; + } else if( arg.equals( "-ignoreQ" ) ) { + IGNORE_QSCORES_LESS_THAN = Integer.parseInt( args[iii++] ); + } else if (arg.equals( "-numRG" ) ) { + NUM_READ_GROUPS_TO_PROCESS = Integer.parseInt( args[iii++] ); + } else if( arg.equals( "-outputDir" ) ) { + OUTPUT_DIR = args[iii++]; + } else { + iii = -1; + break; + } + } + + if( iii != args.length ) { + throw new RuntimeException( "Exception" ); + } + } catch(Exception e) { + System.out.println( "Usage: [-option param] \n" ); + System.out.println(" Available options:"); + System.out.println("\t-recalFile \tPath to input recal csv file. Default value: output.recal_data.csv"); + System.out.println("\t-Rscript \t\tPath to your implementation of Rscript. Default value: /broad/tools/apps/R-2.6.0/bin/Rscript"); + System.out.println("\t-resources \tPath to resources folder holding the Sting R scripts. Default value: R/"); + System.out.println("\t-outputDir \tWhere to put the output plots. Default value: analyzeCovariates/"); + System.out.println("\t-ignoreQ \t\tIgnore bases with reported quality less than this number. Default value: 5"); + System.out.println("\t-numRG \t\tOnly process N read groups. Default value: -1 (process all read groups)"); + System.exit(-1); + } + } + + private static void initializeData() { + + // Get a list of all available covariates + List> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); + + int lineNumber = 0; + boolean foundAllCovariates = false; + int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one + + // Read in the covariates that were used from the input file + requestedCovariates = new ArrayList(); + + try { + for ( String line : new xReadLines(new File( RECAL_FILE )) ) { + lineNumber++; + if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) { + ; // Skip over the comment lines, (which start with '#') + } + else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data + if( foundAllCovariates ) { + throw new RuntimeException( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); + } else { // Found the covariate list in input file, loop through all of them and instantiate them + String[] vals = line.split(","); + for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical + boolean foundClass = false; + for( Class covClass : classes ) { + if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { + foundClass = true; + try { + Covariate covariate = (Covariate)covClass.newInstance(); + requestedCovariates.add( covariate ); + estimatedCapacity *= covariate.estimatedNumberOfBins(); + + } catch ( InstantiationException e ) { + throw new RuntimeException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); + } catch ( IllegalAccessException e ) { + throw new RuntimeException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); + } + } + } + + if( !foundClass ) { + throw new RuntimeException( "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); + } + } + + } + + } else { // Found a line of data + if( !foundAllCovariates ) { + + foundAllCovariates = true; + + // At this point all the covariates should have been found and initialized + if( requestedCovariates.size() < 2 ) { + throw new RuntimeException( "Malformed input recalibration file. Covariate names can't be found in file: " + RECAL_FILE ); + } + + // Don't want to crash with out of heap space exception + if( estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0 ) { // Could be negative if overflowed + estimatedCapacity = 300 * 40 * 200; + } + + // Initialize any covariate member variables using the shared argument collection + for( Covariate cov : requestedCovariates ) { + cov.initialize( new RecalibrationArgumentCollection() ); + } + + // Initialize the data hashMaps + dataManager = new AnalysisDataManager( requestedCovariates.size() ); + + } + addCSVData(line); // Parse the line and add the data to the HashMap + } + } + + } catch ( FileNotFoundException e ) { + throw new RuntimeException("Can not find input file: " + RECAL_FILE); + } catch ( NumberFormatException e ) { + throw new RuntimeException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); + } + } + + private static void addCSVData(String line) { + String[] vals = line.split(","); + + // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly + if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical + throw new RuntimeException("Malformed input recalibration file. Found data line with too many fields: " + line + + " --Perhaps the read group string contains a comma and isn't being parsed correctly."); + } + + ArrayList key = new ArrayList(); + Covariate cov; + int iii; + for( iii = 0; iii < requestedCovariates.size(); iii++ ) { + cov = requestedCovariates.get( iii ); + key.add( cov.getValue( vals[iii] ) ); + } + // Create a new datum using the number of observations, number of mismatches, and reported quality score + RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ) ); + // Add that datum to all the collapsed tables which will be used in the sequential calculation + dataManager.addToAllTables( key, datum, IGNORE_QSCORES_LESS_THAN ); + + } + + private static void writeDataTables() { + + int numReadGroups = 0; + + // for each read group + NHashMap readGroupTable = dataManager.getCollapsedTable(0); + for( List readGroupKey : readGroupTable.keySet() ) { + + if(NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS) { + String readGroup = readGroupKey.get(0).toString(); + RecalDatum readGroupDatum = readGroupTable.get(readGroupKey); + System.out.print("Writing out data tables for read group: " + readGroup + "\twith " + readGroupDatum.getNumObservations() + " observations" ); + System.out.println("\tand aggregate residual error = " + String.format("%.3f", readGroupDatum.empiricalQualDouble(0) - readGroupDatum.getEstimatedQReported())); + + // for each covariate + for( int iii = 1; iii < requestedCovariates.size(); iii++ ) { + Covariate cov = requestedCovariates.get(iii); + + // Create a PrintStream + PrintStream output = null; + try { + output = new PrintStream(new FileOutputStream(OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat")); + + } catch (FileNotFoundException e) { + System.err.println("Can't create file: " + OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat"); + System.exit(-1); + } + + // Output the header + output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases"); + + // Loop through the covariate table looking for keys with matching read groups + // BUGBUG: hopefully rewrite this to be more efficient + for( List covariateKey : dataManager.getCollapsedTable(iii).keySet() ) { + if( covariateKey.get(0).toString().equals(readGroup) ) { + output.print( covariateKey.get(1).toString() + "\t" ); // Covariate + RecalDatum thisDatum = dataManager.getCollapsedTable(iii).get(covariateKey); + output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported + output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0)) + "\t" ); // Qempirical + output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches + output.println( thisDatum.getNumObservations() ); // nBases + } + } + + // Close the PrintStream + output.close(); + } + } else { + break; + } + + } + } + + private static void callRScripts() { + + int numReadGroups = 0; + + // for each read group + for( List readGroupList : dataManager.getCollapsedTable(0).keySet() ) { + + if(NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS) { + + String readGroup = readGroupList.get(0).toString(); + System.out.println("Analyzing read group: " + readGroup); + + // for each covariate + for( int iii = 1; iii < requestedCovariates.size(); iii++ ) { + Covariate cov = requestedCovariates.get(iii); + try { + if( iii == 1 ) { + // Analyze reported quality + Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " + + OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " + + IGNORE_QSCORES_LESS_THAN); // The third argument is the Q scores that should be turned pink in the plot because they were ignored + } else { // Analyze all other covariates + Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " + + OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " + + cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument which is the name of the covariate to make the plots look nice + } + } catch (IOException e) { + e.printStackTrace(); + System.exit(-1); + } + } + } else { + break; + } + } + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 2b804e7c4..2e74067e4 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -44,6 +44,7 @@ import net.sf.samtools.SAMReadGroupRecord; */ public class RecalDataManager { + public NHashMap data; // The full dataset private NHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed private NHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 8702cb85f..d972d1ddf 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -170,7 +170,7 @@ public class TableRecalibrationWalker extends ReadWalker