Move AnalyzeCovariates over too.
This commit is contained in:
parent
0a89adbcdb
commit
a003148d50
|
|
@ -1,113 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.analyzecovariates;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Dec 1, 2009
|
||||
*
|
||||
* The difference between this AnalysisDataManager and the RecalDataManager used by the Recalibration walkers is that here the collapsed data tables are indexed
|
||||
* by only read group and the given covariate, while in the recalibrator the collapsed tables are indexed by read group, reported quality, and the given covariate.
|
||||
*/
|
||||
|
||||
public class AnalysisDataManager {
|
||||
|
||||
private NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private ArrayList<NestedHashMap> dataCollapsedByCovariate; // Tables where everything except read group and given covariate has been collapsed
|
||||
|
||||
AnalysisDataManager() {
|
||||
}
|
||||
|
||||
AnalysisDataManager( final int numCovariates ) {
|
||||
dataCollapsedReadGroup = new NestedHashMap();
|
||||
dataCollapsedByCovariate = new ArrayList<NestedHashMap>();
|
||||
for( int iii = 0; iii < numCovariates - 1; iii++ ) { // readGroup isn't counted here, its table is separate
|
||||
dataCollapsedByCovariate.add( new NestedHashMap() );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the given mapping to all of the collapsed hash tables
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
* @param IGNORE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
|
||||
*/
|
||||
public final void addToAllTables( final Object[] key, final RecalDatum fullDatum, final int IGNORE_QSCORES_LESS_THAN ) {
|
||||
|
||||
final int qscore = Integer.parseInt( key[1].toString() );
|
||||
RecalDatum collapsedDatum;
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] covariateCollapsedKey = new Object[2];
|
||||
|
||||
if( !(qscore < IGNORE_QSCORES_LESS_THAN) ) {
|
||||
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
|
||||
collapsedDatum = (RecalDatum)dataCollapsedReadGroup.get( readGroupCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( new RecalDatum(fullDatum), readGroupCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
}
|
||||
}
|
||||
|
||||
// Create dataCollapsedByCovariate's, the tables where everything except read group and given covariate has been collapsed
|
||||
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) {
|
||||
if( iii == 0 || !(qscore < IGNORE_QSCORES_LESS_THAN) ) { // use all data for the plot versus reported quality, but not for the other plots versus cycle and etc.
|
||||
covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
Object theCovariateElement = key[iii + 1]; // and the given covariate
|
||||
if( theCovariateElement != null ) {
|
||||
covariateCollapsedKey[1] = theCovariateElement;
|
||||
collapsedDatum = (RecalDatum)dataCollapsedByCovariate.get(iii).get( covariateCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedByCovariate.get(iii).put( new RecalDatum(fullDatum), covariateCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.combine( fullDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the appropriate collapsed table out of the set of all the tables held by this Object
|
||||
* @param covariate Which covariate indexes the desired collapsed HashMap
|
||||
* @return The desired collapsed HashMap
|
||||
*/
|
||||
public final NestedHashMap getCollapsedTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
} else {
|
||||
return dataCollapsedByCovariate.get( covariate - 1 ); // Table where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,383 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.analyzecovariates;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.R.RScriptExecutor;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.io.Resource;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Call R scripts to plot residual error versus the various covariates.
|
||||
*
|
||||
* <p>
|
||||
* After counting covariates in either the initial BAM File or again in the recalibrated BAM File, an analysis tool is available which
|
||||
* reads the .csv file and outputs several PDF (and .dat) files for each read group in the given BAM. These PDF files graphically
|
||||
* show the various metrics and characteristics of the reported quality scores (often in relation to the empirical qualities).
|
||||
* In order to show that any biases in the reported quality scores have been generally fixed through recalibration one should run
|
||||
* CountCovariates again on a bam file produced by TableRecalibration. In this way users can compare the analysis plots generated
|
||||
* by pre-recalibration and post-recalibration .csv files. Our usual chain of commands that we use to generate plots of residual
|
||||
* error is: CountCovariates, TableRecalibrate, samtools index on the recalibrated bam file, CountCovariates again on the recalibrated
|
||||
* bam file, and then AnalyzeCovariates on both the before and after recal_data.csv files to see the improvement in recalibration.
|
||||
*
|
||||
* <p>
|
||||
* The color coding along with the RMSE is included in the plots to give some indication of the number of observations that went into
|
||||
* each of the quality score estimates. It is defined as follows for N, the number of observations:
|
||||
*
|
||||
* <ul>
|
||||
* <li>light blue means N < 1,000</li>
|
||||
* <li>cornflower blue means 1,000 <= N < 10,000</li>
|
||||
* <li>dark blue means N >= 10,000</li>
|
||||
* <li>The pink dots indicate points whose quality scores are special codes used by the aligner and which are mathematically
|
||||
* meaningless and so aren't included in any of the numerical calculations.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* NOTE: Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version).
|
||||
* See <a target="r-project" href="http://www.r-project.org">http://www.r-project.org</a> for more info on how to download and install R.
|
||||
*
|
||||
* <p>
|
||||
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
|
||||
* <a target="gatkwiki" href="http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration"
|
||||
* >http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration</a>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The recalibration table file in CSV format that was generated by the CountCovariates walker.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar AnalyzeCovariates.jar \
|
||||
* -recalFile /path/to/recal.table.csv \
|
||||
* -outputDir /path/to/output_dir/ \
|
||||
* -ignoreQ 5
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
||||
@DocumentedGATKFeature(
|
||||
groupName = "AnalyzeCovariates",
|
||||
summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
|
||||
public class AnalyzeCovariates extends CommandLineProgram {
|
||||
final private static Logger logger = Logger.getLogger(AnalyzeCovariates.class);
|
||||
|
||||
private static final String PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE = "plot_residualError_QualityScoreCovariate.R";
|
||||
private static final String PLOT_RESDIUAL_ERROR_OTHER_COVARIATE = "plot_residualError_OtherCovariate.R";
|
||||
private static final String PLOT_INDEL_QUALITY_RSCRIPT = "plot_indelQuality.R";
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
/**
|
||||
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
|
||||
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
|
||||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*/
|
||||
@Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false)
|
||||
private String RECAL_FILE = "output.recal_data.csv";
|
||||
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
|
||||
private File OUTPUT_DIR = new File("analyzeCovariates");
|
||||
@Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false)
|
||||
private int IGNORE_QSCORES_LESS_THAN = 5;
|
||||
@Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
|
||||
private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
|
||||
|
||||
/**
|
||||
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
|
||||
* by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
|
||||
*/
|
||||
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 50")
|
||||
private int MAX_QUALITY_SCORE = 50;
|
||||
|
||||
/**
|
||||
* This argument is useful for comparing before/after plots and you want the axes to match each other.
|
||||
*/
|
||||
@Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
|
||||
private int MAX_HISTOGRAM_VALUE = 0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting")
|
||||
private boolean DO_INDEL_QUALITY = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private AnalysisDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private ArrayList<Covariate> requestedCovariates; // List of covariates to be used in this calculation
|
||||
private final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
protected static final String EOF_MARKER = "EOF";
|
||||
|
||||
protected int execute() {
|
||||
|
||||
// create the output directory where all the data tables and plots will go
|
||||
if (!OUTPUT_DIR.exists() && !OUTPUT_DIR.mkdirs())
|
||||
throw new UserException.BadArgumentValue("--output_dir/-outDir", "Unable to create output directory: " + OUTPUT_DIR);
|
||||
|
||||
if (!RScriptExecutor.RSCRIPT_EXISTS)
|
||||
Utils.warnUser(logger, "Rscript not found in environment path. Plots will not be generated.");
|
||||
|
||||
// initialize all the data from the csv file and allocate the list of covariates
|
||||
logger.info("Reading in input csv file...");
|
||||
initializeData();
|
||||
logger.info("...Done!");
|
||||
|
||||
// output data tables for Rscript to read in
|
||||
logger.info("Writing out intermediate tables for R...");
|
||||
writeDataTables();
|
||||
logger.info("...Done!");
|
||||
|
||||
// perform the analysis using Rscript and output the plots
|
||||
logger.info("Calling analysis R scripts and writing out figures...");
|
||||
callRScripts();
|
||||
logger.info("...Done!");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private void initializeData() {
|
||||
|
||||
// Get a list of all available covariates
|
||||
Collection<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
|
||||
int lineNumber = 0;
|
||||
boolean foundAllCovariates = false;
|
||||
|
||||
// Read in the covariates that were used from the input file
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
|
||||
try {
|
||||
for ( final String line : new XReadLines(new File( RECAL_FILE )) ) {
|
||||
lineNumber++;
|
||||
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() || line.equals(EOF_MARKER) ) {
|
||||
; // Skip over the comment lines, (which start with '#')
|
||||
}
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new RuntimeException( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
|
||||
} else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||
String[] vals = line.split(",");
|
||||
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new RuntimeException( "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else { // Found a line of data
|
||||
if( !foundAllCovariates ) {
|
||||
|
||||
foundAllCovariates = true;
|
||||
|
||||
// At this point all the covariates should have been found and initialized
|
||||
if( requestedCovariates.size() < 2 ) {
|
||||
throw new RuntimeException( "Malformed input recalibration file. Covariate names can't be found in file: " + RECAL_FILE );
|
||||
}
|
||||
|
||||
// Initialize any covariate member variables using the shared argument collection
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
cov.initialize( new RecalibrationArgumentCollection() );
|
||||
}
|
||||
|
||||
// Initialize the data hashMaps
|
||||
dataManager = new AnalysisDataManager( requestedCovariates.size() );
|
||||
|
||||
}
|
||||
addCSVData(line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException("Can not find input file: " + RECAL_FILE);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new RuntimeException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
}
|
||||
|
||||
private void addCSVData(String line) {
|
||||
String[] vals = line.split(",");
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
throw new RuntimeException("Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||
}
|
||||
|
||||
Object[] key = new Object[requestedCovariates.size()];
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
key[iii] = cov.getValue( vals[iii] );
|
||||
}
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, IGNORE_QSCORES_LESS_THAN );
|
||||
}
|
||||
|
||||
private void writeDataTables() {
|
||||
|
||||
int numReadGroups = 0;
|
||||
|
||||
// for each read group
|
||||
for( final Object readGroupKey : dataManager.getCollapsedTable(0).data.keySet() ) {
|
||||
|
||||
if( NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS ) {
|
||||
final String readGroup = readGroupKey.toString();
|
||||
final RecalDatum readGroupDatum = (RecalDatum) dataManager.getCollapsedTable(0).data.get(readGroupKey);
|
||||
logger.info(String.format(
|
||||
"Writing out data tables for read group: %s\twith %s observations\tand aggregate residual error = %.3f",
|
||||
readGroup, readGroupDatum.getNumObservations(),
|
||||
readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE) - readGroupDatum.getEstimatedQReported()));
|
||||
|
||||
// for each covariate
|
||||
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
|
||||
Covariate cov = requestedCovariates.get(iii);
|
||||
|
||||
// Create a PrintStream
|
||||
File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
|
||||
PrintStream output;
|
||||
try {
|
||||
output = new PrintStream(FileUtils.openOutputStream(outputFile));
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
|
||||
}
|
||||
|
||||
try {
|
||||
// Output the header
|
||||
output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases");
|
||||
|
||||
for( final Object covariateKey : ((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).keySet() ) {
|
||||
output.print( covariateKey.toString() + "\t" ); // Covariate
|
||||
final RecalDatum thisDatum = (RecalDatum)((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).get(covariateKey);
|
||||
output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported
|
||||
output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)) + "\t" ); // Qempirical
|
||||
output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches
|
||||
output.println( thisDatum.getNumObservations() ); // nBases
|
||||
}
|
||||
} finally {
|
||||
// Close the PrintStream
|
||||
IOUtils.closeQuietly(output);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private void callRScripts() {
|
||||
int numReadGroups = 0;
|
||||
|
||||
// for each read group
|
||||
for( Object readGroupKey : dataManager.getCollapsedTable(0).data.keySet() ) {
|
||||
if(++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS || NUM_READ_GROUPS_TO_PROCESS == -1) {
|
||||
|
||||
String readGroup = readGroupKey.toString();
|
||||
logger.info("Analyzing read group: " + readGroup);
|
||||
|
||||
// for each covariate
|
||||
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
|
||||
final Covariate cov = requestedCovariates.get(iii);
|
||||
final File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
|
||||
if (DO_INDEL_QUALITY) {
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.addScript(new Resource(PLOT_INDEL_QUALITY_RSCRIPT, AnalyzeCovariates.class));
|
||||
// The second argument is the name of the covariate in order to make the plots look nice
|
||||
executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
|
||||
executor.exec();
|
||||
} else {
|
||||
if( iii == 1 ) {
|
||||
// Analyze reported quality
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE, AnalyzeCovariates.class));
|
||||
// The second argument is the Q scores that should be turned pink in the plot because they were ignored
|
||||
executor.addArgs(outputFile, IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE);
|
||||
executor.exec();
|
||||
} else { // Analyze all other covariates
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_OTHER_COVARIATE, AnalyzeCovariates.class));
|
||||
// The second argument is the name of the covariate in order to make the plots look nice
|
||||
executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
|
||||
executor.exec();
|
||||
}
|
||||
}
|
||||
}
|
||||
} else { // at the maximum number of read groups so break out
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String args[]) {
|
||||
try {
|
||||
AnalyzeCovariates clp = new AnalyzeCovariates();
|
||||
start(clp, args);
|
||||
System.exit(CommandLineProgram.result);
|
||||
} catch (Exception e) {
|
||||
exitSystemWithError(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
/**
|
||||
* Package to plot residual accuracy versus error covariates for the base quality score recalibrator.
|
||||
*/
|
||||
package org.broadinstitute.sting.analyzecovariates;
|
||||
Loading…
Reference in New Issue