Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2011-08-18 11:30:59 -04:00
commit aa21fc7c9c
7 changed files with 241 additions and 47 deletions

View File

@ -28,11 +28,13 @@ package org.broadinstitute.sting.analyzecovariates;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.*;
@ -42,19 +44,71 @@ import java.util.Map;
import java.util.regex.Pattern;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Dec 1, 2009
* Call R scripts to plot residual error versus the various covariates.
*
* Create collapsed versions of the recal csv file and call R scripts to plot residual error versus the various covariates.
* <p>
* After counting covariates in either the initial BAM File or again in the recalibrated BAM File, an analysis tool is available which
* reads the .csv file and outputs several PDF (and .dat) files for each read group in the given BAM. These PDF files graphically
* show the various metrics and characteristics of the reported quality scores (often in relation to the empirical qualities).
* In order to show that any biases in the reported quality scores have been generally fixed through recalibration one should run
* CountCovariates again on a bam file produced by TableRecalibration. In this way users can compare the analysis plots generated
* by pre-recalibration and post-recalibration .csv files. Our usual chain of commands that we use to generate plots of residual
* error is: CountCovariates, TableRecalibrate, samtools index on the recalibrated bam file, CountCovariates again on the recalibrated
* bam file, and then AnalyzeCovariates on both the before and after recal_data.csv files to see the improvement in recalibration.
*
* <p>
* The color coding along with the RMSE is included in the plots to give some indication of the number of observations that went into
* each of the quality score estimates. It is defined as follows for N, the number of observations:
*
* <ul>
* <li>light blue means N < 1,000</li>
* <li>cornflower blue means 1,000 <= N < 10,000</li>
* <li>dark blue means N >= 10,000</li>
* <li>The pink dots indicate points whose quality scores are special codes used by the aligner and which are mathematically
* meaningless and so aren't included in any of the numerical calculations.</li>
* </ul>
*
* <p>
* NOTE: For those running this tool externally from the Broad, it is crucial to note that both the -Rscript and -resources options
* must be changed from the default. -Rscript needs to point to your installation of Rscript (this is the scripting version of R,
* not the interactive version) while -resources needs to point to the folder holding the R scripts that are used. For those using
* this tool as part of the Binary Distribution the -resources should point to the resources folder that is part of the tarball.
* For those using this tool by building from the git repository the -resources should point to the R/ subdirectory of the Sting checkout.
*
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
*
* <h2>Input</h2>
* <p>
* The recalibration table file in CSV format that was generated by the CountCovariates walker.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar AnalyzeCovariates.jar \
* -recalFile /path/to/recal.table.csv \
* -outputDir /path/to/output_dir/ \
* -resources resources/ \
* -ignoreQ 5
* </pre>
*
*/
@DocumentedGATKFeature(
groupName = "AnalyzeCovariates",
summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
public class AnalyzeCovariates extends CommandLineProgram {
/////////////////////////////
// Command Line Arguments
/////////////////////////////
/**
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/
@Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false)
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
@ -67,11 +121,20 @@ public class AnalyzeCovariates extends CommandLineProgram {
private int IGNORE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
/**
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
* by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
*/
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 50")
private int MAX_QUALITY_SCORE = 50;
/**
* This argument is useful for comparing before/after plots and you want the axes to match each other.
*/
@Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
private int MAX_HISTOGRAM_VALUE = 0;
@Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, this value will be the max value of the histogram plots")
@Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting")
private boolean DO_INDEL_QUALITY = false;

View File

@ -0,0 +1,4 @@
/**
* Package to plot residual accuracy versus error covariates for the base quality score recalibrator.
*/
package org.broadinstitute.sting.analyzecovariates;

View File

@ -1,3 +1,28 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.commandline.Gatherer;
@ -12,11 +37,8 @@ import java.util.List;
import java.util.regex.Pattern;
/**
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 3/29/11
* Time: 3:54 PM
* To change this template use File | Settings | File Templates.
*/

View File

@ -50,22 +50,47 @@ import java.util.List;
import java.util.Map;
/**
* First pass of the recalibration. Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
*
* This walker is designed to work as the first pass in a two-pass processing step.
* It does a by-locus traversal operating only at sites that are not in dbSNP.
* We assume that all reference mismatches we see are therefore errors and indicative of poor base quality.
* This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinucleotide)
* Since there is a large amount of data one can then calculate an empirical probability of error
* given the particular covariates seen at this site, where p(error) = num mismatches / num observations
* The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score)
* The first non-comment line of the output file gives the name of the covariates that were used for this calculation.
* <p>
* This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
* only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
* of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
* reported quality score, cycle, and dinucleotide). Since there is a large amount of data one can then calculate an empirical
* probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
* The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score).
* <p>
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified.
*
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified
* Note: This walker is designed to be used in conjunction with TableRecalibrationWalker.
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
*
* <h2>Input</h2>
* <p>
* A database of known polymorphic sites to skip over.
* </p>
*
* <h2>Output</h2>
* <p>
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -R resources/Homo_sapiens_assembly18.fasta \
* -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
* -knownSites another/optional/setOfSitesToMask.vcf \
* -I my_reads.bam \
* -T CountCovariates \
* -cov ReadGroupCovariate \
* -cov QualityScoreCovariate \
* -cov CycleCovariate \
* -cov DinucCovariate \
* -recalFile my_reads.recal_data.csv
* </pre>
*
* @author rpoplin
* @since Nov 3, 2009
*/
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
@ -96,8 +121,13 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
*/
@Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false)
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
@Output
PrintStream out;
/**
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file")
@Gather(CountCovariatesGatherer.class)
public PrintStream RECAL_FILE;
@ -114,6 +144,10 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/////////////////////////////
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
private boolean DONT_SORT_OUTPUT = false;
/**
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
*/
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
private boolean RUN_WITHOUT_DBSNP = false;
@ -178,11 +212,11 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
// Print and exit if that's what was requested
if ( LIST_ONLY ) {
out.println( "Available covariates:" );
logger.info( "Available covariates:" );
for( Class<?> covClass : covariateClasses ) {
out.println( covClass.getSimpleName() );
logger.info( covClass.getSimpleName() );
}
out.println();
logger.info("");
System.exit( 0 ); // Early exit here because user requested it
}

View File

@ -66,15 +66,22 @@ public class RecalDataManager {
private static boolean warnUserNullPlatform = false;
public enum SOLID_RECAL_MODE {
/** Treat reference inserted bases as reference matching bases. Very unsafe! */
DO_NOTHING,
/** Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. */
SET_Q_ZERO,
/** In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. */
SET_Q_ZERO_BASE_N,
/** Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. */
REMOVE_REF_BIAS
}
public enum SOLID_NOCALL_STRATEGY {
/** When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. */
THROW_EXCEPTION,
/** Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. */
LEAVE_READ_UNRECALIBRATED,
/** Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. */
PURGE_READ
}

View File

@ -51,12 +51,27 @@ public class RecalibrationArgumentCollection {
public String FORCE_PLATFORM = null;
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
public int WINDOW_SIZE = 5;
/**
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
*/
@Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false)
public int HOMOPOLYMER_NBACK = 7;
@Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false)
public boolean EXCEPTION_IF_NO_TILE = false;
/**
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies.
*/
@Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO;
/**
* CountCovariates and TableRecalibration accept a --solid_nocall_strategy <MODE> flag which governs how the recalibrator handles
* no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
* their color space tag can not be recalibrated.
*/
@Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false)
public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
}

View File

@ -52,19 +52,38 @@ import java.util.ResourceBundle;
import java.util.regex.Pattern;
/**
* Second pass of the recalibration. Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
* Second pass of the recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
*
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal.
* <p>
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each
* base in each read this walker calculates various user-specified covariates (such as read group, reported quality score,
* cycle, and dinuc). Using these values as a key in a large hashmap the walker calculates an empirical base quality score
* and overwrites the quality score currently in the read. This walker then outputs a new bam file with these updated (recalibrated) reads.
*
* For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
* Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read.
* This walker then outputs a new bam file with these updated (recalibrated) reads.
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
*
* Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker.
* Note: This walker is designed to be used in conjunction with CovariateCounterWalker.
* <h2>Input</h2>
* <p>
* The recalibration table file in CSV format that was generated by the CountCovariates walker.
* </p>
*
* <h2>Output</h2>
* <p>
* A bam file in which the quality scores in each read have been recalibrated.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -R resources/Homo_sapiens_assembly18.fasta \
* -I my_reads.bam \
* -T TableRecalibration \
* -o my_reads.recal.bam \
* -recalFile my_reads.recal_data.csv
* </pre>
*
* @author rpoplin
* @since Nov 3, 2009
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@ -79,24 +98,54 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
@Input(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file")
public File RECAL_FILE = new File("output.recal_data.csv");
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="output_bam", shortName="outputBam", doc="Please use --out instead", required=false)
@Deprecated
protected String outbam;
@Output(doc="The output BAM file", required=true)
/**
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/
@Input(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the input covariates table recalibration .csv file")
public File RECAL_FILE = null;
/**
* A new bam file in which the quality scores in each read have been recalibrated. The alignment of the reads is left untouched.
*/
@Output(doc="The output recalibrated BAM file", required=true)
private StingSAMFileWriter OUTPUT_BAM = null;
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
/**
* TableRacalibration accepts a --preserve_qscores_less_than / -pQ <Q> flag that instructs TableRecalibration to not modify
* quality scores less than <Q> but rather just write them out unmodified in the recalibrated BAM file. This is useful
* because Solexa writes Q2 and Q3 bases when the machine has really gone wrong. This would be fine in and of itself,
* but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect,
* your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases
* are unmodified during recalibration, so they don't get inappropriately evaluated.
*/
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
private int PRESERVE_QSCORES_LESS_THAN = 5;
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1")
/**
* By default TableRecalibration applies a Yates' correction to account for overfitting when it calculates the empirical
* quality score, in particular, ( # mismatches + 1 ) / ( # observations + 1 ). TableRecalibration accepts a --smoothing / -sm <int>
* argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example,
* --smoothing 15 for a large amount of smoothing.
*/
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
private int SMOOTHING = 1;
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default=50")
/**
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
* by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
*/
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores")
private int MAX_QUALITY_SCORE = 50;
/**
* By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun
* the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag.
*/
@Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read")
private boolean DO_NOT_WRITE_OQ = false;