Updated VQSR tool docs

2014-12-16 17:26:31 -05:00 · 2014-12-16 17:26:31 -05:00 · b0e615251b
parent 4a2ac38308
commit b0e615251b
2 changed files with 71 additions and 33 deletions
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java
@ -74,28 +74,44 @@ import java.io.File;
 import java.util.*;

 /**
- * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified during VariantRecalibration
+ * Apply a score cutoff to filter variants based on a recalibration table
 *
 * <p>
- * Using the tranche file generated by the previous step the ApplyRecalibration walker looks at each variant's VQSLOD value
+ * This tool performs the second pass in a two-stage process called VQSR; the first pass is performed by the
+ * <a href='https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_variantrecalibration_VariantRecalibrator.php'>VariantRecalibrator</a> tool.
+ * In brief, the first pass consists of creating a Gaussian mixture model by looking at the distribution of annotation
+ * values over a high quality subset of the input call set, and then scoring all input variants according to the model.
+ * The second pass consists of filtering variants based on score cutoffs identified in the first pass.
+ *</p>
+ *
+ * <p>
+ * Using the tranche file and recalibration table generated by the previous step, the ApplyRecalibration tool looks at each variant's VQSLOD value
 * and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level
- * have their filter field annotated with its tranche level. This will result in a call set that simultaneously is filtered
- * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a
- * slightly lower quality level.
+ * have their FILTER field annotated with the corresponding tranche level. This will result in a call set that is filtered
+ * to the desired level but retains the information necessary to increase sensitivity if needed.</p>
+ *
+ * <p>To be clear, please note that by "filtered", we mean that variants failing the requested tranche cutoff are <b>marked
+ * as filtered</b> in the output VCF; they are <b>not discarded</b>.</p>
+ *
+ * <p>VQSR is probably the hardest part of the Best Practices to get right, so be sure to read the
+ * <a href='https://www.broadinstitute.org/gatk/guide/article?id=39'>method documentation</a>,
+ * <a href='https://www.broadinstitute.org/gatk/guide/article?id=1259'>parameter recommendations</a> and
+ * <a href='https://www.broadinstitute.org/gatk/guide/article?id=2805'>tutorial</a> to really understand what these
+ * tools and how to use them for best results on your own data.</p>
 *
 * <h3>Input</h3>
- * <p>
- * The input raw variants to be recalibrated.
- * <p>
- * The recalibration table file in VCF format that was generated by the VariantRecalibrator walker.
- * <p>
- * The tranches file that was generated by the VariantRecalibrator walker.
+ * <ul>
+ * <li>The raw input variants to be filtered.</li>
+ * <li>The recalibration table file that was generated by the VariantRecalibrator tool.</li>
+ * <li>The tranches file that was generated by the VariantRecalibrator tool.</li>
+ * </ul>
 *
 * <h3>Output</h3>
- * <p>
- * A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level.
+ * <ul>
+ * <li>A recalibrated VCF file in which each variant of the requested type is annotated with its VQSLOD and marked as filtered if the score is below the desired quality level.</li>
+ * </ul>
 *
- * <h3>Examples</h3>
+ * <h3>Example for filtering SNPs</h3>
 * <pre>
 * java -Xmx3g -jar GenomeAnalysisTK.jar \
 *   -T ApplyRecalibration \
@ -108,6 +124,16 @@ import java.util.*;
 *   -o path/to/output.recalibrated.filtered.vcf
 * </pre>
 *
+ * <h3>Caveats</h3>
+ *
+ * <ul>
+ * <li>The tranche values used in the example above is only a general example. You should determine the level of sensitivity
+ * that is appropriate for your specific project. Remember that higher sensitivity (more power to detect variants, yay!) comes
+ * at the cost of specificity (more false negatives, boo!). You have to choose at what point you want to set the tradeoff.</li>
+ * <li>In order to create the tranche reporting plots (which are only generated for SNPs, not indels!) Rscript needs to be
+ * in your environment PATH (this is the scripting version of R, not the interactive version).
+ * See <a target="r-project" href="http://www.r-project.org">http://www.r-project.org</a> for more info on how to download and install R.</li>
+ * </ul>
 */

@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} )
@ -139,7 +165,6 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
    /////////////////////////////
    // Command Line Arguments
    /////////////////////////////
-    @Advanced
    @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false)
    protected Double TS_FILTER_LEVEL = null;
    @Advanced
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java
@ -80,37 +80,47 @@ import java.io.PrintStream;
 import java.util.*;

 /**
- * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants.
+ * Build a recalibration model to score variant quality for filtering purposes
 *
 * <p>
- * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker.
+ * This tool performs the first pass in a two-stage process called VQSR; the second pass is performed by the
+ * <a href='https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_variantrecalibration_ApplyRecalibration.php'>ApplyRecalibration</a> tool.
+ * In brief, the first pass consists of creating a Gaussian mixture model by looking at the distribution of annotation
+ * values over a high quality subset of the input call set, and then scoring all input variants according to the model.
+ * The second pass consists of filtering variants based on score cutoffs identified in the first pass.
 *</p>
 *
 * <p>
 * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set.
 * You can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call.
 * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship
- * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic
+ * between SNP call annotations (such as QD, MQ, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic
 * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided
- * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive
+ * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array (in humans). This adaptive
 * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the
 * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is
 * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model.
 * </p>
 *
+ * <p>VQSR is probably the hardest part of the Best Practices to get right, so be sure to read the
+ * <a href='https://www.broadinstitute.org/gatk/guide/article?id=39'>method documentation</a>,
+ * <a href='https://www.broadinstitute.org/gatk/guide/article?id=1259'>parameter recommendations</a> and
+ * <a href='https://www.broadinstitute.org/gatk/guide/article?id=2805'>tutorial</a> to really understand what these
+ * tools and how to use them for best results on your own data.</p>
+ *
 * <h3>Inputs</h3>
- * <p>
- * The input raw variants to be recalibrated.
- * <p>
- * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below.
+ * <ul>
+ * <li>The input raw variants to be recalibrated.</li>
+ * <li>Known, truth, and training sets to be used by the algorithm.</li>
+ * </ul>
 *
 * <h3>Output</h3>
- * <p>
- * A recalibration table file in VCF format that is used by the ApplyRecalibration walker.
- * <p>
- * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.
+ * <ul>
+ * <li>A recalibration table file that will be used by the ApplyRecalibration tool.</li>
+ * <li>A tranches file which shows various metrics of the recalibration callset for slices of the data.</li>
+ * </ul>
 *
- * <h3>Example</h3>
+ * <h3>Example for recalibrating SNPs in exome data</h3>
 * <pre>
 * java -Xmx4g -jar GenomeAnalysisTK.jar \
 *   -T VariantRecalibrator \
@ -118,22 +128,25 @@ import java.util.*;
 *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
 *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
 *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ *   -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.vcf
 *   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
- *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \
+ *   -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an InbreedingCoeff \
 *   -mode SNP \
 *   -recalFile path/to/output.recal \
 *   -tranchesFile path/to/output.tranches \
 *   -rscriptFile path/to/output.plots.R
 * </pre>
 *
- * <h3>Caveat</h3>
+ * <h3>Caveats</h3>
 *
 * <ul>
 * <li>The values used in the example above are only meant to show how the command lines are composed.
 * They are not meant to be taken as specific recommendations of values to use in your own work, and they may be
 * different from the values cited elsewhere in our documentation. For the latest and greatest recommendations on
- * how to set parameter values for you own analyses, please read the Best Practices section of the documentation.</li>
- *
+ * how to set parameter values for you own analyses, please read the Best Practices section of the documentation,
+ * especially the <a href='https://www.broadinstitute.org/gatk/guide/article?id=1259'>FAQ document</a> on VQSR parameters.</li>
+ * <li>Whole genomes and exomes take slightly different parameters, so make sure you adapt your commands accordingly! See the documents linked above for details.</li>
+ * <li>If you work with small datasets (e.g. targeted capture experiments or small number of exomes), you will run into problems. Read the docs linked above for advice on how to deal with those issues.</li>
 * <li>In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version).
 * See <a target="r-project" href="http://www.r-project.org">http://www.r-project.org</a> for more info on how to download and install R.</li>
 * </ul>
@ -210,7 +223,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
     * which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible
     * sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%.
     */
-    @Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
+    @Argument(fullName="TStranche", shortName="tranche", doc="The levels of truth sensitivity at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
    private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0};
    /**
     * For this to work properly, the -ignoreFilter argument should also be applied to the ApplyRecalibration command.