Merge branch 'master' into help

2011-08-18 14:05:33 -04:00 · 2011-08-18 14:05:33 -04:00 · f2f51e35e3
parent ce009bd4a4 7c4ce6d969
commit f2f51e35e3
5 changed files with 112 additions and 28 deletions
--- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
+++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
@ -28,7 +28,6 @@ package org.broadinstitute.sting.analyzecovariates;
 import org.broadinstitute.sting.commandline.Argument;
 import org.broadinstitute.sting.commandline.CommandLineProgram;
 import org.broadinstitute.sting.commandline.Input;
-import org.broadinstitute.sting.gatk.CommandLineGATK;
 import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
 import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
 import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
@ -52,7 +52,7 @@ import java.util.ResourceBundle;
 import java.util.regex.Pattern;

 /**
- * Second pass of the recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
+ * Second pass of the base quality score recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
 *
 * <p>
 * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
@ -45,10 +45,43 @@ import java.io.FileNotFoundException;
 import java.util.*;

 /**
- * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
+ * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified during VariantRecalibration
+ *
+ * <p>
+ * Using the tranche file generated by the previous step the ApplyRecalibration walker looks at each variant's VQSLOD value
+ * and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level
+ * have their filter field annotated with the corresponding tranche level. This will result in a call set that simultaneously is filtered
+ * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a
+ * slightly lower quality level.
+ *
+ * <p>
+ * See the GATK wiki for a tutorial and example recalibration accuracy plots.
+ * http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration
+ *
+ * <h2>Input</h2>
+ * <p>
+ * The input raw variants to be recalibrated.
+ * <p>
+ * The recalibration table file in CSV format that was generated by the VariantRecalibrator walker.
+ * <p>
+ * The tranches file that was generated by the VariantRecalibrator walker.
+ *
+ * <h2>Output</h2>
+ * <p>
+ * A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level.
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ * java -Xmx3g -jar GenomeAnalysisTK.jar \
+ *   -T ApplyRecalibration \
+ *   -R reference/human_g1k_v37.fasta \
+ *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
+ *   --ts_filter_level 99.0 \
+ *   -tranchesFile path/to/output.tranches \
+ *   -recalFile path/to/output.recal \
+ *   -o path/to/output.recalibrated.filtered.vcf
+ * </pre>
 *
- * @author rpoplin
- * @since Mar 14, 2011
 */

 public class ApplyRecalibration extends RodWalker<Integer, Integer> {
@ -57,11 +90,11 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
    // Inputs
    /////////////////////////////
    /**
-     * The raw input variants to be recalibrated.
+     * These calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling.
     */
    @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
    public List<RodBinding<VariantContext>> input;
-    @Input(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true)
+    @Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true)
    private File RECAL_FILE;
    @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
    private File TRANCHES_FILE;
@ -69,7 +102,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
    /////////////////////////////
    // Outputs
    /////////////////////////////
-    @Output( doc="The output filtered, recalibrated VCF file", required=true)
+    @Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value", required=true)
    private VCFWriter vcfWriter = null;

    /////////////////////////////
@ -77,7 +110,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
    /////////////////////////////
    @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false)
    private double TS_FILTER_LEVEL = 99.0;
-    @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
+    @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
    private String[] IGNORE_INPUT_FILTERS = null;
    @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false)
    public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP;
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
@ -45,10 +45,54 @@ import java.io.PrintStream;
 import java.util.*;

 /**
- * Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score
+ * Create a Gaussian mixture model by looking at the annotation values over a high-quality subset of the input call set and then evaluate all input variants.
+ *
+ * <p>
+ * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker.
+ *
+ * <p>
+ * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set.
+ * One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call.
+ * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship
+ * between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the probability that a SNP is a true genetic
+ * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided
+ * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive
+ * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the
+ * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is
+ * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model.
+ *
+ * <p>
+ * See the GATK wiki for a tutorial and example recalibration accuracy plots.
+ * http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration
+ *
+ * <h2>Input</h2>
+ * <p>
+ * The input raw variants to be recalibrated.
+ * <p>
+ * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below.
+ *
+ * <h2>Output</h2>
+ * <p>
+ * A recalibration table file in CSV format that is used by the ApplyRecalibration walker.
+ * <p>
+ * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.  
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -T VariantRecalibrator \
+ *   -R reference/human_g1k_v37.fasta \
+ *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
+ *   -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
+ *   -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
+ *   -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ *   -known:prior=8.0 dbsnp_132.b37.vcf \
+ *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
+ *   -recalFile path/to/output.recal \
+ *   -tranchesFile path/to/output.tranches \
+ *   -rscriptFile path/to/output.plots.R
+ * </pre>
 *
- * User: rpoplin
- * Date: 3/12/11
 */

 public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> {
@ -62,36 +106,32 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    // Inputs
    /////////////////////////////
    /**
-     * The raw input variants to be recalibrated.
+     * These calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling.
     */
    @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
    public List<RodBinding<VariantContext>> input;
+
    /**
-     * A list of training variants used to train the Gaussian mixture model.
-     *
     * Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
     */
    @Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
    public List<RodBinding<VariantContext>> training;
+
    /**
-     * A list of true variants to be used when deciding the truth sensitivity cut of the final callset.
-     *
     * When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
     * Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
     */
    @Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
    public List<RodBinding<VariantContext>> truth;
+
    /**
-     * A list of known variants to be used for metric comparison purposes.
-     *
     * The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
     * The output metrics are stratified by known status in order to aid in comparisons with other call sets.
     */
    @Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
    public List<RodBinding<VariantContext>> known = Collections.emptyList();
+
    /**
-     * A list of known bad variants used to supplement training the negative model.
-     *
     * In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
     * with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
     */
@ -109,13 +149,25 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    /////////////////////////////
    // Additional Command Line Arguments
    /////////////////////////////
-    @Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
+    /**
+     * The expected transition / transversion ratio of true novel variants in your targeted region (whole genome, exome, specific
+     * genes), which varies greatly by the CpG and GC content of the region. See expected Ti/Tv ratios section of the GATK best
+     * practices wiki documentation for more information. Normal whole genome values are 2.15 and for whole exome 3.2. Note
+     * that this parameter is used for display purposes only and isn't used anywhere in the algorithm!
+     */
+    @Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on the optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
    private double TARGET_TITV = 2.15;
    @Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
    private String[] USE_ANNOTATIONS = null;
+
+    /**
+     * Add truth sensitivity slices through the call set at the given values. The default values are 100.0, 99.9, 99.0, and 90.0
+     * which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible
+     * sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%.
+     */
    @Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
    private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0};
-    @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
+    @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
    private String[] IGNORE_INPUT_FILTERS = null;
    @Argument(fullName="path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
    private String PATH_TO_RSCRIPT = "Rscript";
@ -123,7 +175,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    private String RSCRIPT_FILE = null;
    @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
    private String PATH_TO_RESOURCES = "public/R/";
-    @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in plots", required=false)
+    @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in the model reporting plots", required=false)
    private double TS_FILTER_LEVEL = 99.0;

    /////////////////////////////
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java
@ -53,14 +53,14 @@ public class VariantRecalibratorArgumentCollection {
    public double STD_THRESHOLD = 14.0;
    @Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model.", required=false)
    public double QUAL_THRESHOLD = 80.0;
-    @Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
+    @Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in the variational Bayes algorithm.", required=false)
    public double SHRINKAGE = 1.0;
-    @Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algorithm.", required=false)
+    @Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in the variational Bayes algorithm.", required=false)
    public double DIRICHLET_PARAMETER = 0.001;
-    @Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in variational Bayes algorithm.", required=false)
+    @Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in the variational Bayes algorithm.", required=false)
    public double PRIOR_COUNTS = 20.0;
    @Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
    public double PERCENT_BAD_VARIANTS = 0.03;
-    @Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.", required=false)
+    @Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad argument if necessary.", required=false)
    public int MIN_NUM_BAD_VARIANTS = 2000;
 }