From 7c4ce6d9696e02c87ac37824d6885d666b19c98e Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 18 Aug 2011 14:00:39 -0400 Subject: [PATCH 2/2] Added GATKDocs for the VQSR walkers. --- .../analyzecovariates/AnalyzeCovariates.java | 1 - .../TableRecalibrationWalker.java | 2 +- .../ApplyRecalibration.java | 47 +++++++++-- .../VariantRecalibrator.java | 82 +++++++++++++++---- ...VariantRecalibratorArgumentCollection.java | 8 +- 5 files changed, 112 insertions(+), 28 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java index 9b316f077..2ff8aa979 100755 --- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java +++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.analyzecovariates; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate; import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum; import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index af7148803..85166d30d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -52,7 +52,7 @@ import java.util.ResourceBundle; import java.util.regex.Pattern; /** - * Second pass of the recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. + * Second pass of the base quality score recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. * *

* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index abe27e483..16f1abf1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -45,10 +45,43 @@ import java.io.FileNotFoundException; import java.util.*; /** - * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration + * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified during VariantRecalibration + * + *

+ * Using the tranche file generated by the previous step the ApplyRecalibration walker looks at each variant's VQSLOD value + * and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level + * have their filter field annotated with its tranche level. This will result in a call set that simultaneously is filtered + * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a + * slightly lower quality level. + * + *

+ * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration + * + *

Input

+ *

+ * The input raw variants to be recalibrated. + *

+ * The recalibration table file in CSV format that was generated by the VariantRecalibrator walker. + *

+ * The tranches file that was generated by the VariantRecalibrator walker. + * + *

Output

+ *

+ * A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level. + * + *

Examples

+ *
+ * java -Xmx3g -jar GenomeAnalysisTK.jar \
+ *   -T ApplyRecalibration \
+ *   -R reference/human_g1k_v37.fasta \
+ *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
+ *   --ts_filter_level 99.0 \
+ *   -tranchesFile path/to/output.tranches \
+ *   -recalFile path/to/output.recal \
+ *   -o path/to/output.recalibrated.filtered.vcf
+ * 
* - * @author rpoplin - * @since Mar 14, 2011 */ public class ApplyRecalibration extends RodWalker { @@ -57,11 +90,11 @@ public class ApplyRecalibration extends RodWalker { // Inputs ///////////////////////////// /** - * The raw input variants to be recalibrated. + * These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling. */ @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) public List> input; - @Input(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true) + @Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true) private File RECAL_FILE; @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true) private File TRANCHES_FILE; @@ -69,7 +102,7 @@ public class ApplyRecalibration extends RodWalker { ///////////////////////////// // Outputs ///////////////////////////// - @Output( doc="The output filtered, recalibrated VCF file", required=true) + @Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value", required=true) private VCFWriter vcfWriter = null; ///////////////////////////// @@ -77,7 +110,7 @@ public class ApplyRecalibration extends RodWalker { ///////////////////////////// @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) private double TS_FILTER_LEVEL = 99.0; - @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false) + @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false) private String[] IGNORE_INPUT_FILTERS = null; @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false) public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index da9da936b..d81a57aad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -45,10 +45,54 @@ import java.io.PrintStream; import java.util.*; /** - * Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score + * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants. + * + *

+ * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker. + * + *

+ * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. + * One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. + * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship + * between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the the probability that a SNP is a true genetic + * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided + * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive + * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the + * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is + * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. + * + *

+ * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration + * + *

Input

+ *

+ * The input raw variants to be recalibrated. + *

+ * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. + * + *

Output

+ *

+ * A recalibration table file in CSV format that is used by the ApplyRecalibration walker. + *

+ * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. + * + *

Examples

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -T VariantRecalibrator \
+ *   -R reference/human_g1k_v37.fasta \
+ *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
+ *   -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
+ *   -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
+ *   -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ *   -known:prior=8.0 dbsnp_132.b37.vcf \
+ *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
+ *   -recalFile path/to/output.recal \
+ *   -tranchesFile path/to/output.tranches \
+ *   -rscriptFile path/to/output.plots.R
+ * 
* - * User: rpoplin - * Date: 3/12/11 */ public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { @@ -62,36 +106,32 @@ public class VariantRecalibrator extends RodWalker> input; + /** - * A list of training variants used to train the Gaussian mixture model. - * * Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model. */ @Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true) public List> training; + /** - * A list of true variants to be used when deciding the truth sensitivity cut of the final callset. - * * When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used. * Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example. */ @Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true) public List> truth; + /** - * A list of known variants to be used for metric comparison purposes. - * * The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes. * The output metrics are stratified by known status in order to aid in comparisons with other call sets. */ @Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false) public List> known = Collections.emptyList(); + /** - * A list of known bad variants used to supplement training the negative model. - * * In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list * with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example). */ @@ -109,13 +149,25 @@ public class VariantRecalibrator extends RodWalker