diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java index 8ade061c8..364fa1412 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java @@ -74,28 +74,44 @@ import java.io.File; import java.util.*; /** - * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified during VariantRecalibration + * Apply a score cutoff to filter variants based on a recalibration table * *

- * Using the tranche file generated by the previous step the ApplyRecalibration walker looks at each variant's VQSLOD value + * This tool performs the second pass in a two-stage process called VQSR; the first pass is performed by the + * VariantRecalibrator tool. + * In brief, the first pass consists of creating a Gaussian mixture model by looking at the distribution of annotation + * values over a high quality subset of the input call set, and then scoring all input variants according to the model. + * The second pass consists of filtering variants based on score cutoffs identified in the first pass. + *

+ * + *

+ * Using the tranche file and recalibration table generated by the previous step, the ApplyRecalibration tool looks at each variant's VQSLOD value * and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level - * have their filter field annotated with its tranche level. This will result in a call set that simultaneously is filtered - * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a - * slightly lower quality level. + * have their FILTER field annotated with the corresponding tranche level. This will result in a call set that is filtered + * to the desired level but retains the information necessary to increase sensitivity if needed.

+ * + *

To be clear, please note that by "filtered", we mean that variants failing the requested tranche cutoff are marked + * as filtered in the output VCF; they are not discarded.

+ * + *

VQSR is probably the hardest part of the Best Practices to get right, so be sure to read the + * method documentation, + * parameter recommendations and + * tutorial to really understand what these + * tools do and how to use them for best results on your own data.

* *

Input

- *

- * The input raw variants to be recalibrated. - *

- * The recalibration table file in VCF format that was generated by the VariantRecalibrator walker. - *

- * The tranches file that was generated by the VariantRecalibrator walker. + *

* *

Output

- *

- * A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level. + *

* - *

Examples

+ *

Example for filtering SNPs

*
  * java -Xmx3g -jar GenomeAnalysisTK.jar \
  *   -T ApplyRecalibration \
@@ -108,6 +124,16 @@ import java.util.*;
  *   -o path/to/output.recalibrated.filtered.vcf
  * 
* + *

Caveats

+ * + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @@ -139,7 +165,6 @@ public class ApplyRecalibration extends RodWalker implements T ///////////////////////////// // Command Line Arguments ///////////////////////////// - @Advanced @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) protected Double TS_FILTER_LEVEL = null; @Advanced diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java index e5c7c248b..4a3a3535d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java @@ -80,37 +80,47 @@ import java.io.PrintStream; import java.util.*; /** - * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants. + * Build a recalibration model to score variant quality for filtering purposes * *

- * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker. + * This tool performs the first pass in a two-stage process called VQSR; the second pass is performed by the + * ApplyRecalibration tool. + * In brief, the first pass consists of creating a Gaussian mixture model by looking at the distribution of annotation + * values over a high quality subset of the input call set, and then scoring all input variants according to the model. + * The second pass consists of filtering variants based on score cutoffs identified in the first pass. *

* *

* The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. * You can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship - * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic + * between SNP call annotations (such as QD, MQ, and ReadPosRankSum) and the probability that a SNP is a true genetic * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided - * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive + * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array (in humans). This adaptive * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. *

* + *

VQSR is probably the hardest part of the Best Practices to get right, so be sure to read the + * method documentation, + * parameter recommendations and + * tutorial to really understand what these + * tools do and how to use them for best results on your own data.

+ * *

Inputs

- *

- * The input raw variants to be recalibrated. - *

- * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. + *

* *

Output

- *

- * A recalibration table file in VCF format that is used by the ApplyRecalibration walker. - *

- * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. + *

* - *

Example

+ *

Example for recalibrating SNPs in exome data

*
  * java -Xmx4g -jar GenomeAnalysisTK.jar \
  *   -T VariantRecalibrator \
@@ -118,22 +128,25 @@ import java.util.*;
  *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
  *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
  *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ *   -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.vcf \
  *   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
- *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \
+ *   -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an InbreedingCoeff \
  *   -mode SNP \
  *   -recalFile path/to/output.recal \
  *   -tranchesFile path/to/output.tranches \
  *   -rscriptFile path/to/output.plots.R
  * 
* - *

Caveat

+ *

Caveats

* * @@ -210,7 +223,7 @@ public class VariantRecalibrator extends RodWalker