diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java index 9b316f077..2ff8aa979 100755 --- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java +++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.analyzecovariates; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate; import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum; import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index af7148803..85166d30d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -52,7 +52,7 @@ import java.util.ResourceBundle; import java.util.regex.Pattern; /** - * Second pass of the recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. 
+ * Second pass of the base quality score recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. * *
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index abe27e483..16f1abf1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -45,10 +45,43 @@ import java.io.FileNotFoundException; import java.util.*; /** - * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration + * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified during VariantRecalibration + * + *
+ * Using the tranche file generated by the previous step, the ApplyRecalibration walker looks at each variant's VQSLOD value + * and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level + * have their filter field annotated with their tranche level. This will result in a call set that simultaneously is filtered + * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a + * slightly lower quality level. + * + *
+ * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration + * + *
+ * The input raw variants to be recalibrated. + *
+ * The recalibration table file in CSV format that was generated by the VariantRecalibrator walker. + *
+ * The tranches file that was generated by the VariantRecalibrator walker. + * + *
+ * A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level. + * + *
+ * java -Xmx3g -jar GenomeAnalysisTK.jar \ + * -T ApplyRecalibration \ + * -R reference/human_g1k_v37.fasta \ + * -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \ + * --ts_filter_level 99.0 \ + * -tranchesFile path/to/output.tranches \ + * -recalFile path/to/output.recal \ + * -o path/to/output.recalibrated.filtered.vcf + ** - * @author rpoplin - * @since Mar 14, 2011 */ public class ApplyRecalibration extends RodWalker
+ * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker. + * + *
+ * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. + * One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. + * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship + * between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the probability that a SNP is a true genetic + * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided + * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive + * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the + * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is + * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. + * + *
+ * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration + * + *
+ * The input raw variants to be recalibrated. + *
+ * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. + * + *
+ * A recalibration table file in CSV format that is used by the ApplyRecalibration walker. + *
+ * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. + * + *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \ + * -T VariantRecalibrator \ + * -R reference/human_g1k_v37.fasta \ + * -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \ + * -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \ + * -training:prior=15.0 hapmap_3.3.b37.sites.vcf \ + * -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \ + * -known:prior=8.0 dbsnp_132.b37.vcf \ + * -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \ + * -recalFile path/to/output.recal \ + * -tranchesFile path/to/output.tranches \ + * -rscriptFile path/to/output.plots.R + ** - * User: rpoplin - * Date: 3/12/11 */ public class VariantRecalibrator extends RodWalker