Merge branch 'master' into help

This commit is contained in:
Mark DePristo 2011-08-18 14:05:33 -04:00
commit f2f51e35e3
5 changed files with 112 additions and 28 deletions

View File

@ -28,7 +28,6 @@ package org.broadinstitute.sting.analyzecovariates;
import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate; import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum; import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;

View File

@ -52,7 +52,7 @@ import java.util.ResourceBundle;
import java.util.regex.Pattern; import java.util.regex.Pattern;
/** /**
* Second pass of the recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. * Second pass of the base quality score recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
* *
* <p> * <p>
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each

View File

@ -45,10 +45,43 @@ import java.io.FileNotFoundException;
import java.util.*; import java.util.*;
/** /**
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified during VariantRecalibration
*
* <p>
* Using the tranche file generated by the previous step the ApplyRecalibration walker looks at each variant's VQSLOD value
* and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level
* have their filter field annotated with its tranche level. This will result in a call set that simultaneously is filtered
* to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a
* slightly lower quality level.
*
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration
*
* <h2>Input</h2>
* <p>
* The input raw variants to be recalibrated.
* <p>
* The recalibration table file in CSV format that was generated by the VariantRecalibrator walker.
* <p>
* The tranches file that was generated by the VariantRecalibrator walker.
*
* <h2>Output</h2>
* <p>
* A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level.
*
* <h2>Examples</h2>
* <pre>
* java -Xmx3g -jar GenomeAnalysisTK.jar \
* -T ApplyRecalibration \
* -R reference/human_g1k_v37.fasta \
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
* --ts_filter_level 99.0 \
* -tranchesFile path/to/output.tranches \
* -recalFile path/to/output.recal \
* -o path/to/output.recalibrated.filtered.vcf
* </pre>
* *
* @author rpoplin
* @since Mar 14, 2011
*/ */
public class ApplyRecalibration extends RodWalker<Integer, Integer> { public class ApplyRecalibration extends RodWalker<Integer, Integer> {
@ -57,11 +90,11 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
// Inputs // Inputs
///////////////////////////// /////////////////////////////
/** /**
* The raw input variants to be recalibrated. * These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling.
*/ */
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
public List<RodBinding<VariantContext>> input; public List<RodBinding<VariantContext>> input;
@Input(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true) @Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true)
private File RECAL_FILE; private File RECAL_FILE;
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true) @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
private File TRANCHES_FILE; private File TRANCHES_FILE;
@ -69,7 +102,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
///////////////////////////// /////////////////////////////
// Outputs // Outputs
///////////////////////////// /////////////////////////////
@Output( doc="The output filtered, recalibrated VCF file", required=true) @Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value", required=true)
private VCFWriter vcfWriter = null; private VCFWriter vcfWriter = null;
///////////////////////////// /////////////////////////////
@ -77,7 +110,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
///////////////////////////// /////////////////////////////
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false)
private double TS_FILTER_LEVEL = 99.0; private double TS_FILTER_LEVEL = 99.0;
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false) @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null; private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false) @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false)
public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP; public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP;

View File

@ -45,10 +45,54 @@ import java.io.PrintStream;
import java.util.*; import java.util.*;
/** /**
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants.
*
* <p>
* This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker.
*
* <p>
* The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set.
* One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call.
* The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship
* between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the the probability that a SNP is a true genetic
* variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided
* as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive
* error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the
* probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is
* the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model.
*
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration
*
* <h2>Input</h2>
* <p>
* The input raw variants to be recalibrated.
* <p>
* Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below.
*
* <h2>Output</h2>
* <p>
* A recalibration table file in CSV format that is used by the ApplyRecalibration walker.
* <p>
* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -T VariantRecalibrator \
* -R reference/human_g1k_v37.fasta \
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
* -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
* -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
* -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
* -known:prior=8.0 dbsnp_132.b37.vcf \
* -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
* -recalFile path/to/output.recal \
* -tranchesFile path/to/output.tranches \
* -rscriptFile path/to/output.plots.R
* </pre>
* *
* User: rpoplin
* Date: 3/12/11
*/ */
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> { public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> {
@ -62,36 +106,32 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Inputs // Inputs
///////////////////////////// /////////////////////////////
/** /**
* The raw input variants to be recalibrated. * These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling.
*/ */
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
public List<RodBinding<VariantContext>> input; public List<RodBinding<VariantContext>> input;
/** /**
* A list of training variants used to train the Gaussian mixture model.
*
* Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model. * Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
*/ */
@Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true) @Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
public List<RodBinding<VariantContext>> training; public List<RodBinding<VariantContext>> training;
/** /**
* A list of true variants to be used when deciding the truth sensitivity cut of the final callset.
*
* When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used. * When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
* Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example. * Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
*/ */
@Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true) @Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
public List<RodBinding<VariantContext>> truth; public List<RodBinding<VariantContext>> truth;
/** /**
* A list of known variants to be used for metric comparison purposes.
*
* The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes. * The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
* The output metrics are stratified by known status in order to aid in comparisons with other call sets. * The output metrics are stratified by known status in order to aid in comparisons with other call sets.
*/ */
@Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false) @Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
public List<RodBinding<VariantContext>> known = Collections.emptyList(); public List<RodBinding<VariantContext>> known = Collections.emptyList();
/** /**
* A list of known bad variants used to supplement training the negative model.
*
* In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list * In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
* with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example). * with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
*/ */
@ -109,13 +149,25 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
///////////////////////////// /////////////////////////////
// Additional Command Line Arguments // Additional Command Line Arguments
///////////////////////////// /////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false) /**
* The expected transition / tranversion ratio of true novel variants in your targeted region (whole genome, exome, specific
* genes), which varies greatly by the CpG and GC content of the region. See expected Ti/Tv ratios section of the GATK best
* practices wiki documentation for more information. Normal whole genome values are 2.15 and for whole exome 3.2. Note
* that this parameter is used for display purposes only and isn't used anywhere in the algorithm!
*/
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on the optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
private double TARGET_TITV = 2.15; private double TARGET_TITV = 2.15;
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true) @Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
private String[] USE_ANNOTATIONS = null; private String[] USE_ANNOTATIONS = null;
/**
* Add truth sensitivity slices through the call set at the given values. The default values are 100.0, 99.9, 99.0, and 90.0
* which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible
* sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%.
*/
@Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false) @Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0}; private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0};
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false) @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null; private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName="path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false) @Argument(fullName="path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
private String PATH_TO_RSCRIPT = "Rscript"; private String PATH_TO_RSCRIPT = "Rscript";
@ -123,7 +175,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private String RSCRIPT_FILE = null; private String RSCRIPT_FILE = null;
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false) @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
private String PATH_TO_RESOURCES = "public/R/"; private String PATH_TO_RESOURCES = "public/R/";
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in plots", required=false) @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in the model reporting plots", required=false)
private double TS_FILTER_LEVEL = 99.0; private double TS_FILTER_LEVEL = 99.0;
///////////////////////////// /////////////////////////////

View File

@ -53,14 +53,14 @@ public class VariantRecalibratorArgumentCollection {
public double STD_THRESHOLD = 14.0; public double STD_THRESHOLD = 14.0;
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model.", required=false) @Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model.", required=false)
public double QUAL_THRESHOLD = 80.0; public double QUAL_THRESHOLD = 80.0;
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false) @Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in the variational Bayes algorithm.", required=false)
public double SHRINKAGE = 1.0; public double SHRINKAGE = 1.0;
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algorithm.", required=false) @Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in the variational Bayes algorithm.", required=false)
public double DIRICHLET_PARAMETER = 0.001; public double DIRICHLET_PARAMETER = 0.001;
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in variational Bayes algorithm.", required=false) @Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in the variational Bayes algorithm.", required=false)
public double PRIOR_COUNTS = 20.0; public double PRIOR_COUNTS = 20.0;
@Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false) @Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
public double PERCENT_BAD_VARIANTS = 0.03; public double PERCENT_BAD_VARIANTS = 0.03;
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.", required=false) @Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad argument if necessary.", required=false)
public int MIN_NUM_BAD_VARIANTS = 2000; public int MIN_NUM_BAD_VARIANTS = 2000;
} }