Merge branch 'master' into help

This commit is contained in:
Mark DePristo 2011-08-18 14:05:33 -04:00
commit f2f51e35e3
5 changed files with 112 additions and 28 deletions

View File

@ -28,7 +28,6 @@ package org.broadinstitute.sting.analyzecovariates;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;

View File

@ -52,7 +52,7 @@ import java.util.ResourceBundle;
import java.util.regex.Pattern;
/**
* Second pass of the recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
* Second pass of the base quality score recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
*
* <p>
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each

View File

@ -45,10 +45,43 @@ import java.io.FileNotFoundException;
import java.util.*;
/**
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified during VariantRecalibration
*
* <p>
* Using the tranche file generated by the previous step the ApplyRecalibration walker looks at each variant's VQSLOD value
* and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level
* have their filter field annotated with its tranche level. This will result in a call set that simultaneously is filtered
* to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a
* slightly lower quality level.
*
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration
*
* <h2>Input</h2>
* <p>
* The input raw variants to be recalibrated.
* <p>
* The recalibration table file in CSV format that was generated by the VariantRecalibrator walker.
* <p>
* The tranches file that was generated by the VariantRecalibrator walker.
*
* <h2>Output</h2>
* <p>
* A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level.
*
* <h2>Examples</h2>
* <pre>
* java -Xmx3g -jar GenomeAnalysisTK.jar \
* -T ApplyRecalibration \
* -R reference/human_g1k_v37.fasta \
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
* --ts_filter_level 99.0 \
* -tranchesFile path/to/output.tranches \
* -recalFile path/to/output.recal \
* -o path/to/output.recalibrated.filtered.vcf
* </pre>
*
* @author rpoplin
* @since Mar 14, 2011
*/
public class ApplyRecalibration extends RodWalker<Integer, Integer> {
@ -57,11 +90,11 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
// Inputs
/////////////////////////////
/**
* The raw input variants to be recalibrated.
* These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling.
*/
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
public List<RodBinding<VariantContext>> input;
@Input(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true)
@Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true)
private File RECAL_FILE;
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
private File TRANCHES_FILE;
@ -69,7 +102,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
/////////////////////////////
// Outputs
/////////////////////////////
@Output( doc="The output filtered, recalibrated VCF file", required=true)
@Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value", required=true)
private VCFWriter vcfWriter = null;
/////////////////////////////
@ -77,7 +110,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
/////////////////////////////
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false)
private double TS_FILTER_LEVEL = 99.0;
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false)
public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP;

View File

@ -45,10 +45,54 @@ import java.io.PrintStream;
import java.util.*;
/**
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score
* Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants.
*
* <p>
* This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker.
*
* <p>
* The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set.
* One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call.
* The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship
* between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the the probability that a SNP is a true genetic
* variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided
* as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive
* error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the
* probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is
* the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model.
*
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration
*
* <h2>Input</h2>
* <p>
* The input raw variants to be recalibrated.
* <p>
* Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below.
*
* <h2>Output</h2>
* <p>
* A recalibration table file in CSV format that is used by the ApplyRecalibration walker.
* <p>
* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -T VariantRecalibrator \
* -R reference/human_g1k_v37.fasta \
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
* -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
* -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
* -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
* -known:prior=8.0 dbsnp_132.b37.vcf \
* -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
* -recalFile path/to/output.recal \
* -tranchesFile path/to/output.tranches \
* -rscriptFile path/to/output.plots.R
* </pre>
*
* User: rpoplin
* Date: 3/12/11
*/
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> {
@ -62,36 +106,32 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Inputs
/////////////////////////////
/**
* The raw input variants to be recalibrated.
* These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling.
*/
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
public List<RodBinding<VariantContext>> input;
/**
* A list of training variants used to train the Gaussian mixture model.
*
* Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
*/
@Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
public List<RodBinding<VariantContext>> training;
/**
* A list of true variants to be used when deciding the truth sensitivity cut of the final callset.
*
* When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
* Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
*/
@Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
public List<RodBinding<VariantContext>> truth;
/**
* A list of known variants to be used for metric comparison purposes.
*
* The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
* The output metrics are stratified by known status in order to aid in comparisons with other call sets.
*/
@Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
public List<RodBinding<VariantContext>> known = Collections.emptyList();
/**
* A list of known bad variants used to supplement training the negative model.
*
* In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
* with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
*/
@ -109,13 +149,25 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
/////////////////////////////
// Additional Command Line Arguments
/////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
/**
* The expected transition / tranversion ratio of true novel variants in your targeted region (whole genome, exome, specific
* genes), which varies greatly by the CpG and GC content of the region. See expected Ti/Tv ratios section of the GATK best
* practices wiki documentation for more information. Normal whole genome values are 2.15 and for whole exome 3.2. Note
* that this parameter is used for display purposes only and isn't used anywhere in the algorithm!
*/
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on the optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
private double TARGET_TITV = 2.15;
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
private String[] USE_ANNOTATIONS = null;
/**
* Add truth sensitivity slices through the call set at the given values. The default values are 100.0, 99.9, 99.0, and 90.0
* which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible
* sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%.
*/
@Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0};
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName="path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
private String PATH_TO_RSCRIPT = "Rscript";
@ -123,7 +175,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private String RSCRIPT_FILE = null;
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
private String PATH_TO_RESOURCES = "public/R/";
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in plots", required=false)
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in the model reporting plots", required=false)
private double TS_FILTER_LEVEL = 99.0;
/////////////////////////////

View File

@ -53,14 +53,14 @@ public class VariantRecalibratorArgumentCollection {
public double STD_THRESHOLD = 14.0;
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model.", required=false)
public double QUAL_THRESHOLD = 80.0;
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in the variational Bayes algorithm.", required=false)
public double SHRINKAGE = 1.0;
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algorithm.", required=false)
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in the variational Bayes algorithm.", required=false)
public double DIRICHLET_PARAMETER = 0.001;
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in variational Bayes algorithm.", required=false)
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in the variational Bayes algorithm.", required=false)
public double PRIOR_COUNTS = 20.0;
@Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
public double PERCENT_BAD_VARIANTS = 0.03;
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.", required=false)
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad argument if necessary.", required=false)
public int MIN_NUM_BAD_VARIANTS = 2000;
}