Merge branch 'master' into help
This commit is contained in:
commit
f2f51e35e3
|
|
@ -28,7 +28,6 @@ package org.broadinstitute.sting.analyzecovariates;
|
|||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@ import java.util.ResourceBundle;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Second pass of the recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
|
||||
* Second pass of the base quality score recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
|
||||
*
|
||||
* <p>
|
||||
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each
|
||||
|
|
|
|||
|
|
@ -45,10 +45,43 @@ import java.io.FileNotFoundException;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
|
||||
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified during VariantRecalibration
|
||||
*
|
||||
* <p>
|
||||
* Using the tranche file generated by the previous step the ApplyRecalibration walker looks at each variant's VQSLOD value
|
||||
* and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level
|
||||
* have their filter field annotated with its tranche level. This will result in a call set that simultaneously is filtered
|
||||
* to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a
|
||||
* slightly lower quality level.
|
||||
*
|
||||
* <p>
|
||||
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
|
||||
* http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The input raw variants to be recalibrated.
|
||||
* <p>
|
||||
* The recalibration table file in CSV format that was generated by the VariantRecalibrator walker.
|
||||
* <p>
|
||||
* The tranches file that was generated by the VariantRecalibrator walker.
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level.
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx3g -jar GenomeAnalysisTK.jar \
|
||||
* -T ApplyRecalibration \
|
||||
* -R reference/human_g1k_v37.fasta \
|
||||
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
|
||||
* --ts_filter_level 99.0 \
|
||||
* -tranchesFile path/to/output.tranches \
|
||||
* -recalFile path/to/output.recal \
|
||||
* -o path/to/output.recalibrated.filtered.vcf
|
||||
* </pre>
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Mar 14, 2011
|
||||
*/
|
||||
|
||||
public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
||||
|
|
@ -57,11 +90,11 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
// Inputs
|
||||
/////////////////////////////
|
||||
/**
|
||||
* The raw input variants to be recalibrated.
|
||||
* These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling.
|
||||
*/
|
||||
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
|
||||
public List<RodBinding<VariantContext>> input;
|
||||
@Input(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true)
|
||||
@Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true)
|
||||
private File RECAL_FILE;
|
||||
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
|
||||
private File TRANCHES_FILE;
|
||||
|
|
@ -69,7 +102,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
@Output( doc="The output filtered, recalibrated VCF file", required=true)
|
||||
@Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value", required=true)
|
||||
private VCFWriter vcfWriter = null;
|
||||
|
||||
/////////////////////////////
|
||||
|
|
@ -77,7 +110,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
/////////////////////////////
|
||||
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false)
|
||||
private double TS_FILTER_LEVEL = 99.0;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false)
|
||||
public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP;
|
||||
|
|
|
|||
|
|
@ -45,10 +45,54 @@ import java.io.PrintStream;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score
|
||||
* Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants.
|
||||
*
|
||||
* <p>
|
||||
* This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker.
|
||||
*
|
||||
* <p>
|
||||
* The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set.
|
||||
* One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call.
|
||||
* The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship
|
||||
* between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the the probability that a SNP is a true genetic
|
||||
* variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided
|
||||
* as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive
|
||||
* error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the
|
||||
* probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is
|
||||
* the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model.
|
||||
*
|
||||
* <p>
|
||||
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
|
||||
* http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The input raw variants to be recalibrated.
|
||||
* <p>
|
||||
* Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below.
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A recalibration table file in CSV format that is used by the ApplyRecalibration walker.
|
||||
* <p>
|
||||
* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar GenomeAnalysisTK.jar \
|
||||
* -T VariantRecalibrator \
|
||||
* -R reference/human_g1k_v37.fasta \
|
||||
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
|
||||
* -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
|
||||
* -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
|
||||
* -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
|
||||
* -known:prior=8.0 dbsnp_132.b37.vcf \
|
||||
* -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
|
||||
* -recalFile path/to/output.recal \
|
||||
* -tranchesFile path/to/output.tranches \
|
||||
* -rscriptFile path/to/output.plots.R
|
||||
* </pre>
|
||||
*
|
||||
* User: rpoplin
|
||||
* Date: 3/12/11
|
||||
*/
|
||||
|
||||
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> {
|
||||
|
|
@ -62,36 +106,32 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
// Inputs
|
||||
/////////////////////////////
|
||||
/**
|
||||
* The raw input variants to be recalibrated.
|
||||
* These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling.
|
||||
*/
|
||||
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
|
||||
public List<RodBinding<VariantContext>> input;
|
||||
|
||||
/**
|
||||
* A list of training variants used to train the Gaussian mixture model.
|
||||
*
|
||||
* Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
|
||||
*/
|
||||
@Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
|
||||
public List<RodBinding<VariantContext>> training;
|
||||
|
||||
/**
|
||||
* A list of true variants to be used when deciding the truth sensitivity cut of the final callset.
|
||||
*
|
||||
* When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
|
||||
* Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
|
||||
*/
|
||||
@Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
|
||||
public List<RodBinding<VariantContext>> truth;
|
||||
|
||||
/**
|
||||
* A list of known variants to be used for metric comparison purposes.
|
||||
*
|
||||
* The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
|
||||
* The output metrics are stratified by known status in order to aid in comparisons with other call sets.
|
||||
*/
|
||||
@Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
|
||||
public List<RodBinding<VariantContext>> known = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* A list of known bad variants used to supplement training the negative model.
|
||||
*
|
||||
* In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
|
||||
* with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
|
||||
*/
|
||||
|
|
@ -109,13 +149,25 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
/////////////////////////////
|
||||
// Additional Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
|
||||
/**
|
||||
* The expected transition / tranversion ratio of true novel variants in your targeted region (whole genome, exome, specific
|
||||
* genes), which varies greatly by the CpG and GC content of the region. See expected Ti/Tv ratios section of the GATK best
|
||||
* practices wiki documentation for more information. Normal whole genome values are 2.15 and for whole exome 3.2. Note
|
||||
* that this parameter is used for display purposes only and isn't used anywhere in the algorithm!
|
||||
*/
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on the optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
|
||||
private double TARGET_TITV = 2.15;
|
||||
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
|
||||
private String[] USE_ANNOTATIONS = null;
|
||||
|
||||
/**
|
||||
* Add truth sensitivity slices through the call set at the given values. The default values are 100.0, 99.9, 99.0, and 90.0
|
||||
* which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible
|
||||
* sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%.
|
||||
*/
|
||||
@Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
|
||||
private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0};
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Argument(fullName="path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
|
||||
private String PATH_TO_RSCRIPT = "Rscript";
|
||||
|
|
@ -123,7 +175,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
private String RSCRIPT_FILE = null;
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
|
||||
private String PATH_TO_RESOURCES = "public/R/";
|
||||
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in plots", required=false)
|
||||
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in the model reporting plots", required=false)
|
||||
private double TS_FILTER_LEVEL = 99.0;
|
||||
|
||||
/////////////////////////////
|
||||
|
|
|
|||
|
|
@ -53,14 +53,14 @@ public class VariantRecalibratorArgumentCollection {
|
|||
public double STD_THRESHOLD = 14.0;
|
||||
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model.", required=false)
|
||||
public double QUAL_THRESHOLD = 80.0;
|
||||
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
|
||||
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in the variational Bayes algorithm.", required=false)
|
||||
public double SHRINKAGE = 1.0;
|
||||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algorithm.", required=false)
|
||||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in the variational Bayes algorithm.", required=false)
|
||||
public double DIRICHLET_PARAMETER = 0.001;
|
||||
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in variational Bayes algorithm.", required=false)
|
||||
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in the variational Bayes algorithm.", required=false)
|
||||
public double PRIOR_COUNTS = 20.0;
|
||||
@Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
|
||||
public double PERCENT_BAD_VARIANTS = 0.03;
|
||||
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.", required=false)
|
||||
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad argument if necessary.", required=false)
|
||||
public int MIN_NUM_BAD_VARIANTS = 2000;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue