Added GATKDocs to the UnifiedGenotyper.
This commit is contained in:
parent
6b256a8ac5
commit
09d099cada
|
|
@ -44,7 +44,9 @@ import java.util.Set;
|
|||
public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
||||
|
||||
/** The allele frequency calculation models that can be selected (see the p_nonref_model argument). */
public enum Model {
|
||||
/** The default model with the best performance in all cases. */
|
||||
EXACT,
|
||||
/** The older GRID_SEARCH model, kept for posterity only; it gives inferior results and shouldn't be used. */
|
||||
GRID_SEARCH
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -53,7 +53,9 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
|
|||
}
|
||||
|
||||
/** The ways in which the alternate allele to use for genotyping can be determined. */
public enum GENOTYPING_MODE {
|
||||
/** The default; the Unified Genotyper will choose the most likely alternate allele. */
|
||||
DISCOVERY,
|
||||
/** Only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping. */
|
||||
GENOTYPE_GIVEN_ALLELES
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -36,31 +36,54 @@ import java.io.File;
|
|||
|
||||
public class UnifiedArgumentCollection {
|
||||
|
||||
// control the various models to be used
|
||||
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false)
|
||||
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
|
||||
|
||||
/**
|
||||
* Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
|
||||
*/
|
||||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false)
|
||||
public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
|
||||
|
||||
/**
|
||||
* The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are:
|
||||
* het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2
|
||||
*/
|
||||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
|
||||
|
||||
@Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false)
|
||||
public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE;
|
||||
|
||||
/**
|
||||
* Specifies how to determine the alternate allele to use for genotyping. Defaults to DISCOVERY.
|
||||
*/
|
||||
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate allele to use for genotyping", required = false)
|
||||
public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
|
||||
|
||||
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
|
||||
public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
|
||||
|
||||
/**
|
||||
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
|
||||
* confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
|
||||
* is the default). Note that the confidence (QUAL) values for multi-sample low-pass (e.g. 4x per sample) calling might
|
||||
* be significantly smaller with the new EXACT model than with our older GRID_SEARCH model, as the latter tended to
|
||||
* over-estimate the confidence; for low-pass calling we tend to use much smaller thresholds (e.g. 4).
|
||||
*/
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
|
||||
|
||||
/**
|
||||
* The minimum phred-scaled Qscore threshold to emit low confidence calls. Genotypes with confidence >= this but less
|
||||
* than the calling threshold are emitted but marked as filtered.
|
||||
*/
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
|
||||
|
||||
/**
|
||||
* This argument is not enabled by default because it increases the runtime by an appreciable amount.
|
||||
*/
|
||||
@Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false)
|
||||
public boolean COMPUTE_SLOD = false;
|
||||
|
||||
|
|
@ -80,7 +103,6 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "abort_at_too_much_coverage", doc = "Don't call a site if the downsampled coverage is greater than this value", required = false)
|
||||
public int COVERAGE_AT_WHICH_TO_ABORT = -1;
|
||||
|
||||
|
||||
// control the various parameters to be used
|
||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
||||
public int MIN_BASE_QUALTY_SCORE = 17;
|
||||
|
|
@ -91,11 +113,17 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
|
||||
// indel-related arguments
|
||||
/**
|
||||
* A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
|
||||
* Decreasing this value will increase sensitivity but at the cost of longer calling time and a larger number of false positives.
|
||||
*/
|
||||
@Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false)
|
||||
public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5;
|
||||
|
||||
/**
|
||||
* This argument informs the prior probability of having an indel at a site.
|
||||
*/
|
||||
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
|
||||
public double INDEL_HETEROZYGOSITY = 1.0/8000;
|
||||
|
||||
|
|
@ -126,22 +154,23 @@ public class UnifiedArgumentCollection {
|
|||
@Hidden
|
||||
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
|
||||
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false)
|
||||
public boolean dovit = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false)
|
||||
public boolean GSA_PRODUCTION_ONLY = false;
|
||||
|
||||
@Hidden
|
||||
|
||||
@Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false)
|
||||
public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
|
||||
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
|
||||
public boolean IGNORE_SNP_ALLELES = false;
|
||||
|
||||
|
||||
@Deprecated
|
||||
@Argument(fullName="output_all_callable_bases", shortName="all_bases", doc="Please use --output_mode EMIT_ALL_SITES instead" ,required=false)
|
||||
private Boolean ALL_BASES_DEPRECATED = false;
|
||||
|
|
|
|||
|
|
@ -45,11 +45,71 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* A variant caller which unifies the approaches of several disparate callers. Works for single-sample and
|
||||
* multi-sample data. The user can choose from several different incorporated calculation models.
|
||||
* A variant caller which unifies the approaches of several disparate callers -- Works for single-sample and multi-sample data.
|
||||
*
|
||||
* <p>
|
||||
* The GATK Unified Genotyper is a multiple-sample, technology-aware SNP and indel caller. It uses a Bayesian genotype
|
||||
* likelihood model to estimate simultaneously the most likely genotypes and allele frequency in a population of N samples,
|
||||
* emitting an accurate posterior probability of there being a segregating variant allele at each locus as well as for the
|
||||
* genotype of each sample. The system can either emit just the variant sites or complete genotypes (which includes
|
||||
* homozygous reference calls) satisfying some phred-scaled confidence value. The genotyper can make accurate calls on
|
||||
* both single sample data and multi-sample data.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The read data from which to make variant calls.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A raw, unfiltered, highly specific callset in VCF format.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Example generic command for multi-sample SNP calling</h2>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -R resources/Homo_sapiens_assembly18.fasta \
|
||||
* -T UnifiedGenotyper \
|
||||
* -I sample1.bam [-I sample2.bam ...] \
|
||||
* --dbsnp dbSNP.vcf \
|
||||
* -o snps.raw.vcf \
|
||||
* -stand_call_conf [50.0] \
|
||||
* -stand_emit_conf 10.0 \
|
||||
* -dcov [50] \
|
||||
* [-L targets.interval_list]
|
||||
* </pre>
|
||||
*
|
||||
* <p>
|
||||
* The above command will call all of the samples in your provided BAM files [-I arguments] together and produce a VCF file
|
||||
* with sites and genotypes for all samples. The easiest way to get the dbSNP file is from the GATK resource bundle. Several
|
||||
* arguments have parameters that should be chosen based on the average coverage per sample in your data. See the detailed
|
||||
* argument descriptions below.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Example command for generating calls at all sites</h2>
|
||||
* <pre>
|
||||
* java -jar /path/to/GenomeAnalysisTK.jar \
|
||||
* -l INFO \
|
||||
* -R resources/Homo_sapiens_assembly18.fasta \
|
||||
* -T UnifiedGenotyper \
|
||||
* -I /DCC/ftp/pilot_data/data/NA12878/alignment/NA12878.SLX.maq.SRP000031.2009_08.bam \
|
||||
* -o my.vcf \
|
||||
* --output_mode EMIT_ALL_SITES
|
||||
* </pre>
|
||||
*
|
||||
* <h2>Caveats</h2>
|
||||
* <ul>
|
||||
* <li>The system is under active and continuous development. All outputs, the underlying likelihood model, arguments, and
|
||||
* file formats are likely to change.</li>
|
||||
* <li>The system can be very aggressive in calling variants. In the 1000 genomes project for pilot 2 (deep coverage of ~35x)
|
||||
* we expect the raw Qscore > 50 variants to contain at least ~10% FP calls. We use extensive post-calling filters to eliminate
|
||||
* most of these FPs. Variant Quality Score Recalibration is a tool to perform this filtering.</li>
|
||||
* <li>We only handle diploid genotypes</li>
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT)
|
||||
@ReadFilters( {BadMateFilter.class, MappingQualityUnavailableReadFilter.class} )
|
||||
@Reference(window=@Window(start=-200,stop=200))
|
||||
|
|
@ -61,10 +121,9 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
||||
/**
|
||||
* A dbSNP VCF file from which to annotate.
|
||||
*
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
*/
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
* dbSNP is not used in any way for the calculations themselves.
|
||||
*/
|
||||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
|
||||
|
|
@ -72,7 +131,9 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
|
||||
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
|
||||
|
||||
// control the output
|
||||
/**
|
||||
* A raw, unfiltered, highly specific callset in VCF format.
|
||||
*/
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
|
|
@ -82,9 +143,15 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
@Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false)
|
||||
protected PrintStream metricsWriter = null;
|
||||
|
||||
/**
|
||||
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
|
||||
*/
|
||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||
protected List<String> annotationsToUse = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
|
||||
*/
|
||||
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
|
||||
protected String[] annotationClassesToUse = { "Standard" };
|
||||
|
||||
|
|
|
|||
|
|
@ -51,8 +51,11 @@ public class UnifiedGenotyperEngine {
|
|||
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
|
||||
|
||||
/** Selects which sites are written to the output (see the output_mode argument). */
public enum OUTPUT_MODE {
|
||||
/** The default; only variant sites are emitted. */
EMIT_VARIANTS_ONLY,
|
||||
/** Also include confident reference sites. */
|
||||
EMIT_ALL_CONFIDENT_SITES,
|
||||
/** Emit any callable site regardless of confidence. */
|
||||
EMIT_ALL_SITES
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -68,6 +68,8 @@ import java.util.Map;
|
|||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The input read data whose base quality scores need to be assessed.
|
||||
* <p>
|
||||
* A database of known polymorphic sites to skip over.
|
||||
* </p>
|
||||
*
|
||||
|
|
@ -134,6 +136,10 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
|
||||
private boolean LIST_ONLY = false;
|
||||
|
||||
/**
|
||||
* See the -list argument to view available covariates.
|
||||
*/
|
||||
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false)
|
||||
private String[] COVARIATES = null;
|
||||
@Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false)
|
||||
|
|
|
|||
|
|
@ -66,6 +66,8 @@ import java.util.regex.Pattern;
|
|||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The input read data whose base quality scores need to be recalibrated.
|
||||
* <p>
|
||||
* The recalibration table file in CSV format that was generated by the CountCovariates walker.
|
||||
* </p>
|
||||
*
|
||||
|
|
|
|||
|
|
@ -157,6 +157,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
*/
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on the optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
|
||||
private double TARGET_TITV = 2.15;
|
||||
|
||||
/**
|
||||
* The names of the annotations to use for the calculations. See the input VCF file's INFO field for a list of all available annotations.
|
||||
*/
|
||||
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should be used for calculations", required=true)
|
||||
private String[] USE_ANNOTATIONS = null;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue