Update -mv example documentation

Made general doc fixes
This commit is contained in:
Ron Levine 2015-03-30 21:41:17 -04:00 committed by Geraldine Van der Auwera
parent d7f7022dce
commit fe87484074
1 changed files with 121 additions and 55 deletions

View File

@ -54,26 +54,39 @@ import java.io.FileNotFoundException;
import java.util.*;
/**
* Selects variants from a VCF source.
* Select a subset of variants from a larger callset
*
* <p>
* Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses
* (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain
* requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose.
* Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a
* pattern match). Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of
* coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are
* documented in the Using JEXL expressions section (http://www.broadinstitute.org/gatk/guide/article?id=1255).
* One can optionally include concordance or discordance tracks for use in selecting overlapping variants.
*
* </p>
* <p>
* There are many different options for selecting subsets of variants from a larger callset:
* <ul>
* <li>Extract one or more samples from a callset based on either a complete sample name or a pattern match.</li>
* <li>Specify criteria for inclusion that place thresholds on annotation values, e.g. "DP > 1000" (depth of
* coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These criteria are written
* as "JEXL expressions", which are documented in the
* <a href="http://www.broadinstitute.org/gatk/guide/article?id=1255">article about using JEXL expressions</a>.</li>
* <li>Provide concordance or discordance tracks in order to include or exclude variants that are
* also present in other given callsets.</li>
* <li>Select variants based on criteria like their type
* (e.g. INDELs only), evidence of mendelian violation, filtering status, allelicity, and so on.</li>
* </ul>
* </p>
*
* <p>There are also several options for recording the original values of certain annotations that are recalculated
* when a subsetting the new callset, trimming alleles, and so on.</p>
*
* <h3>Input</h3>
* <p>
* A variant set to select from.
* A variant call set from which to select a subset.
* </p>
*
* <h3>Output</h3>
* <p>
* A selected VCF.
* The name of the VCF file to which to write the selected subset of variants.
* </p>
*
* <h3>Examples</h3>
@ -103,7 +116,7 @@ import java.util.*;
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -se 'SAMPLE.+PARC'
* -se 'SAMPLE.+PARC' \
* -select "QD > 10.0"
*
* Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default):
@ -135,30 +148,31 @@ import java.util.*;
* -L /path/to/my.interval_list \
* -sn SAMPLE_1_ACTG
*
* Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called by this dataset):
* Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called in my dataset):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant hapmap.vcf \
* --discordance myCalls.vcf
* --discordance myCalls.vcf \
* -o output.vcf \
* -sn mySample
*
* Select all calls made by both myCalls and hisCalls (useful to take a look at what is consistent between the two callers):
* Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant myCalls.vcf \
* --concordance hisCalls.vcf
* --concordance hisCalls.vcf \
* -o output.vcf \
* -sn mySample
*
* Generating a VCF of all the variants that are mendelian violations:
* Generating a VCF of all the variants that are mendelian violations. The optional argument `-mvq` restricts the selection to sites that have a QUAL score of 50 or more:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -ped family.ped \
* -mv \
* -mvq 50 \
* -o violations.vcf
*
@ -199,7 +213,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* and either the site isn't present in this track, the sample isn't present in this track,
* or the sample is called reference in this track.
*/
@Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false)
@Input(fullName="discordance", shortName = "disc", doc="Output variants not called in this comparison track", required=false)
protected RodBinding<VariantContext> discordanceTrack;
/**
@ -207,110 +221,158 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* in both the variant and concordance tracks or (2) every sample present in the variant track is present in the
* concordance track and they have the sample genotype call.
*/
@Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false)
@Input(fullName="concordance", shortName = "conc", doc="Output variants also called in this comparison track", required=false)
protected RodBinding<VariantContext> concordanceTrack;
@Output(doc="File to which variants should be written")
protected VariantContextWriter vcfWriter = null;
@Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false)
/**
* This argument can be specified multiple times in order to provide multiple sample names.
*/
@Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample", required=false)
public Set<String> sampleNames = new HashSet<>(0);
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false)
/**
* Using a regular expression allows you to match multiple sample names that have that pattern in common. This
* argument can be specified multiple times in order to use multiple different matching patterns.
*/
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select multiple samples", required=false)
public Set<String> sampleExpressions ;
@Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
/**
* Sample names should be in a plain text file listing one sample name per line. This argument can be specified multiple times in order to provide
* multiple sample list files.
*/
@Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples to include", required=false)
public Set<File> sampleFiles;
/**
* Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded.
* Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be
* excluded. This argument can be specified multiple times in order to provide multiple sample names.
*/
@Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample. Can be specified multiple times", required=false)
@Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample", required=false)
public Set<String> XLsampleNames = new HashSet<>(0);
/**
* Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded.
* Sample names should be in a plain text file listing one sample name per line. Note that sample exclusion takes precedence over inclusion, so that
* if a sample is in both lists it will be excluded. This argument can be specified multiple times in order to
* provide multiple sample list files.
*/
@Input(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false)
@Input(fullName="exclude_sample_file", shortName="xl_sf", doc="List of samples to exclude", required=false)
public Set<File> XLsampleFiles = new HashSet<>(0);
/**
* Note that these expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated.
* See example commands above for detailed usage examples. Note that these expressions are evaluated *after* the
* specified samples are extracted and the INFO field annotations are updated.
*/
@Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false)
public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<>();
@Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false)
/**
* If this flag is enabled, sites that are found to be non-variant after the subsetting procedure (i.e. where none
* of the selected samples display evidence of variation) will be excluded from the output.
*/
@Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include non-variant sites", required=false)
protected boolean EXCLUDE_NON_VARIANTS = false;
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
/**
* If this flag is enabled, sites that have been marked as filtered (i.e. have anything other than `.` or `PASS`
* in the FILTER field) will be excluded from the output.
*/
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered sites", required=false)
protected boolean EXCLUDE_FILTERED = false;
/**
* Default is to remove bases common to all remaining alleles, leaving only their minimal representation.
* If this argument is set, original alleles from input VCF will be preserved.
* The default behavior of this tool is to remove bases common to all remaining alleles after subsetting
* operations have been completed, leaving only their minimal representation. If this flag is enabled, the original
* alleles will be preserved as recorded in the input VCF.
*/
@Argument(fullName="preserveAlleles", shortName="noTrim", doc="Preserve original alleles, do not trim", required=false)
protected boolean preserveAlleles = false;
/**
* When this argument is present, all alternate alleles that are not present in the (output) samples will be removed.
* When this flag is enabled, all alternate alleles that are not present in the (output) samples will be removed.
* Note that this even extends to biallelic SNPs - if the alternate allele is not present in any sample, it will be
* removed and the record will contain a '.' in the ALT column. Also note that sites-only VCFs, by definition, do
* removed and the record will contain a '.' in the ALT column. Note also that sites-only VCFs, by definition, do
* not include the alternate allele in any genotype calls.
*/
@Argument(fullName="removeUnusedAlternates", shortName="trimAlternates", doc="Remove alternate alleles not present in any genotypes", required=false)
protected boolean removeUnusedAlternates = false;
/**
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a VCF.
* For example, a multiallelic record such as:
* 1 100 . A AAA,AAAAA
* will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as:
* 1 100 . A T
* will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC
* 1 100 . A AAA,AAAAA
* will be excluded if `-restrictAllelesTo BIALLELIC` is used, because there are two alternate alleles, whereas a record such as:
* 1 100 . A T
* will be included in that case, but would be excluded if `-restrictAllelesTo MULTIALLELIC` is used.
* Valid options are ALL (default), MULTIALLELIC or BIALLELIC.
*/
@Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false)
@Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity", required=false)
private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL;
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values in the INFO field after selecting (using keys AC_Orig, AF_Orig, and AN_Orig)", required=false)
/**
* When subsetting a callset, this tool recalculates the AC, AF, and AN values corresponding to the contents of the
* subset. If this flag is enabled, the original values of those annotations will be stored in new annotations called
* AC_Orig, AF_Orig, and AN_Orig.
*/
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values after subsetting", required=false)
private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
@Argument(fullName="keepOriginalDP", shortName="keepOriginalDP", doc="Store the original DP value in the INFO field (using the DP_Orig key) after selecting", required=false)
/**
* When subsetting a callset, this tool recalculates the site-level (INFO field) DP value corresponding to the contents of the
* subset. If this flag is enabled, the original value of the DP annotation will be stored in a new annotation called
* DP_Orig.
*/
@Argument(fullName="keepOriginalDP", shortName="keepOriginalDP", doc="Store the original DP value after subsetting", required=false)
private boolean KEEP_ORIGINAL_DEPTH = false;
/**
* This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure.
* If this flag is enabled, this tool will select only variants that correspond to a mendelian violation as
* determined on the basis of family structure. Requires passing a pedigree file using the engine-level
* `-ped` argument.
*/
@Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false)
@Argument(fullName="mendelianViolation", shortName="mv", doc="Output mendelian violation sites only", required=false)
private Boolean MENDELIAN_VIOLATIONS = false;
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
/**
* This argument specifies the genotype quality (GQ) threshold that all members of a trio must have in order
* for a site to be accepted as a mendelian violation. Note that the `-mv` flag must be set for this argument to have an effect.
*/
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum GQ score for each trio member to accept a site as a violation", required=false)
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
/**
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
* The value of this argument should be a number between 0 and 1 specifying the fraction of total variants to be
* randomly selected from the input callset. Note that this is done using a probabilistic function, so the final
* result is not guaranteed to carry the exact fraction requested. Can be used for large fractions.
*/
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Select a fraction of variants at random from the input", required=false)
protected double fractionRandom = 0;
@Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false)
/**
* The value of this argument should be a number between 0 and 1 specifying the fraction of total variants to be
* randomly selected from the input callset and set to no-call (./). Note that this is done using a probabilistic
* function, so the final result is not guaranteed to carry the exact fraction requested. Can be used for large fractions.
*/
@Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Select a fraction of genotypes at random from the input and sets them to no-call", required=false)
protected double fractionGenotypes = 0;
/**
* This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
* When specified one or more times, a particular type of variant is selected.
*
* This argument selects particular kinds of variants out of a list. If left empty, there is no type selection
* and all variant types are considered for other selection criteria. Valid types are INDEL, SNP, MIXED, MNP,
* SYMBOLIC, NO_VARIATION. Can be specified multiple times.
*/
@Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
@Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file", required=false)
private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<>();
/**
* If provided, we will only include variants whose ID field is present in this list of ids. The matching
* is exact string matching. The file format is just one ID per line
*
* If a file containing a list of IDs is provided to this argument, the tool will only select variants whose ID
* field is present in this list of IDs. The matching is done by exact string matching. The expected file format
* is simply plain text with one ID per line.
*/
@Argument(fullName="keepIDs", shortName="IDs", doc="Only emit sites whose ID is found in this file (one ID per line)", required=false)
@Argument(fullName="keepIDs", shortName="IDs", doc="List of variant IDs to select", required=false)
private File rsIDFile = null;
@ -326,10 +388,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false)
private boolean justRead = false;
@Argument(doc="indel size select",required=false,fullName="maxIndelSize")
/**
* If this argument is provided, indels that are larger than the specified siwe will be excluded.
*/
@Argument(fullName="maxIndelSize", required=false, doc="Maximum size of indels to include")
private int maxIndelSize = Integer.MAX_VALUE;
@Argument(doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")
@Hidden
@Argument(fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", required=false, doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.")
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;