diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index 6a5d34a6f..7658f042c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -54,26 +54,39 @@ import java.io.FileNotFoundException; import java.util.*; /** - * Selects variants from a VCF source. + * Select a subset of variants from a larger callset * *

* Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses * (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain * requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose. - * Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a - * pattern match). Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of - * coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are - * documented in the Using JEXL expressions section (http://www.broadinstitute.org/gatk/guide/article?id=1255). - * One can optionally include concordance or discordance tracks for use in selecting overlapping variants. - * + *

+ *

+ * There are many different options for selecting subsets of variants from a larger callset: + *

+ *

+ * + *

There are also several options for recording the original values of certain annotations that are recalculated + * when subsetting the callset, trimming alleles, and so on.

+ * *

Input

*

- * A variant set to select from. + * A variant callset from which to select a subset. *

* *

Output

*

- * A selected VCF. + * The name of the VCF file to which to write the selected subset of variants. *

* *

Examples

@@ -103,7 +116,7 @@ import java.util.*; * -T SelectVariants \ * --variant input.vcf \ * -o output.vcf \ - * -se 'SAMPLE.+PARC' + * -se 'SAMPLE.+PARC' \ * -select "QD > 10.0" * * Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default): @@ -135,30 +148,31 @@ import java.util.*; * -L /path/to/my.interval_list \ * -sn SAMPLE_1_ACTG * - * Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called by this dataset): + * Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called in my dataset): * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T SelectVariants \ * --variant hapmap.vcf \ - * --discordance myCalls.vcf + * --discordance myCalls.vcf \ * -o output.vcf \ * -sn mySample * - * Select all calls made by both myCalls and hisCalls (useful to take a look at what is consistent between the two callers): + * Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers): * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T SelectVariants \ * --variant myCalls.vcf \ - * --concordance hisCalls.vcf + * --concordance theirCalls.vcf \ * -o output.vcf \ * -sn mySample * - * Generating a VCF of all the variants that are mendelian violations: + * Generating a VCF of all the variants that are mendelian violations. The optional argument `-mvq` restricts the selection to sites where every trio member has a genotype quality (GQ) of 50 or more: * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T SelectVariants \ * --variant input.vcf \ * -ped family.ped \ + * -mv \ * -mvq 50 \ * -o violations.vcf * @@ -199,7 +213,7 @@ public class SelectVariants extends RodWalker implements TreeR * and either the site isn't present in this track, the sample isn't present in this track, * or the sample is called reference in this track. 
*/ - @Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false) + @Input(fullName="discordance", shortName = "disc", doc="Output variants not called in this comparison track", required=false) protected RodBinding discordanceTrack; /** @@ -207,110 +221,158 @@ public class SelectVariants extends RodWalker implements TreeR * in both the variant and concordance tracks or (2) every sample present in the variant track is present in the * concordance track and they have the sample genotype call. */ - @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false) + @Input(fullName="concordance", shortName = "conc", doc="Output variants also called in this comparison track", required=false) protected RodBinding concordanceTrack; @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; - @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false) + /** + * This argument can be specified multiple times in order to provide multiple sample names. + */ + @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample", required=false) public Set sampleNames = new HashSet<>(0); - @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false) + /** + * Using a regular expression allows you to match multiple sample names that have that pattern in common. This + * argument can be specified multiple times in order to use multiple different matching patterns. 
+ */ + @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select multiple samples", required=false) public Set sampleExpressions ; - @Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false) + /** + * Sample names should be in a plain text file listing one sample name per line. This argument can be specified multiple times in order to provide + * multiple sample list files. + */ + @Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples to include", required=false) public Set sampleFiles; /** - * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. + * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be + * excluded. This argument can be specified multiple times in order to provide multiple sample names. */ - @Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample. Can be specified multiple times", required=false) + @Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample", required=false) public Set XLsampleNames = new HashSet<>(0); /** - * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. + * Sample names should be in a plain text file listing one sample name per line. Note that sample exclusion takes precedence over inclusion, so that + * if a sample is in both lists it will be excluded. This argument can be specified multiple times in order to + * provide multiple sample list files. */ - @Input(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. 
Can be specified multiple times", required=false) + @Input(fullName="exclude_sample_file", shortName="xl_sf", doc="List of samples to exclude", required=false) public Set XLsampleFiles = new HashSet<>(0); /** - * Note that these expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated. + * See example commands above for detailed usage examples. Note that these expressions are evaluated *after* the + * specified samples are extracted and the INFO field annotations are updated. */ @Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false) public ArrayList SELECT_EXPRESSIONS = new ArrayList<>(); - @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) + /** + * If this flag is enabled, sites that are found to be non-variant after the subsetting procedure (i.e. where none + * of the selected samples display evidence of variation) will be excluded from the output. + */ + @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include non-variant sites", required=false) protected boolean EXCLUDE_NON_VARIANTS = false; - @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) + /** + * If this flag is enabled, sites that have been marked as filtered (i.e. have anything other than `.` or `PASS` + * in the FILTER field) will be excluded from the output. + */ + @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered sites", required=false) protected boolean EXCLUDE_FILTERED = false; /** - * Default is to remove bases common to all remaining alleles, leaving only their minimal representation. - * If this argument is set, original alleles from input VCF will be preserved. 
+ * The default behavior of this tool is to remove bases common to all remaining alleles after subsetting + * operations have been completed, leaving only their minimal representation. If this flag is enabled, the original + * alleles will be preserved as recorded in the input VCF. */ @Argument(fullName="preserveAlleles", shortName="noTrim", doc="Preserve original alleles, do not trim", required=false) protected boolean preserveAlleles = false; /** - * When this argument is present, all alternate alleles that are not present in the (output) samples will be removed. + * When this flag is enabled, all alternate alleles that are not present in the (output) samples will be removed. * Note that this even extends to biallelic SNPs - if the alternate allele is not present in any sample, it will be - * removed and the record will contain a '.' in the ALT column. Also note that sites-only VCFs, by definition, do + * removed and the record will contain a '.' in the ALT column. Note also that sites-only VCFs, by definition, do * not include the alternate allele in any genotype calls. */ @Argument(fullName="removeUnusedAlternates", shortName="trimAlternates", doc="Remove alternate alleles not present in any genotypes", required=false) protected boolean removeUnusedAlternates = false; /** - * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf. + * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a VCF. * For example, a multiallelic record such as: - * 1 100 . A AAA,AAAAA - * will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as: - * 1 100 . A T - * will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC + * 1 100 . 
A AAA,AAAAA + * will be excluded if `-restrictAllelesTo BIALLELIC` is used, because there are two alternate alleles, whereas a record such as: + * 1 100 . A T + * will be included in that case, but would be excluded if `-restrictAllelesTo MULTIALLELIC` is used. + * Valid options are ALL (default), MULTIALLELIC or BIALLELIC. */ - @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false) + @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity", required=false) private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL; - @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values in the INFO field after selecting (using keys AC_Orig, AF_Orig, and AN_Orig)", required=false) + /** + * When subsetting a callset, this tool recalculates the AC, AF, and AN values corresponding to the contents of the + * subset. If this flag is enabled, the original values of those annotations will be stored in new annotations called + * AC_Orig, AF_Orig, and AN_Orig. + */ + @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values after subsetting", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; - @Argument(fullName="keepOriginalDP", shortName="keepOriginalDP", doc="Store the original DP value in the INFO field (using the DP_Orig key) after selecting", required=false) + /** + * When subsetting a callset, this tool recalculates the site-level (INFO field) DP value corresponding to the contents of the + * subset. If this flag is enabled, the original value of the DP annotation will be stored in a new annotation called + * DP_Orig. 
+ */ + @Argument(fullName="keepOriginalDP", shortName="keepOriginalDP", doc="Store the original DP value after subsetting", required=false) private boolean KEEP_ORIGINAL_DEPTH = false; /** - * This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure. + * If this flag is enabled, this tool will select only variants that correspond to a mendelian violation as + * determined on the basis of family structure. Requires passing a pedigree file using the engine-level + * `-ped` argument. */ - @Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false) + @Argument(fullName="mendelianViolation", shortName="mv", doc="Output mendelian violation sites only", required=false) private Boolean MENDELIAN_VIOLATIONS = false; - @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) + /** + * This argument specifies the genotype quality (GQ) threshold that all members of a trio must have in order + * for a site to be accepted as a mendelian violation. Note that the `-mv` flag must be set for this argument to have an effect. + */ + @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum GQ score for each trio member to accept a site as a violation", required=false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; /** - * This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions. + * The value of this argument should be a number between 0 and 1 specifying the fraction of total variants to be + * randomly selected from the input callset. Note that this is done using a probabilistic function, so the final + * result is not guaranteed to carry the exact fraction requested. 
Can be used for large fractions. */ - @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false) + @Argument(fullName="select_random_fraction", shortName="fraction", doc="Select a fraction of variants at random from the input", required=false) protected double fractionRandom = 0; - @Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false) + /** + * The value of this argument should be a number between 0 and 1 specifying the fraction of total genotypes to be + * randomly selected from the input callset and set to no-call (./). Note that this is done using a probabilistic + * function, so the final result is not guaranteed to carry the exact fraction requested. Can be used for large fractions. + */ + @Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Select a fraction of genotypes at random from the input and sets them to no-call", required=false) protected double fractionGenotypes = 0; /** - * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. - * When specified one or more times, a particular type of variant is selected. - * + * This argument selects particular kinds of variants out of a list. If left empty, there is no type selection + * and all variant types are considered for other selection criteria. Valid types are INDEL, SNP, MIXED, MNP, + * SYMBOLIC, NO_VARIATION. Can be specified multiple times. */ - @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. 
Can be specified multiple times", required=false) + @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file", required=false) private List TYPES_TO_INCLUDE = new ArrayList<>(); /** - * If provided, we will only include variants whose ID field is present in this list of ids. The matching - * is exact string matching. The file format is just one ID per line - * + * If a file containing a list of IDs is provided to this argument, the tool will only select variants whose ID + * field is present in this list of IDs. The matching is done by exact string matching. The expected file format + * is simply plain text with one ID per line. */ - @Argument(fullName="keepIDs", shortName="IDs", doc="Only emit sites whose ID is found in this file (one ID per line)", required=false) + @Argument(fullName="keepIDs", shortName="IDs", doc="List of variant IDs to select", required=false) private File rsIDFile = null; @@ -326,10 +388,14 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false) private boolean justRead = false; - @Argument(doc="indel size select",required=false,fullName="maxIndelSize") + /** + * If this argument is provided, indels that are larger than the specified size will be excluded. + */ + @Argument(fullName="maxIndelSize", required=false, doc="Maximum size of indels to include") private int maxIndelSize = Integer.MAX_VALUE; - @Argument(doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES") + @Hidden + @Argument(fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", required=false, doc="Allow samples other than those in the VCF to be specified on the command line. 
These samples will be ignored.") private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;