From 875c7ffa1abfcc4fd85ec43b41a37c08288e7eed Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Wed, 29 Jul 2015 18:57:59 -0400 Subject: [PATCH] Fixed typos and made some argument docs improvements --- .../walkers/annotator/VariantAnnotator.java | 26 +++--- .../walkers/variantutils/CombineVariants.java | 90 ++++++++++++------- 2 files changed, 72 insertions(+), 44 deletions(-) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java index c748f75ce..9fe3934bc 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java @@ -53,7 +53,7 @@ import java.util.*; * Annotate variant calls with context information * *

- * This tool is designed to annotate variant calls based on their context (ass opposed to functional annotation). + * This tool is designed to annotate variant calls based on their context (as opposed to functional annotation). * Various annotation modules are available; see the * documentation * for a complete list. @@ -95,9 +95,9 @@ public class VariantAnnotator extends RodWalker implements Ann /** * The INFO field will be annotated with information on the most biologically significant effect - * listed in the SnpEff output file for each variant. + * listed for each variant in the SnpEff file. */ - @Input(fullName="snpEffFile", shortName = "snpEffFile", doc="A SnpEff output file from which to add annotations", required=false) + @Input(fullName="snpEffFile", shortName = "snpEffFile", doc="SnpEff file from which to get annotations", required=false) public RodBinding snpEffFile; public RodBinding getSnpEffRodBinding() { return snpEffFile; } @@ -114,7 +114,7 @@ public class VariantAnnotator extends RodWalker implements Ann * Records that are filtered in the comp track will be ignored. Note that 'dbSNP' has been special-cased * (see the --dbsnp argument). */ - @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + @Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false) public List> comps = Collections.emptyList(); public List> getCompRodBindings() { return comps; } @@ -127,7 +127,8 @@ public class VariantAnnotator extends RodWalker implements Ann * '-E my_resource.AC' (-E is short for --expression, also documented on this page). In the resulting output * VCF, any records for which there is a record at the same position in the resource file will be annotated with * 'my_resource.AC=N'. Note that if there are multiple records in the resource file that overlap the given - * position, one is chosen randomly. + * position, one is chosen randomly. Note also that this does not currently check for allele concordance; + * the match is based on position only. */ @Input(fullName="resource", shortName = "resource", doc="External resource VCF file", required=false) public List> resources = Collections.emptyList(); @@ -184,20 +185,25 @@ public class VariantAnnotator extends RodWalker implements Ann protected Boolean USE_ALL_ANNOTATIONS = false; /** - * Note that the --list argument requires a fully resolved and correct command-line to work. As an alternative, you can use ListAnnotations (see Help Utilities). + * Note that the --list argument requires a fully resolved and correct command-line to work. As an alternative, + * you can use ListAnnotations (see Help Utilities). */ @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit", required=false) protected Boolean LIST = false; /** - * By default, the dbSNP ID is added only when the ID field in the variant VCF is empty (not already annotated). - * This argument allows you to override that behavior. This is used in conjuction with the -dbsnp argument. + * By default, a dbSNP ID is added only when the ID field in the variant record is empty (not already annotated). + * This argument allows you to override that behavior, and appends the new ID to the existing one. This is used + * in conjunction with the -dbsnp argument. */ - @Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="Append the dbSNP ID even when the variant VCF already has the ID field populated", required=false) + @Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="Add dbSNP ID even if one is already present", required=false) protected Boolean ALWAYS_APPEND_DBSNP_ID = false; public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; } - @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality threshold in order to annotate mendelian violation ratio") + /** + * The genotype quality (GQ) threshold above which the mendelian violation ratio should be annotated. + */ + @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="GQ threshold for annotating MV ratio") public double minGenotypeQualityP = 0.0; private VariantAnnotatorEngine engine; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java index b538225ef..61db1efb0 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java @@ -57,9 +57,9 @@ import java.util.*; * Combine variant records from different sources * *

CombineVariants reads in variants records from separate ROD (Reference-Ordered Data) sources and combines them into - * a single VCF. Any (unique) name can be used to bind your ROD and any number of sources can be input. This tool aims - * to fulfill two main possible use cases, reflected by the two combination options (MERGE and UNION), for merging - * records at the variant level (the first 8 fields of the VCF) or at the genotype level (the rest).

+ * a single VCF. Any number of sources can be input. This tool aims to fulfill two main possible use cases, reflected + * by the two combination options (MERGE and UNION), for merging records at the variant level (the first 8 fields of + * the VCF) or at the genotype level (the rest).

* * * + *

By default, the input sets will be named variants, variants2, variants3, and so on. You can override this by + * providing an explicit name tag for each input, using the syntax " -V:format,name". Each input tagged in this + * way will be labeled as such in the output (i.e., set=name rather than set=variants2). For example, you could specify + * a set of control samples as " -V:vcf,control my_control_samples.vcf", and the resulting VCF records would contain + * the annotation "set=control" in the INFO field. It is strongly recommended to provide explicit names in this way + * when a rod priority list is provided.

+ * *

CombineVariants will emit a record for every site that was present in any of your input VCF files, and will annotate * (in the set attribute in the INFO field) whether the record had a PASS or FILTER status in each input ROD . In effect, * CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the merged VCFs @@ -136,20 +143,8 @@ import java.util.*; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=-50,stop=50)) public class CombineVariants extends RodWalker implements TreeReducible { - /** - * The VCF files to merge together - * - * variants can take any number of arguments on the command line. Each -V argument - * will be included in the final merged output VCF. If no explicit name is provided, - * the -V arguments will be named using the default algorithm: variants, variants2, variants3, etc. - * The user can override this by providing an explicit name -V:name,vcf for each -V argument, - * and each named argument will be labeled as such in the output (i.e., set=name rather than - * set=variants2). The order of arguments does not matter unless except for the naming, so - * if you provide an rod priority list and no explicit names than variants, variants2, etc - * are technically order dependent. It is strongly recommended to provide explicit names when - * a rod priority list is provided. - */ - @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + + @Input(fullName="variant", shortName = "V", doc="VCF files to merge together", required=true) public List> variantCollections; final private List> variants = new ArrayList<>(); @@ -167,48 +162,75 @@ public class CombineVariants extends RodWalker implements Tree public GATKVariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE; /** - * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided. + * Refers to the merging priority behavior described in the tool documentation regarding the choice of which record + * gets emitted when taking the union of variants that contain genotypes. The list must be passed as a + * comma-separated string listing the names of the variant input files. The list must be complete and include all + * variant inputs that are being provided to the tool. Use name tags for best results. */ - @Argument(fullName="rod_priority_list", shortName="priority", doc="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted", required=false) + @Argument(fullName="rod_priority_list", shortName="priority", doc="Ordered list specifying priority for merging", required=false) public String PRIORITY_STRING = null; - @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false) + @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Emit interesting sites requiring complex compatibility merging to file", required=false) public boolean printComplexMerges = false; - @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF", required=false) + /** + * If enabled, this flag causes filtered variants (i.e. variant records where the FILTER field is populated by + * something other than PASS or a dot) to be omitted from the output. + */ + @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="Treat filtered variants as uncalled", required=false) public boolean filteredAreUncalled = false; /** - * Used to generate a sites-only file. + * If this flag is enabled, the INFO, FORMAT and sample-level (genotype) fields will not be emitted to the output file. */ - @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false) + @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="Emit a sites-only file", required=false) public boolean minimalVCF = false; - @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the combining procedure", required=false) + /** + * Exclude sites that do not contain any called ALT alleles in the merged callset. The evaluation is made after the + * merging procedure is complete. + */ + @Argument(fullName="excludeNonVariants", shortName="env", doc="Exclude sites where no variation is present after merging", required=false) public boolean EXCLUDE_NON_VARIANTS = false; /** - * Set to 'null' if you don't want the set field emitted. + * Key used in the INFO key=value tag emitted describing which set(s) the combined record came from + * (e.g. set=control). This provides the option to override the default naming, so instead of set=control you could + * have it be origin=control, or any other word you want that is not already an INFO field attribute. Set this to + * 'null' if you don't want the set attribute emitted at all. */ - @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) + @Argument(fullName="setKey", shortName="setKey", doc="Key name for the set attribute", required=false) public String SET_KEY = "set"; /** - * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime. + * This option allows you to perform a simple merge (concatenation) to combine the VCFs, drastically reducing + * runtime. Note that in many cases where you think you want to use this option, you may want to check out the + * CatVariants tool instead, because CatVariants provides the same functionality, but does so even more efficiently. */ - @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false) + @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="Assume input VCFs have identical sample sets and disjoint calls", required=false) public boolean ASSUME_IDENTICAL_SAMPLES = false; - @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false) + /** + * Sites that are present in fewer than this number of inputs will be ignored. This is a convenient way to build + * a collection of common variants and exclude rare variants. + */ + @Argument(fullName="minimumN", shortName="minN", doc="Minimum number of input files the site must be observed in to be included", required=false) public int minimumN = 1; /** - * This option allows the suppression of the command line in the VCF header. This is most often usefully when combining variants for dozens or hundreds of smaller VCFs. + * By default, this tool writes the command line that was used in the header of the output VCF file. This flag + * enables you to override that behavior . This is most often useful when combining variants for dozens or + * hundreds of smaller VCFs iteratively, to avoid cluttering the header with a lot of command lines. */ - @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="If true, do not output the header containing the command line used", required=false) + @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="Do not output the command line to the header", required=false) public boolean SUPPRESS_COMMAND_LINE_HEADER = false; - @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) + /** + * By default, the INFO field of the merged variant record only contains the INFO field attributes for which all + * original overlapping records had the same values. Discordant attributes are therefore discarded. This flag allows you to + * override that behavior and simply copy over the INFO field contents of whichever record had the highest AC value. + */ + @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="Use the INFO content of the record with the highest AC", required=false) public boolean MERGE_INFO_WITH_MAX_AC = false; private List priority = null; @@ -224,7 +246,7 @@ public class CombineVariants extends RodWalker implements Tree sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES); if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); } else - logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); + logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites-only output option"); validateAnnotateUnionArguments(); @@ -233,7 +255,7 @@ public class CombineVariants extends RodWalker implements Tree if (genotypeMergeOption == null && !ASSUME_IDENTICAL_SAMPLES) { if (!sampleNamesAreUnique) throw new UserException("Duplicate sample names were discovered but no genotypemergeoption was supplied. " + - "To combine samples without merging specify --genotypemergeoption UNIQUIFY. Merging duplicate samples " + + "To combine samples without merging, specify --genotypemergeoption UNIQUIFY. Merging duplicate samples " + "without specified priority is unsupported, but can be achieved by specifying --genotypemergeoption UNSORTED."); else genotypeMergeOption = GATKVariantContextUtils.GenotypeMergeType.UNSORTED;