diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java
index c748f75ce..9fe3934bc 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java
@@ -53,7 +53,7 @@ import java.util.*;
* Annotate variant calls with context information
*
*
- * This tool is designed to annotate variant calls based on their context (ass opposed to functional annotation).
+ * This tool is designed to annotate variant calls based on their context (as opposed to functional annotation).
* Various annotation modules are available; see the
* documentation
* for a complete list.
@@ -95,9 +95,9 @@ public class VariantAnnotator extends RodWalker implements Ann
/**
* The INFO field will be annotated with information on the most biologically significant effect
- * listed in the SnpEff output file for each variant.
+ * listed for each variant in the SnpEff file.
*/
- @Input(fullName="snpEffFile", shortName = "snpEffFile", doc="A SnpEff output file from which to add annotations", required=false)
+ @Input(fullName="snpEffFile", shortName = "snpEffFile", doc="SnpEff file from which to get annotations", required=false)
public RodBinding snpEffFile;
public RodBinding getSnpEffRodBinding() { return snpEffFile; }
@@ -114,7 +114,7 @@ public class VariantAnnotator extends RodWalker implements Ann
* Records that are filtered in the comp track will be ignored. Note that 'dbSNP' has been special-cased
* (see the --dbsnp argument).
*/
- @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
+ @Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false)
public List> comps = Collections.emptyList();
public List> getCompRodBindings() { return comps; }
@@ -127,7 +127,8 @@ public class VariantAnnotator extends RodWalker implements Ann
* '-E my_resource.AC' (-E is short for --expression, also documented on this page). In the resulting output
* VCF, any records for which there is a record at the same position in the resource file will be annotated with
* 'my_resource.AC=N'. Note that if there are multiple records in the resource file that overlap the given
- * position, one is chosen randomly.
+ * position, one is chosen randomly. Note also that this does not currently check for allele concordance;
+ * the match is based on position only.
*/
@Input(fullName="resource", shortName = "resource", doc="External resource VCF file", required=false)
public List> resources = Collections.emptyList();
@@ -184,20 +185,25 @@ public class VariantAnnotator extends RodWalker implements Ann
protected Boolean USE_ALL_ANNOTATIONS = false;
/**
- * Note that the --list argument requires a fully resolved and correct command-line to work. As an alternative, you can use ListAnnotations (see Help Utilities).
+ * Note that the --list argument requires a fully resolved and correct command-line to work. As an alternative,
+ * you can use ListAnnotations (see Help Utilities).
*/
@Argument(fullName="list", shortName="ls", doc="List the available annotations and exit", required=false)
protected Boolean LIST = false;
/**
- * By default, the dbSNP ID is added only when the ID field in the variant VCF is empty (not already annotated).
- * This argument allows you to override that behavior. This is used in conjuction with the -dbsnp argument.
+ * By default, a dbSNP ID is added only when the ID field in the variant record is empty (not already annotated).
+ * This argument overrides that behavior: the new ID is appended to the existing one. This is used in
+ * conjunction with the -dbsnp argument.
*/
- @Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="Append the dbSNP ID even when the variant VCF already has the ID field populated", required=false)
+ @Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="Add dbSNP ID even if one is already present", required=false)
protected Boolean ALWAYS_APPEND_DBSNP_ID = false;
public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; }
- @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality threshold in order to annotate mendelian violation ratio")
+ /**
+ * The genotype quality (GQ) threshold above which the Mendelian violation ratio should be annotated.
+ */
+ @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="GQ threshold for annotating MV ratio")
public double minGenotypeQualityP = 0.0;
private VariantAnnotatorEngine engine;
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java
index b538225ef..61db1efb0 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java
@@ -57,9 +57,9 @@ import java.util.*;
* Combine variant records from different sources
*
* CombineVariants reads in variants records from separate ROD (Reference-Ordered Data) sources and combines them into
- * a single VCF. Any (unique) name can be used to bind your ROD and any number of sources can be input. This tool aims
- * to fulfill two main possible use cases, reflected by the two combination options (MERGE and UNION), for merging
- * records at the variant level (the first 8 fields of the VCF) or at the genotype level (the rest).
+ * a single VCF. Any number of sources can be input. This tool aims to fulfill two main possible use cases, reflected
+ * by the two combination options (MERGE and UNION), for merging records at the variant level (the first 8 fields of
+ * the VCF) or at the genotype level (the rest).
*
*
* - MERGE: combines multiple variant records present at the same site in the different input sources into a
@@ -71,6 +71,13 @@ import java.util.*;
* It uses the priority list (if provided) to emit a single record instance at every position represented in the input RODs.
*
*
+ * By default, the input sets will be named variants, variants2, variants3, and so on. You can override this by
+ * providing an explicit name tag for each input, using the syntax " -V:name,format". Each input tagged in this
+ * way will be labeled as such in the output (i.e., set=name rather than set=variants2). For example, you could specify
+ * a set of control samples as " -V:control,vcf my_control_samples.vcf", and the resulting VCF records would contain
+ * the annotation "set=control" in the INFO field. It is strongly recommended to provide explicit names in this way
+ * when a rod priority list is provided.
+ *
* CombineVariants will emit a record for every site that was present in any of your input VCF files, and will annotate
* (in the set attribute in the INFO field) whether the record had a PASS or FILTER status in each input ROD . In effect,
* CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the merged VCFs
@@ -136,20 +143,8 @@ import java.util.*;
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-50,stop=50))
public class CombineVariants extends RodWalker implements TreeReducible {
- /**
- * The VCF files to merge together
- *
- * variants can take any number of arguments on the command line. Each -V argument
- * will be included in the final merged output VCF. If no explicit name is provided,
- * the -V arguments will be named using the default algorithm: variants, variants2, variants3, etc.
- * The user can override this by providing an explicit name -V:name,vcf for each -V argument,
- * and each named argument will be labeled as such in the output (i.e., set=name rather than
- * set=variants2). The order of arguments does not matter unless except for the naming, so
- * if you provide an rod priority list and no explicit names than variants, variants2, etc
- * are technically order dependent. It is strongly recommended to provide explicit names when
- * a rod priority list is provided.
- */
- @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
+
+ @Input(fullName="variant", shortName = "V", doc="VCF files to merge together", required=true)
public List> variantCollections;
final private List> variants = new ArrayList<>();
@@ -167,48 +162,75 @@ public class CombineVariants extends RodWalker implements Tree
public GATKVariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE;
/**
- * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided.
+ * Refers to the merging priority behavior described in the tool documentation regarding the choice of which record
+ * gets emitted when taking the union of variants that contain genotypes. The list must be passed as a
+ * comma-separated string listing the names of the variant input files. The list must be complete and include all
+ * variant inputs that are being provided to the tool. Use name tags for best results.
*/
- @Argument(fullName="rod_priority_list", shortName="priority", doc="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted", required=false)
+ @Argument(fullName="rod_priority_list", shortName="priority", doc="Ordered list specifying priority for merging", required=false)
public String PRIORITY_STRING = null;
- @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false)
+ @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Emit interesting sites requiring complex compatibility merging to file", required=false)
public boolean printComplexMerges = false;
- @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF", required=false)
+ /**
+ * If enabled, this flag causes filtered variants (i.e. variant records where the FILTER field is populated by
+ * something other than PASS or a dot) to be omitted from the output.
+ */
+ @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="Treat filtered variants as uncalled", required=false)
public boolean filteredAreUncalled = false;
/**
- * Used to generate a sites-only file.
+ * If this flag is enabled, the INFO, FORMAT and sample-level (genotype) fields will not be emitted to the output file.
*/
- @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false)
+ @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="Emit a sites-only file", required=false)
public boolean minimalVCF = false;
- @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the combining procedure", required=false)
+ /**
+ * Exclude sites that do not contain any called ALT alleles in the merged callset. The evaluation is made after the
+ * merging procedure is complete.
+ */
+ @Argument(fullName="excludeNonVariants", shortName="env", doc="Exclude sites where no variation is present after merging", required=false)
public boolean EXCLUDE_NON_VARIANTS = false;
/**
- * Set to 'null' if you don't want the set field emitted.
+ * Key used in the INFO key=value tag emitted describing which set(s) the combined record came from
+ * (e.g. set=control). This provides the option to override the default naming, so instead of set=control you could
+ * have it be origin=control, or any other word you want that is not already an INFO field attribute. Set this to
+ * 'null' if you don't want the set attribute emitted at all.
*/
- @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false)
+ @Argument(fullName="setKey", shortName="setKey", doc="Key name for the set attribute", required=false)
public String SET_KEY = "set";
/**
- * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime.
+ * This option allows you to perform a simple merge (concatenation) to combine the VCFs, drastically reducing
+ * runtime. Note that in many cases where you think you want to use this option, you may want to check out the
+ * CatVariants tool instead, because CatVariants provides the same functionality, but does so even more efficiently.
*/
- @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false)
+ @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="Assume input VCFs have identical sample sets and disjoint calls", required=false)
public boolean ASSUME_IDENTICAL_SAMPLES = false;
- @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false)
+ /**
+ * Sites that are present in fewer than this number of inputs will be ignored. This is a convenient way to build
+ * a collection of common variants and exclude rare variants.
+ */
+ @Argument(fullName="minimumN", shortName="minN", doc="Minimum number of input files the site must be observed in to be included", required=false)
public int minimumN = 1;
/**
- * This option allows the suppression of the command line in the VCF header. This is most often usefully when combining variants for dozens or hundreds of smaller VCFs.
+ * By default, this tool writes the command line that was used in the header of the output VCF file. This flag
+ * enables you to override that behavior. This is most often useful when combining variants for dozens or
+ * hundreds of smaller VCFs iteratively, to avoid cluttering the header with a lot of command lines.
*/
- @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="If true, do not output the header containing the command line used", required=false)
+ @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="Do not output the command line to the header", required=false)
public boolean SUPPRESS_COMMAND_LINE_HEADER = false;
- @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false)
+ /**
+ * By default, the INFO field of the merged variant record only contains the INFO field attributes for which all
+ * original overlapping records had the same values. Discordant attributes are therefore discarded. This flag allows you to
+ * override that behavior and simply copy over the INFO field contents of whichever record had the highest AC value.
+ */
+ @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="Use the INFO content of the record with the highest AC", required=false)
public boolean MERGE_INFO_WITH_MAX_AC = false;
private List priority = null;
@@ -224,7 +246,7 @@ public class CombineVariants extends RodWalker implements Tree
sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES);
if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance");
} else
- logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");
+ logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites-only output option");
validateAnnotateUnionArguments();
@@ -233,7 +255,7 @@ public class CombineVariants extends RodWalker implements Tree
if (genotypeMergeOption == null && !ASSUME_IDENTICAL_SAMPLES) {
if (!sampleNamesAreUnique)
throw new UserException("Duplicate sample names were discovered but no genotypemergeoption was supplied. " +
- "To combine samples without merging specify --genotypemergeoption UNIQUIFY. Merging duplicate samples " +
+ "To combine samples without merging, specify --genotypemergeoption UNIQUIFY. Merging duplicate samples " +
"without specified priority is unsupported, but can be achieved by specifying --genotypemergeoption UNSORTED.");
else
genotypeMergeOption = GATKVariantContextUtils.GenotypeMergeType.UNSORTED;