Adding docs to the 2 beasts. Saved the worst for last.

This commit is contained in:
Eric Banks 2011-08-17 14:19:14 -04:00
parent c405a75f54
commit 2f19046f0c
3 changed files with 244 additions and 36 deletions

View File

@ -43,10 +43,54 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
/**
* Combines VCF records from different sources; supports both full merges and set unions.
* Combines VCF records from different sources.
*
* <p>
* CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data
* and any number of sources can be input. This tool currently supports two different combination types for each of
* variants (the first 8 fields of the VCF) and genotypes (the rest).
* Merge: combines multiple records into a single one; if sample names overlap then they are uniquified.
* Union: assumes each rod represents the same set of samples (although this is not enforced); using the
* priority list (if provided), emits a single record instance at every position represented in the rods.
* priority list (if provided), it emits a single record instance at every position represented in the rods.
*
* CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD
* bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect,
* CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the N merged VCFs
* can be exacted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just
* the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single
* VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
* in the detailed example on the wiki.
*
* <h2>Input</h2>
* <p>
* One or more variant sets to combine.
* </p>
*
* <h2>Output</h2>
* <p>
* A combined VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CombineVariants \
* --variant input1.vcf \
* --variant input2.vcf \
* -o output.vcf \
* -genotypeMergeOptions UNIQUIFY
*
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CombineVariants \
* --variant:foo input1.vcf \
* --variant:bar input2.vcf \
* -o output.vcf \
* -genotypeMergeOptions PRIORITIZE
* -priority foo,bar
* </pre>
*
*/
@Reference(window=@Window(start=-50,stop=50))
public class CombineVariants extends RodWalker<Integer, Integer> {
@ -69,32 +113,43 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
// the types of combinations we currently allow
@Argument(shortName="genotypeMergeOptions", doc="How should we merge genotype records for samples shared across the ROD files?", required=false)
@Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false)
public VariantContextUtils.GenotypeMergeType genotypeMergeOption = VariantContextUtils.GenotypeMergeType.PRIORITIZE;
@Argument(shortName="filteredRecordsMergeType", doc="How should we deal with records seen at the same site in the VCF, but with different FILTER fields? KEEP_IF_ANY_UNFILTERED PASSes the record if any record is unfiltered, KEEP_IF_ALL_UNFILTERED requires all records to be unfiltered", required=false)
@Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false)
public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED;
@Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=false)
/**
* Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided.
*/
@Argument(fullName="rod_priority_list", shortName="priority", doc="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted", required=false)
public String PRIORITY_STRING = null;
@Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false)
public boolean printComplexMerges = false;
@Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotation don't appear in the combined VCF", required=false)
@Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF", required=false)
public boolean filteredAreUncalled = false;
@Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype INFO field", required=false)
/**
* Used to generate a sites-only file.
*/
@Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false)
public boolean minimalVCF = false;
@Argument(fullName="setKey", shortName="setKey", doc="Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from. Set to null if you don't want the set field emitted.", required=false)
/**
* Set to 'null' if you don't want the set field emitted.
*/
@Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false)
public String SET_KEY = "set";
@Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls so that one can simply perform a merge sort to combine the VCFs into one, drastically reducing the runtime.", required=false)
/**
* This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime..
*/
@Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false)
public boolean ASSUME_IDENTICAL_SAMPLES = false;
@Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if variant is present in at least N input files.", required=false)
@Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false)
public int minimumN = 1;
@Hidden

View File

@ -50,54 +50,172 @@ import java.io.PrintStream;
import java.util.*;
/**
* Takes a VCF file, selects variants based on sample(s) in which it was found and/or on various annotation criteria,
* recompute the value of certain annotations based on the new sample set, and output a new VCF with the results.
* Selects variants from a VCF source.
*
* <p>
* Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses
* (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain
* requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose.
* Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a
* pattern match). Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of
* coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are
* documented in the Using JEXL expressions section (http://www.broadinstitute.org/gsa/wiki/index.php/Using_JEXL_expressions).
* One can optionally include concordance or discordance tracks for use in selecting overlapping variants.
*
* <h2>Input</h2>
* <p>
* A variant set to select from.
* </p>
*
* <h2>Output</h2>
* <p>
* A selected VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* Select two samples out of a VCF with many samples:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -sn SAMPLE_A_PARC \
* -sn SAMPLE_B_ACTG
*
* Select two samples and any sample that matches a regular expression:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -sn SAMPLE_1_PARC \
* -sn SAMPLE_1_ACTG \
* -sn 'SAMPLE.+PARC'
*
* Select any sample that matches a regular expression and sites where the QD annotation is more than 10:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -sn 'SAMPLE.+PARC'
* -select "QD > 10.0"
*
* Select a sample and exclude non-variant loci and filtered loci:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -sn SAMPLE_1_ACTG \
* -env \
* -ef
*
* Select a sample and restrict the output vcf to a set of intervals:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -L /path/to/my.interval_list \
* -sn SAMPLE_1_ACTG
*
* Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called by this dataset):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant hapmap.vcf \
* --discordance myCalls.vcf
* -o output.vcf \
* -sn mySample
*
* Select all calls made by both myCalls and hisCalls (useful to take a look at what is consistent between the two callers):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant myCalls.vcf \
* --concordance hisCalls.vcf
* -o output.vcf \
* -sn mySample
*
* Generating a VCF of all the variants that are mendelian violations:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -SM family.yaml \
* -family NA12891+NA12892=NA12878 \
* -mvq 50
*
* Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -number 1000
*
* Creating a set with 50% of the total number of variants in the variant VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -fraction 0.5
*
* </pre>
*
*/
public class SelectVariants extends RodWalker<Integer, Integer> {
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
* A site is considered discordant if there exists some sample in eval that has a non-reference genotype
* A site is considered discordant if there exists some sample in the variant track that has a non-reference genotype
* and either the site isn't present in this track, the sample isn't present in this track,
* or the sample is called reference in this track.
*/
@Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this Feature comparison track", required=false)
@Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false)
private RodBinding<VariantContext> discordanceTrack;
/**
* A site is considered concordant if (1) we are not looking for specific samples and there is a variant called
* in both variants and concordance tracks or (2) every sample present in eval is present in the concordance
* track and they have the sample genotype call.
* in both the variant and concordance tracks or (2) every sample present in the variant track is present in the
* concordance track and they have the sample genotype call.
*/
@Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this Feature comparison track", required=false)
@Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false)
private RodBinding<VariantContext> concordanceTrack;
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
@Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
@Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false)
public Set<String> sampleNames;
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times.", required=false)
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false)
public Set<String> sampleExpressions;
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
public Set<File> sampleFiles;
@Argument(shortName="select", doc="One or more criteria to use when selecting the data. Evaluated *after* the specified samples are extracted and the INFO-field annotations are updated.", required=false)
/**
* Note that thse expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated.
*/
@Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false)
public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<String>();
@Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
@Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false)
private boolean EXCLUDE_NON_VARIANTS = false;
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis.", required=false)
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
private boolean EXCLUDE_FILTERED = false;
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't include filtered loci.", required=false)
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false)
private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
@Hidden
@Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
@Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false)
private boolean KEEP_AF_SPECTRUM = false;
@Hidden
@ -108,30 +226,43 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private File FAMILY_STRUCTURE_FILE = null;
@Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
/**
* String formatted as dad+mom=child where these parameters determine which sample names are examined.
*/
@Argument(fullName="family_structure", shortName="family", doc="Deprecated; use the -SM argument instead", required=false)
private String FAMILY_STRUCTURE = "";
@Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only. Sample metadata information will be taken from YAML file (passed with -SM)", required=false)
/**
* Sample metadata information will be taken from a YAML file (see the -SM argument).
*/
@Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false)
private Boolean MENDELIAN_VIOLATIONS = false;
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track. Variants are kept in memory to guarantee that n variants will be output, so use it only for a reasonable number of variants. Use select_random_fraction for larger numbers of variants", required=false)
/**
* Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so use it only for a reasonable
* number of variants. Use --select_random_fraction for larger numbers of variants.
*/
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
private int numRandom = 0;
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track. Routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions", required=false)
/**
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
*/
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
private double fractionRandom = 0;
@Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs.", required=false)
@Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs from the input file", required=false)
private boolean SELECT_SNPS = false;
@Argument(fullName="selectIndels", shortName="indels", doc="Select only Indels.", required=false)
@Argument(fullName="selectIndels", shortName="indels", doc="Select only indels from the input file", required=false)
private boolean SELECT_INDELS = false;
@Hidden
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String outMVFile = null;
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String outMVFile = null;
/* Private class used to store the intermediate variants in the integer random selection process */
private class RandomVariantStructure {

View File

@ -345,11 +345,33 @@ public class VariantContextUtils {
}
public enum GenotypeMergeType {
UNIQUIFY, PRIORITIZE, UNSORTED, REQUIRE_UNIQUE
/**
* Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD.
*/
UNIQUIFY,
/**
* Take genotypes in priority order (see the priority argument).
*/
PRIORITIZE,
/**
* Take the genotypes in any order.
*/
UNSORTED,
/**
* Require that all samples/genotypes be unique between all inputs.
*/
REQUIRE_UNIQUE
}
public enum FilteredRecordMergeType {
KEEP_IF_ANY_UNFILTERED, KEEP_IF_ALL_UNFILTERED
/**
* Union - leaves the record if any record is unfiltered.
*/
KEEP_IF_ANY_UNFILTERED,
/**
* Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this.
*/
KEEP_IF_ALL_UNFILTERED
}
/**