diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index 32132c7ca..32002e093 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -96,24 +96,23 @@ public abstract class CommandLineExecutable extends CommandLineProgram { loadArgumentsIntoObject(walker); argumentSources.add(walker); - Collection newStyle = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); + Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); // todo: remove me when the old style system is removed if ( getArgumentCollection().RODBindings.size() > 0 ) { logger.warn("################################################################################"); logger.warn("################################################################################"); - logger.warn("Deprecated -B rod binding syntax detected. This syntax will be retired in GATK 1.2."); + logger.warn("Deprecated -B rod binding syntax detected. This syntax has been eliminated in GATK 1.2."); logger.warn("Please use arguments defined by each specific walker instead."); for ( String oldStyleRodBinding : getArgumentCollection().RODBindings ) { logger.warn(" -B rod binding with value " + oldStyleRodBinding + " tags: " + parser.getTags(oldStyleRodBinding).getPositionalTags()); } logger.warn("################################################################################"); logger.warn("################################################################################"); + System.exit(1); } - Collection oldStyle = ListFileUtils.unpackRODBindingsOldStyle(getArgumentCollection().RODBindings, parser); - oldStyle.addAll(newStyle); - engine.setReferenceMetaDataFiles(oldStyle); + engine.setReferenceMetaDataFiles(rodBindings); for (ReadFilter filter: filters) { loadArgumentsIntoObject(filter); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java index ce638ff2b..2f4dd06e2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java @@ -39,8 +39,7 @@ import org.simpleframework.xml.*; public class DbsnpArgumentCollection { /** - * A dbSNP VCF file. Variants in this track will be treated as "known" variants - * in tools using this track. + * A dbSNP VCF file. */ @Input(fullName="dbsnp", shortName = "D", doc="dbSNP file", required=false) public RodBinding dbsnp; diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 62135f21b..fd39d46b0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -101,6 +101,8 @@ public class GATKArgumentCollection { @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) public File referenceFile = null; + @Deprecated + @Hidden @ElementList(required = false) @Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :, ", required = false) public ArrayList RODBindings = new ArrayList(); @@ -340,14 +342,6 @@ public class GATKArgumentCollection { return false; } } - if (other.RODBindings.size() != RODBindings.size()) { - return false; - } - for (int x = 0; x < RODBindings.size(); x++) { - if (!RODBindings.get(x).equals(other.RODBindings.get(x))) { - return false; - } - } if (!other.samFiles.equals(this.samFiles)) { return false; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index 7e1dcd707..fdfac6bf7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -40,26 +40,65 @@ import java.util.TreeSet; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; + /** - * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear - * in the input file. It can dynamically merge the contents of multiple input BAM files, resulting - * in merged output sorted in coordinate order. Can also optionally filter reads based on the --read-filter - * command line argument. + * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file. + * + *

+ * PrintReads can dynamically merge the contents of multiple input BAM files, resulting + * in merged output sorted in coordinate order. Can also optionally filter reads based on the + * --read_filter command line argument. + * + *

Input

+ *

+ * One or more bam files. + *

+ * + *

Output

+ *

+ * A single processed bam file. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PrintReads \
+ *   -o output.bam \
+ *   -I input1.bam \
+ *   -I input2.bam \
+ *   --read_filter MappingQualityZero
+ * 
+ * */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @Requires({DataSource.READS, DataSource.REFERENCE}) public class PrintReadsWalker extends ReadWalker { - /** an optional argument to dump the reads out to a BAM file */ + @Output(doc="Write output to this BAM filename instead of STDOUT") SAMFileWriter out; + @Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false) String readGroup = null; + + /** + * For example, --platform ILLUMINA or --platform 454. + */ @Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false) - String platform = null; // E.g. ILLUMINA, 454 + String platform = null; + @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false) int nReadsToPrint = -1; + + /** + * Only reads from samples listed in the provided file(s) will be included in the output. + */ @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false) public Set sampleFile = new TreeSet(); + + /** + * Only reads from the sample(s) will be included in the output. + */ @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) public Set sampleNames = new TreeSet(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 8c8bd19d0..96a400c68 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -49,7 +49,34 @@ import java.util.*; /** - * Annotates variant calls with context information. Users can specify which of the available annotations to use. + * Annotates variant calls with context information. + * + *

+ * VariantAnnotator is a GATK tool for annotating variant calls based on their context. + * The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself. + * + *

Input

+ *

+ * A variant set to annotate and optionally one or more BAM files. + *

+ * + *

Output

+ *

+ * An annotated VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T VariantAnnotator \
+ *   -I input.bam \
+ *   -o output.vcf \
+ *   -A DepthOfCoverage
+ *   --variant input.vcf \
+ *   --dbsnp dbsnp.vcf
+ * 
+ * */ @Requires(value={}) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @@ -69,8 +96,6 @@ public class VariantAnnotator extends RodWalker implements Ann public RodBinding getSnpEffRodBinding() { return snpEffFile; } /** - * A dbSNP VCF file from which to annotate. - * * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. */ @ArgumentCollection @@ -101,15 +126,25 @@ public class VariantAnnotator extends RodWalker implements Ann @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false) - protected String sampleName = null; - + /** + * See the -list argument to view available annotations. + */ @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) protected List annotationsToUse = new ArrayList(); + /** + * See the -list argument to view available groups. + */ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) protected List annotationGroupsToUse = new ArrayList(); + /** + * This option enables you to add annotations from one VCF to another. + * + * For example, if you want to annotate your 'variant' VCF with the AC field value from the rod bound to 'resource', + * you can specify '-E resource.AC' and records in the output VCF will be annotated with 'resource.AC=N' when a record exists in that rod at the given position. + * If multiple records in the rod overlap the given position, one is chosen arbitrarily. + */ @Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false) protected List expressionsToUse = new ArrayList(); @@ -127,8 +162,6 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false) protected boolean indelsOnly = false; - private HashMap nonVCFsampleName = new HashMap(); - private VariantAnnotatorEngine engine; private Collection indelBufferContext; @@ -164,12 +197,6 @@ public class VariantAnnotator extends RodWalker implements Ann List rodName = Arrays.asList(variantCollection.variants.getName()); Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); - // add the non-VCF sample from the command-line, if applicable - if ( sampleName != null ) { - nonVCFsampleName.put(sampleName.toUpperCase(), "variant"); - samples.add(sampleName.toUpperCase()); - } - // if there are no valid samples, warn the user if ( samples.size() == 0 ) { logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired."); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java index a4944e939..5c2a967b9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java @@ -38,12 +38,32 @@ import java.util.List; /** * Walks along reference and calculates the GC content for each interval. + * + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * GC content calculations per interval. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T GCContentByInterval \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   -L input.intervals
+ * 
+ * */ @Allows(value = {DataSource.REFERENCE}) @Requires(value = {DataSource.REFERENCE}) - @By(DataSource.REFERENCE) - public class GCContentByIntervalWalker extends LocusWalker { @Output protected PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java index a57fabc39..8f333a2b3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java @@ -40,18 +40,48 @@ import java.util.List; /** - * Generates an alternative reference sequence over the specified interval. Given variant ROD tracks, - * it replaces the reference bases at variation sites with the bases supplied by the ROD(s). Additionally, - * allows for a "snpmask" ROD to set overlapping bases to 'N'. + * Generates an alternative reference sequence over the specified interval. + * + *

+ * Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). + * Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'. + * + *

Input

+ *

+ * The reference, requested intervals, and any number of variant rod files. + *

+ * + *

Output

+ *

+ * A fasta file representing the requested intervals. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T FastaAlternateReferenceMaker \
+ *   -o output.fasta \
+ *   -L input.intervals \
+ *   --variant input.vcf \
+ *   [--snpmask mask.vcf]
+ * 
+ * */ @WalkerName("FastaAlternateReferenceMaker") @Reference(window=@Window(start=-1,stop=50)) @Requires(value={DataSource.REFERENCE}) public class FastaAlternateReferenceWalker extends FastaReferenceWalker { + /** + * Variants from these input files are used by this tool to construct an alternate reference. + */ @Input(fullName = "variant", shortName = "V", doc="variants to model", required=false) public List> variants = Collections.emptyList(); + /** + * Snps from this file are used as a mask when constructing the alternate reference. + */ @Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false) public RodBinding snpmask; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java index 2dbfc76ff..5f3b37cc8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java @@ -38,14 +38,44 @@ import org.broadinstitute.sting.utils.collections.Pair; import java.io.PrintStream; /** - * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. Has optional - * features to control the output format. + * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. + * + *

+ * The output format can be partially controlled using the provided command-line arguments. + * + *

Input

+ *

+ * The reference and requested intervals. + *

+ * + *

Output

+ *

+ * A fasta file representing the requested intervals. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T FastaReference \
+ *   -o output.fasta \
+ *   -L input.intervals
+ * 
+ * */ @WalkerName("FastaReferenceMaker") public class FastaReferenceWalker extends RefWalker, GenomeLoc> { + @Output PrintStream out; - @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) public int fastaLineWidth=60; - @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity) - CAUTION: adjacent intervals will automatically be merged", required=false) public boolean fastaRawSeqs=false; + + @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) + public int fastaLineWidth=60; + + /** + * Please note that when using this argument adjacent intervals will automatically be merged. + */ + @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity)", required=false) + public boolean fastaRawSeqs=false; protected FastaSequence fasta; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index c555e88cd..bf3606b54 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -45,6 +45,34 @@ import java.util.*; /** * Filters variant calls using a number of user-selectable, parameterizable criteria. + * + *

+ * VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria. + * Records are hard-filtered by changing the value in the FILTER field to something other than PASS. + * + *

Input

+ *

+ * A variant set to filter. + *

+ * + *

Output

+ *

+ * A filtered VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T VariantFiltration \
+ *   -o output.vcf \
+ *   --variant input.vcf \
+ *   --filterExpression "AB < 0.2 || MQ0 > 50" \
+ *   --filterName "Nov09filters" \
+ *   --mask mask.vcf \
+ *   --maskName InDel
+ * 
+ * */ @Reference(window=@Window(start=-50,stop=50)) public class VariantFiltrationWalker extends RodWalker { @@ -52,33 +80,65 @@ public class VariantFiltrationWalker extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + /** + * Any variant which overlaps entries from the provided mask rod will be filtered. + */ @Input(fullName="mask", doc="Input ROD mask", required=false) public RodBinding mask; @Output(doc="File to which variants should be written", required=true) protected VCFWriter writer = null; - @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter (see wiki docs for more info)", required=false) + /** + * VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using + * --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2"). + */ + @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false) protected ArrayList FILTER_EXPS = new ArrayList(); - @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false) + + /** + * This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names. + */ + @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false) protected ArrayList FILTER_NAMES = new ArrayList(); + /** + * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. + * VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag). + * One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience + * methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1"). + */ @Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)", required=false) protected ArrayList GENOTYPE_FILTER_EXPS = new ArrayList(); + + /** + * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. + */ @Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false) protected ArrayList GENOTYPE_FILTER_NAMES = new ArrayList(); - @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3]", required=false) + /** + * Works together with the --clusterWindowSize argument. + */ + @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false) protected Integer clusterSize = 3; - @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0]", required=false) + + /** + * Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1. + */ + @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false) protected Integer clusterWindow = 0; - @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered; [default:0]", required=false) + @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false) protected Integer MASK_EXTEND = 0; - @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask']", required=false) + @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false) protected String MASK_NAME = "Mask"; - @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, should missing values be considered failing the expression (by default they are considered passing)?", required=false) + /** + * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing. + * Use this argument to have it evaluate as failing filters instead for these cases. + */ + @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false) protected Boolean FAIL_MISSING_VALUES = false; // JEXL expressions for the filters diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 029b6deaf..d766ae8bd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -69,7 +69,7 @@ import java.util.*; *

* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion - * or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index af8051334..17d5a8e9b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -35,16 +35,46 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.sam.AlignmentUtils; + /** - * Left aligns indels in reads. + * Left-aligns indels from reads in a bam file. + * + *

+ * LeftAlignIndels is a tool that takes a bam file and left-aligns any indels inside it. The same indel can often be + * placed at multiple positions and still represent the same haplotype. While a standard convention is to place an + * indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. + * + *

Input

+ *

+ * A bam file to left-align. + *

+ * + *

Output

+ *

+ * A left-aligned bam. + *

+ * + *

Examples

+ *
+ * java -Xmx3g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T LeftAlignIndels \
+ *   -I input.bam \
+ *   -o output.vcf
+ * 
+ * */ public class LeftAlignIndels extends ReadWalker { @Output(required=false, doc="Output bam") protected StingSAMFileWriter writer = null; - @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+ - "If too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool may run out of memory.", required=false) + /** + * If set too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool + * may run out of memory. We recommend that you additionally tell Java to use a temp directory with plenty of available + * space (by setting java.io.tempdir on the command-line). + */ + @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the output writer", required=false) protected int MAX_RECORDS_IN_RAM = 500000; public void initialize() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index ac4fba4b4..34c7912d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -23,7 +23,10 @@ */ package org.broadinstitute.sting.gatk.walkers.phasing; -import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -49,16 +52,46 @@ import java.util.*; import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - /** * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). + * + *

+ * Performs physical phasing of SNP calls, based on sequencing reads. + *

+ * + *

Input

+ *

+ * VCF file of SNP calls, BAM file of sequence reads. + *

+ * + *

Output

+ *

+ * Phased VCF file. + *

+ * + *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T ReadBackedPhasing
+ *      -R reference.fasta
+ *      -I reads.bam
+ *      --variant:vcf SNPs.vcf
+ *      -BTI variant
+ *      -BTIMR INTERSECTION
+ *      -o phased_SNPs.vcf
+ *      --phaseQualityThresh 20.0
+ * 
+ * + * @author Menachem Fromer + * @since July 2010 */ @Allows(value = {DataSource.READS, DataSource.REFERENCE}) @Requires(value = {DataSource.READS, DataSource.REFERENCE}) @By(DataSource.READS) -@ReadFilters({MappingQualityZeroReadFilter.class}) // Filter out all reads with zero mapping quality +@ReadFilters({MappingQualityZeroReadFilter.class}) public class ReadBackedPhasingWalker extends RodWalker { private static final boolean DEBUG = false; @@ -73,13 +106,13 @@ public class ReadBackedPhasingWalker extends RodWalker P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9 @Hidden @@ -87,10 +120,10 @@ public class ReadBackedPhasingWalker extends RodWalker * Simplest example of a locus walker. + * + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of loci traversed. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountLoci \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ * */ public class CountLociWalker extends LocusWalker implements TreeReducible { @Output(doc="Write count to this file instead of STDOUT") diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java index 26fa9a258..e770418c1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java @@ -39,6 +39,26 @@ import java.util.List; * query name order. Breaks counts down by total pairs and number * of paired reads. * + * + *

Input

+ *

+ * One or more bam files. + *

+ * + *

Output

+ *

+ * Number of pairs seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountPairs \
+ *   -o output.txt \
+ *   -I input.bam
+ * 
+ * * @author mhanna */ public class CountPairsWalker extends ReadPairWalker { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java similarity index 62% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java index d1545f159..7c7d6417a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java @@ -25,7 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.qc; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -33,25 +36,55 @@ import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.collections.Pair; +import java.util.Collections; +import java.util.List; + /** - * Prints out counts of the number of reference ordered data objects are - * each locus for debugging RefWalkers. + * Prints out counts of the number of reference ordered data objects encountered. + * + * + *

Input

+ *

+ * One or more rod files. + *

+ * + *

Output

+ *

+ * Number of rods seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountRODsByRef \
+ *   -o output.txt \
+ *   --rod input.vcf
+ * 
+ * */ -public class CountRodByRefWalker extends RefWalker, Long>> { - @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false) +public class CountRODsByRefWalker extends RefWalker, Long>> { + + /** + * One or more input rod files + */ + @Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false) + public List> rods = Collections.emptyList(); + + @Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false) public boolean verbose = false; - @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false) + @Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false) public boolean showSkipped = false; - CountRodWalker crw = new CountRodWalker(); + CountRODsWalker crw = new CountRODsWalker(); public void initialize() { crw.verbose = verbose; crw.showSkipped = showSkipped; } - public CountRodWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public CountRODsWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return crw.map(tracker, ref, context); } @@ -59,7 +92,7 @@ public class CountRodByRefWalker extends RefWalker, Long> reduce(CountRodWalker.Datum point, Pair, Long> sum) { + public Pair, Long> reduce(CountRODsWalker.Datum point, Pair, Long> sum) { return crw.reduce(point, sum); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java similarity index 87% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java index 8a03dea44..edbd5ff75 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java @@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.qc; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -41,23 +44,46 @@ import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.collections.Pair; import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** - * Prints out counts of the number of reference ordered data objects are - * each locus for debugging RodWalkers. + * Prints out counts of the number of reference ordered data objects encountered. + * + * + *

Input

+ *

+ * One or more rod files. + *

+ * + *

Output

+ *

+ * Number of rods seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountRODs \
+ *   -o output.txt \
+ *   --rod input.vcf
+ * 
+ * */ -public class CountRodWalker extends RodWalker, Long>> implements TreeReducible, Long>> { +public class CountRODsWalker extends RodWalker, Long>> implements TreeReducible, Long>> { @Output public PrintStream out; - @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false) + /** + * One or more input rod files + */ + @Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false) + public List> rods = Collections.emptyList(); + + @Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false) public boolean verbose = false; - @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false) + @Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false) public boolean showSkipped = false; @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java index 87c0409b9..9ce9c4eec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java @@ -9,8 +9,32 @@ import org.broadinstitute.sting.gatk.walkers.Requires; /** * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. + * + *

* Can also count the number of reads matching a given criterion using read filters (see the * --read-filter command line argument). Simplest example of a read-backed analysis. + * + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of reads seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountReads \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ * */ @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountReadsWalker extends ReadWalker { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 633c21320..d1fa3f4df 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -36,20 +36,61 @@ import java.util.*; /** * General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more) + * + *

+ * Given a variant callset, it is common to calculate various quality control metrics. These metrics include the number of + * raw or filtered SNP counts; ratio of transition mutations to transversions; concordance of a particular sample's calls + * to a genotyping chip; number of singletons per sample; etc. Furthermore, it is often useful to stratify these metrics + * by various criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the amino acid + * degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in + * evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation + * and stratification modules. + * + *

Input

+ *

+ * One or more variant sets to evaluate plus any number of comparison sets. + *

+ * + *

Output

+ *

+ * Evaluation tables. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T VariantEval \
+ *   -o output.eval.gatkreport \
+ *   --eval:set1 set1.vcf \
+ *   --eval:set2 set2.vcf \
+ *   [--comp comp.vcf]
+ * 
+ * */ @Reference(window=@Window(start=-50, stop=50)) public class VariantEvalWalker extends RodWalker implements TreeReducible { - // Output arguments + @Output protected PrintStream out; + /** + * The variant file(s) to evaluate. + */ @Input(fullName="eval", shortName = "eval", doc="Input evaluation file(s)", required=true) public List> evals; + /** + * The variant file(s) to compare against. + */ @Input(fullName="comp", shortName = "comp", doc="Input comparison file(s)", required=false) public List> compsProvided = Collections.emptyList(); private List> comps = new ArrayList>(); + /** + * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants. + * Other sets can be specified with the -knownName (--known_names) argument. + */ @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); @@ -67,6 +108,9 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="sample", shortName="sn", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false) protected Set SAMPLE_EXPRESSIONS; + /** + * List of rod tracks to be used for specifying "known" variants other than dbSNP. + */ @Argument(shortName="knownName", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false) protected HashSet KNOWN_NAMES = new HashSet(); List> knowns = new ArrayList>(); @@ -81,7 +125,9 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false) protected Set typesToUse = null; - // Evaluator arguments + /** + * See the -list argument to view available modules. + */ @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false) protected String[] MODULES_TO_USE = {}; @@ -95,7 +141,10 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false) protected double MIN_PHASE_QUALITY = 10.0; - @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) + /** + * This argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined. + */ + @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations", required=false) protected String FAMILY_STRUCTURE; @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java index c47a015c6..9fae71e4e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java @@ -46,6 +46,31 @@ import java.util.*; /** * Left-aligns indels from a variants file. + * + *

+ * LeftAlignVariants is a tool that takes a VCF file and left-aligns any indels inside it. The same indel can often be + * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to + * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. + * + *

Input

+ *

+ * A variant set to left-align. + *

+ * + *

Output

+ *

+ * A left-aligned VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T LeftAlignVariants \
+ *   --variant input.vcf \
+ *   -o output.vcf
+ * 
+ * */ @Reference(window=@Window(start=-200,stop=200)) public class LeftAlignVariants extends RodWalker { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java index 5c7fb268c..01a6e2f70 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; import org.broad.tribble.dbsnp.DbSNPFeature; import org.broadinstitute.sting.commandline.*; @@ -34,7 +33,6 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -48,7 +46,32 @@ import java.util.Set; /** - * Validates a variants file. + * Strictly validates a variants file. + * + *

+ * ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it. + * Checks include the correctness of the reference base(s), accuracy of AC & AN values, tests against rsIDs + * when a dbSNP file is provided, and that all alternate alleles are present in at least one sample. + * + *

Input

+ *

+ * A variant set to filter. + *

+ * + *

Output

+ *

+ * A filtered VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ValidateVariants \
+ *   --variant input.vcf \
+ *   --dbsnp dbsnp.vcf
+ * 
+ * */ @Reference(window=@Window(start=0,stop=100)) public class ValidateVariants extends RodWalker { @@ -67,10 +90,13 @@ public class ValidateVariants extends RodWalker { @Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false) protected ValidationType type = ValidationType.ALL; - @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "should we skip validation on filtered records?", required = false) + /** + * By default, even filtered records are validated. + */ + @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false) protected Boolean DO_NOT_VALIDATE_FILTERED = false; - @Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "should we just emit warnings on errors instead of terminating the run?", required = false) + @Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false) protected Boolean WARN_ON_ERROR = false; private long numErrors = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index 6ed0bbd16..b98646270 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -25,10 +25,8 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -43,21 +41,57 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.*; /** - * Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes) + * Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes) + * + *

+ * The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes). + * The tool produces a VCF that is annotated with information pertaining to plate quality control and by + * default is soft-filtered by high no-call rate or low Hardy-Weinberg probability. + * If you have .ped files, please first convert them to VCF format + * (see http://www.broadinstitute.org/gsa/wiki/index.php/Converting_ped_to_vcf). + * + *

Input

+ *

+ * A validation VCF to annotate. + *

+ * + *

Output

+ *

+ * An annotated VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T VariantValidationAssessor \
+ *   --variant input.vcf \
+ *   -o output.vcf
+ * 
+ * */ @Reference(window=@Window(start=0,stop=40)) public class VariantValidationAssessor extends RodWalker { - @Input(fullName="variants", shortName = "V", doc="Input VCF file", required=true) - public RodBinding variants; + + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfwriter = null; - @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false) + @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid", required=false) protected double maxHardy = 20.0; - @Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid [default:0.05]", required=false) + + /** + * To disable, set to a value greater than 1. + */ + @Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid", required=false) protected double maxNoCall = 0.05; - @Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false) + + /** + * To disable, set to a value greater than 1. + */ + @Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid", required=false) protected double maxHomNonref = 1.1; //@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+ @@ -93,7 +127,7 @@ public class VariantValidationAssessor extends RodWalker if ( tracker == null ) return null; - VariantContext vc = tracker.getFirstValue(variants, ref.getLocus()); + VariantContext vc = tracker.getFirstValue(variantCollection.variants, ref.getLocus()); // ignore places where we don't have a variant if ( vc == null ) return null; @@ -101,7 +135,7 @@ public class VariantValidationAssessor extends RodWalker if ( sampleNames == null ) sampleNames = new TreeSet(vc.getSampleNames()); - return addVariantInformationToCall(ref, vc); + return addVariantInformationToCall(vc); } public Integer reduce(VariantContext call, Integer numVariants) { @@ -113,7 +147,7 @@ public class VariantValidationAssessor extends RodWalker } public void onTraversalDone(Integer finalReduce) { - final List inputNames = Arrays.asList(variants.getName()); + final List inputNames = Arrays.asList(variantCollection.variants.getName()); // setup the header fields Set hInfo = new HashSet(); @@ -159,7 +193,7 @@ public class VariantValidationAssessor extends RodWalker } - private VariantContext addVariantInformationToCall(ReferenceContext ref, VariantContext vContext) { + private VariantContext addVariantInformationToCall(VariantContext vContext) { // check possible filters double hwPvalue = hardyWeinbergCalculation(vContext); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index af3593ce4..b3fd540bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -40,95 +40,105 @@ import java.io.PrintStream; import java.util.*; /** - * Emits specific fields as dictated by the user from one or more VCF files. + * Emits specific fields from a VCF file to a tab-deliminated table + * + *

+ * This walker accepts a single VCF file and writes out user-selected fields from the + * VCF as a header-containing, tab-deliminated file. The user specifies one or more + * fields to print with the -F NAME, each of which appears as a single column in + * the output file, with a header named NAME, and the value of this field in the VCF + * one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding + * in the INFO field (AC=10). Note that this tool does not support capturing any + * GENOTYPE field values. If a VCF record is missing a value, then the tool by + * default throws an error, but the special value NA can be emitted instead with + * appropriate tool arguments. + * + *

+ * + *

Input

+ *

+ *

    + *
  • A VCF file
  • + *
  • A list of -F fields to write
  • + *
+ *

+ * + *

Output

+ *

+ * A table deliminated file containing the values of the requested fields in the VCF file + *

+ * + *

Examples

+ *
+ *     -T $WalkerName \
+ *     -V file.vcf \
+ *     -F CHROM -F POS -F ID -F QUAL -F AC \
+ *     -o results.table
+ *
+ *     would produce a file that looks like:
+ *
+ *     CHROM    POS ID      QUAL    AC
+ *     1        10  .       50      1
+ *     1        20  rs10    99      10
+ *     et cetera...
+ * 
+ * + * @author Mark DePristo + * @since 2010 */ public class VariantsToTable extends RodWalker { - @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @Output(doc="File to which results should be written",required=true) protected PrintStream out; - @Argument(fullName="fields", shortName="F", doc="Fields to emit from the VCF, allows any VCF field, any info field, and some meta fields like nHets", required=true) - public ArrayList fieldsToTake = new ArrayList(); + /** + * -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10). + * Note that this tool does not support capturing any GENOTYPE field values. Note this argument + * accepts any number of inputs. So -F CHROM -F POS is allowed. + */ + @Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true) + public List fieldsToTake = new ArrayList(); - @Argument(fullName="showFiltered", shortName="raw", doc="Include filtered records") + /** + * By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered). + * Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value. + */ + @Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false) public boolean showFiltered = false; - @Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false) + /** + * If provided, then this tool will exit with success after this number of records have been emitted to the file. + */ + @Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false) public int MAX_RECORDS = -1; int nRecords = 0; + /** + * By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then + * VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this + * can make your resulting file unreadable and malformated according to tools like R, as the representation of + * multi-allelic INFO field values can be lists of values. + */ @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) public boolean keepMultiAllelic = false; + /** + * By default, this tool throws a UserException when it encounters a field without a value in some record. This + * is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being + * found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such + * fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument + * will cause VariantsToTable to write out NA values for missing fields instead of throwing an error. + */ @Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false) public boolean ALLOW_MISSING_DATA = false; public void initialize() { + // print out the header out.println(Utils.join("\t", fieldsToTake)); } - public static abstract class Getter { public abstract String get(VariantContext vc); } - public static Map getters = new HashMap(); - - static { - // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT - getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } }); - getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } }); - getters.put("REF", new Getter() { - public String get(VariantContext vc) { - String x = ""; - if ( vc.hasReferenceBaseForIndel() ) { - Byte refByte = vc.getReferenceBaseForIndel(); - x=x+new String(new byte[]{refByte}); - } - return x+vc.getReference().getDisplayString(); - } - }); - getters.put("ALT", new Getter() { - public String get(VariantContext vc) { - StringBuilder x = new StringBuilder(); - int n = vc.getAlternateAlleles().size(); - if ( n == 0 ) return "."; - if ( vc.hasReferenceBaseForIndel() ) { - Byte refByte = vc.getReferenceBaseForIndel(); - x.append(new String(new byte[]{refByte})); - } - - for ( int i = 0; i < n; i++ ) { - if ( i != 0 ) x.append(","); - x.append(vc.getAlternateAllele(i).getDisplayString()); - } - return x.toString(); - } - }); - getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } }); - getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) { - if ( vc.isSNP() && vc.isBiallelic() ) - return VariantContextUtils.isTransition(vc) ? "1" : "0"; - else - return "-1"; - }}); - getters.put("FILTER", new Getter() { public String get(VariantContext vc) { - return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); } - }); - - getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } }); - getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); - getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); - getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); - getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); - getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); - getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); - getters.put("GQ", new Getter() { public String get(VariantContext vc) { - if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); - return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError()); - }}); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) // RodWalkers can make funky map calls return 0; @@ -155,6 +165,15 @@ public class VariantsToTable extends RodWalker { return s.endsWith("*"); } + /** + * Utility function that returns the list of values for each field in fields from vc. + * + * @param vc the VariantContext whose field values we can to capture + * @param fields a non-null list of fields to capture from VC + * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise + * provides a value of NA + * @return + */ public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) { List vals = new ArrayList(); @@ -213,13 +232,75 @@ public class VariantsToTable extends RodWalker { return vals; } - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - + // + // default reduce -- doesn't do anything at all + // + public Integer reduceInit() { return 0; } + public Integer reduce(Integer counter, Integer sum) { return counter + sum; } public void onTraversalDone(Integer sum) {} + + // ---------------------------------------------------------------------------------------------------- + // + // static system for getting values from VC by name. + // + // ---------------------------------------------------------------------------------------------------- + + public static abstract class Getter { public abstract String get(VariantContext vc); } + public static Map getters = new HashMap(); + + static { + // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT + getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } }); + getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } }); + getters.put("REF", new Getter() { + public String get(VariantContext vc) { + String x = ""; + if ( vc.hasReferenceBaseForIndel() ) { + Byte refByte = vc.getReferenceBaseForIndel(); + x=x+new String(new byte[]{refByte}); + } + return x+vc.getReference().getDisplayString(); + } + }); + getters.put("ALT", new Getter() { + public String get(VariantContext vc) { + StringBuilder x = new StringBuilder(); + int n = vc.getAlternateAlleles().size(); + if ( n == 0 ) return "."; + if ( vc.hasReferenceBaseForIndel() ) { + Byte refByte = vc.getReferenceBaseForIndel(); + x.append(new String(new byte[]{refByte})); + } + + for ( int i = 0; i < n; i++ ) { + if ( i != 0 ) x.append(","); + x.append(vc.getAlternateAllele(i).getDisplayString()); + } + return x.toString(); + } + }); + getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } }); + getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) { + if ( vc.isSNP() && vc.isBiallelic() ) + return VariantContextUtils.isTransition(vc) ? "1" : "0"; + else + return "-1"; + }}); + getters.put("FILTER", new Getter() { public String get(VariantContext vc) { + return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); } + }); + + getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } }); + getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); + getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); + getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); + getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); + getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); + getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); + getters.put("GQ", new Getter() { public String get(VariantContext vc) { + if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); + return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError()); + }}); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 1684dccfb..61851abe2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -53,6 +53,30 @@ import java.util.*; /** * Converts variants from other file formats to VCF format. + * + *

+ * Note that there must be a Tribble feature/codec for the file format as well as an adaptor. + * + *

Input

+ *

+ * A variant file to filter. + *

+ * + *

Output

+ *

+ * A VCF file. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T VariantsToVCF \
+ *   -o output.vcf \
+ *   --variant:RawHapMap input.hapmap \
+ *   --dbsnp dbsnp.vcf
+ * 
+ * */ @Reference(window=@Window(start=-40,stop=40)) public class VariantsToVCF extends RodWalker { @@ -61,15 +85,24 @@ public class VariantsToVCF extends RodWalker { protected VCFWriter baseWriter = null; private SortingVCFWriter vcfwriter; // needed because hapmap/dbsnp indel records move + /** + * Variants from this input file are used by this tool as input. + */ @Input(fullName="variant", shortName = "V", doc="Input variant file", required=true) public RodBinding variants; @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false) + /** + * This argument is used for data (like GELI) with genotypes but no sample names encoded within. + */ + @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod", required=false) protected String sampleName = null; + /** + * This argument is useful for fixing input VCFs with bad reference bases (the output will be a fixed version of the VCF). + */ @Argument(fullName="fixRef", shortName="fixRef", doc="Fix common reference base in case there's an indel without padding", required=false) protected boolean fixReferenceBase = false; diff --git a/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java index f3e868474..3a242cb13 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java @@ -88,14 +88,6 @@ public class GATKArgumentCollectionUnitTest extends BaseTest { collect.intervals.add("intervals".toLowerCase()); collect.excludeIntervals = new ArrayList(); collect.numberOfThreads = 1; - - // make some rod bindings up - ArrayList fakeBindings = new ArrayList(); - fakeBindings.add("Bind1"); - fakeBindings.add("Bind2"); - fakeBindings.add("Bind3"); - - collect.RODBindings = fakeBindings; }