diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
index 32132c7ca..32002e093 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
@@ -96,24 +96,23 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
loadArgumentsIntoObject(walker);
argumentSources.add(walker);
- Collection newStyle = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
+ Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
// todo: remove me when the old style system is removed
if ( getArgumentCollection().RODBindings.size() > 0 ) {
logger.warn("################################################################################");
logger.warn("################################################################################");
- logger.warn("Deprecated -B rod binding syntax detected. This syntax will be retired in GATK 1.2.");
+ logger.warn("Deprecated -B rod binding syntax detected. This syntax has been eliminated in GATK 1.2.");
logger.warn("Please use arguments defined by each specific walker instead.");
for ( String oldStyleRodBinding : getArgumentCollection().RODBindings ) {
logger.warn(" -B rod binding with value " + oldStyleRodBinding + " tags: " + parser.getTags(oldStyleRodBinding).getPositionalTags());
}
logger.warn("################################################################################");
logger.warn("################################################################################");
+ System.exit(1);
}
- Collection oldStyle = ListFileUtils.unpackRODBindingsOldStyle(getArgumentCollection().RODBindings, parser);
- oldStyle.addAll(newStyle);
- engine.setReferenceMetaDataFiles(oldStyle);
+ engine.setReferenceMetaDataFiles(rodBindings);
for (ReadFilter filter: filters) {
loadArgumentsIntoObject(filter);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java
index ce638ff2b..2f4dd06e2 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java
@@ -39,8 +39,7 @@ import org.simpleframework.xml.*;
public class DbsnpArgumentCollection {
/**
- * A dbSNP VCF file. Variants in this track will be treated as "known" variants
- * in tools using this track.
+ * A dbSNP VCF file.
*/
@Input(fullName="dbsnp", shortName = "D", doc="dbSNP file", required=false)
public RodBinding dbsnp;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 62135f21b..fd39d46b0 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -101,6 +101,8 @@ public class GATKArgumentCollection {
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;
+ @Deprecated
+ @Hidden
@ElementList(required = false)
@Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :, ", required = false)
public ArrayList RODBindings = new ArrayList();
@@ -340,14 +342,6 @@ public class GATKArgumentCollection {
return false;
}
}
- if (other.RODBindings.size() != RODBindings.size()) {
- return false;
- }
- for (int x = 0; x < RODBindings.size(); x++) {
- if (!RODBindings.get(x).equals(other.RODBindings.get(x))) {
- return false;
- }
- }
if (!other.samFiles.equals(this.samFiles)) {
return false;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
index 7e1dcd707..fdfac6bf7 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
@@ -40,26 +40,65 @@ import java.util.TreeSet;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+
/**
- * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear
- * in the input file. It can dynamically merge the contents of multiple input BAM files, resulting
- * in merged output sorted in coordinate order. Can also optionally filter reads based on the --read-filter
- * command line argument.
+ * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
+ *
+ *
+ * PrintReads can dynamically merge the contents of multiple input BAM files, resulting
+ * in merged output sorted in coordinate order. Can also optionally filter reads based on the
+ * --read_filter command line argument.
+ *
+ *
Input
+ *
+ * One or more bam files.
+ *
+ *
+ * Output
+ *
+ * A single processed bam file.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T PrintReads \
+ * -o output.bam \
+ * -I input1.bam \
+ * -I input2.bam \
+ * --read_filter MappingQualityZero
+ *
+ *
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@Requires({DataSource.READS, DataSource.REFERENCE})
public class PrintReadsWalker extends ReadWalker {
- /** an optional argument to dump the reads out to a BAM file */
+
@Output(doc="Write output to this BAM filename instead of STDOUT")
SAMFileWriter out;
+
@Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false)
String readGroup = null;
+
+ /**
+ * For example, --platform ILLUMINA or --platform 454.
+ */
@Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false)
- String platform = null; // E.g. ILLUMINA, 454
+ String platform = null;
+
@Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
int nReadsToPrint = -1;
+
+ /**
+ * Only reads from samples listed in the provided file(s) will be included in the output.
+ */
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
public Set sampleFile = new TreeSet();
+
+ /**
+ * Only reads from the sample(s) will be included in the output.
+ */
@Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
public Set sampleNames = new TreeSet();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
index 8c8bd19d0..96a400c68 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@@ -49,7 +49,34 @@ import java.util.*;
/**
- * Annotates variant calls with context information. Users can specify which of the available annotations to use.
+ * Annotates variant calls with context information.
+ *
+ *
+ * VariantAnnotator is a GATK tool for annotating variant calls based on their context.
+ * The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself.
+ *
+ *
Input
+ *
+ * A variant set to annotate and optionally one or more BAM files.
+ *
+ *
+ * Output
+ *
+ * An annotated VCF.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T VariantAnnotator \
+ * -I input.bam \
+ * -o output.vcf \
+ * -A DepthOfCoverage
+ * --variant input.vcf \
+ * --dbsnp dbsnp.vcf
+ *
+ *
*/
@Requires(value={})
@Allows(value={DataSource.READS, DataSource.REFERENCE})
@@ -69,8 +96,6 @@ public class VariantAnnotator extends RodWalker implements Ann
public RodBinding getSnpEffRodBinding() { return snpEffFile; }
/**
- * A dbSNP VCF file from which to annotate.
- *
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
*/
@ArgumentCollection
@@ -101,15 +126,25 @@ public class VariantAnnotator extends RodWalker implements Ann
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
- @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false)
- protected String sampleName = null;
-
+ /**
+ * See the -list argument to view available annotations.
+ */
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
protected List annotationsToUse = new ArrayList();
+ /**
+ * See the -list argument to view available groups.
+ */
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected List annotationGroupsToUse = new ArrayList();
+ /**
+ * This option enables you to add annotations from one VCF to another.
+ *
+ * For example, if you want to annotate your 'variant' VCF with the AC field value from the rod bound to 'resource',
+ * you can specify '-E resource.AC' and records in the output VCF will be annotated with 'resource.AC=N' when a record exists in that rod at the given position.
+ * If multiple records in the rod overlap the given position, one is chosen arbitrarily.
+ */
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false)
protected List expressionsToUse = new ArrayList();
@@ -127,8 +162,6 @@ public class VariantAnnotator extends RodWalker implements Ann
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
protected boolean indelsOnly = false;
- private HashMap nonVCFsampleName = new HashMap();
-
private VariantAnnotatorEngine engine;
private Collection indelBufferContext;
@@ -164,12 +197,6 @@ public class VariantAnnotator extends RodWalker implements Ann
List rodName = Arrays.asList(variantCollection.variants.getName());
Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
- // add the non-VCF sample from the command-line, if applicable
- if ( sampleName != null ) {
- nonVCFsampleName.put(sampleName.toUpperCase(), "variant");
- samples.add(sampleName.toUpperCase());
- }
-
// if there are no valid samples, warn the user
if ( samples.size() == 0 ) {
logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java
index a4944e939..5c2a967b9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java
@@ -38,12 +38,32 @@ import java.util.List;
/**
* Walks along reference and calculates the GC content for each interval.
+ *
+ *
+ * Input
+ *
+ * One or more BAM files.
+ *
+ *
+ * Output
+ *
+ * GC content calculations per interval.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T GCContentByInterval \
+ * -o output.txt \
+ * -I input.bam \
+ * -L input.intervals
+ *
+ *
*/
@Allows(value = {DataSource.REFERENCE})
@Requires(value = {DataSource.REFERENCE})
-
@By(DataSource.REFERENCE)
-
public class GCContentByIntervalWalker extends LocusWalker {
@Output
protected PrintStream out;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
index a57fabc39..8f333a2b3 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
@@ -40,18 +40,48 @@ import java.util.List;
/**
- * Generates an alternative reference sequence over the specified interval. Given variant ROD tracks,
- * it replaces the reference bases at variation sites with the bases supplied by the ROD(s). Additionally,
- * allows for a "snpmask" ROD to set overlapping bases to 'N'.
+ * Generates an alternative reference sequence over the specified interval.
+ *
+ *
+ * Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
+ * Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
+ *
+ *
Input
+ *
+ * The reference, requested intervals, and any number of variant rod files.
+ *
+ *
+ * Output
+ *
+ * A fasta file representing the requested intervals.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T FastaAlternateReferenceMaker \
+ * -o output.fasta \
+ * -L input.intervals \
+ * --variant input.vcf \
+ * [--snpmask mask.vcf]
+ *
+ *
*/
@WalkerName("FastaAlternateReferenceMaker")
@Reference(window=@Window(start=-1,stop=50))
@Requires(value={DataSource.REFERENCE})
public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
+ /**
+ * Variants from these input files are used by this tool to construct an alternate reference.
+ */
@Input(fullName = "variant", shortName = "V", doc="variants to model", required=false)
public List> variants = Collections.emptyList();
+ /**
+ * Snps from this file are used as a mask when constructing the alternate reference.
+ */
@Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false)
public RodBinding snpmask;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
index 2dbfc76ff..5f3b37cc8 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
@@ -38,14 +38,44 @@ import org.broadinstitute.sting.utils.collections.Pair;
import java.io.PrintStream;
/**
- * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. Has optional
- * features to control the output format.
+ * Renders a new reference in FASTA format consisting of only those loci provided in the input data set.
+ *
+ *
+ * The output format can be partially controlled using the provided command-line arguments.
+ *
+ *
Input
+ *
+ * The reference and requested intervals.
+ *
+ *
+ * Output
+ *
+ * A fasta file representing the requested intervals.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T FastaReference \
+ * -o output.fasta \
+ * -L input.intervals
+ *
+ *
*/
@WalkerName("FastaReferenceMaker")
public class FastaReferenceWalker extends RefWalker, GenomeLoc> {
+
@Output PrintStream out;
- @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) public int fastaLineWidth=60;
- @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity) - CAUTION: adjacent intervals will automatically be merged", required=false) public boolean fastaRawSeqs=false;
+
+ @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false)
+ public int fastaLineWidth=60;
+
+ /**
+ * Please note that when using this argument adjacent intervals will automatically be merged.
+ */
+ @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity)", required=false)
+ public boolean fastaRawSeqs=false;
protected FastaSequence fasta;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java
index c555e88cd..bf3606b54 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java
@@ -45,6 +45,34 @@ import java.util.*;
/**
* Filters variant calls using a number of user-selectable, parameterizable criteria.
+ *
+ *
+ * VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria.
+ * Records are hard-filtered by changing the value in the FILTER field to something other than PASS.
+ *
+ *
Input
+ *
+ * A variant set to filter.
+ *
+ *
+ * Output
+ *
+ * A filtered VCF.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T VariantFiltration \
+ * -o output.vcf \
+ * --variant input.vcf \
+ * --filterExpression "AB < 0.2 || MQ0 > 50" \
+ * --filterName "Nov09filters" \
+ * --mask mask.vcf \
+ * --maskName InDel
+ *
+ *
*/
@Reference(window=@Window(start=-50,stop=50))
public class VariantFiltrationWalker extends RodWalker {
@@ -52,33 +80,65 @@ public class VariantFiltrationWalker extends RodWalker {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
+ /**
+ * Any variant which overlaps entries from the provided mask rod will be filtered.
+ */
@Input(fullName="mask", doc="Input ROD mask", required=false)
public RodBinding mask;
@Output(doc="File to which variants should be written", required=true)
protected VCFWriter writer = null;
- @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter (see wiki docs for more info)", required=false)
+ /**
+ * VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using
+ * --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2").
+ */
+ @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false)
protected ArrayList FILTER_EXPS = new ArrayList();
- @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
+
+ /**
+ * This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
+ */
+ @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false)
protected ArrayList FILTER_NAMES = new ArrayList();
+ /**
+ * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
+ * VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag).
+ * One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience
+ * methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1").
+ */
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)", required=false)
protected ArrayList GENOTYPE_FILTER_EXPS = new ArrayList();
+
+ /**
+ * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
+ */
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
protected ArrayList GENOTYPE_FILTER_NAMES = new ArrayList();
- @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3]", required=false)
+ /**
+ * Works together with the --clusterWindowSize argument.
+ */
+ @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false)
protected Integer clusterSize = 3;
- @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0]", required=false)
+
+ /**
+ * Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1.
+ */
+ @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false)
protected Integer clusterWindow = 0;
- @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered; [default:0]", required=false)
+ @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false)
protected Integer MASK_EXTEND = 0;
- @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask']", required=false)
+ @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
protected String MASK_NAME = "Mask";
- @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, should missing values be considered failing the expression (by default they are considered passing)?", required=false)
+ /**
+ * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing.
+ * Use this argument to have it evaluate as failing filters instead for these cases.
+ */
+ @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
protected Boolean FAIL_MISSING_VALUES = false;
// JEXL expressions for the filters
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
index 029b6deaf..d766ae8bd 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
@@ -69,7 +69,7 @@ import java.util.*;
*
* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
* is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
- * or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
+ * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
* the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
* it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are
* correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel,
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java
index af8051334..17d5a8e9b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java
@@ -35,16 +35,46 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
+
/**
- * Left aligns indels in reads.
+ * Left-aligns indels from reads in a bam file.
+ *
+ *
+ * LeftAlignIndels is a tool that takes a bam file and left-aligns any indels inside it. The same indel can often be
+ * placed at multiple positions and still represent the same haplotype. While a standard convention is to place an
+ * indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
+ *
+ *
Input
+ *
+ * A bam file to left-align.
+ *
+ *
+ * Output
+ *
+ * A left-aligned bam.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx3g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T LeftAlignIndels \
+ * -I input.bam \
+ * -o output.vcf
+ *
+ *
*/
public class LeftAlignIndels extends ReadWalker {
@Output(required=false, doc="Output bam")
protected StingSAMFileWriter writer = null;
- @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+
- "If too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool may run out of memory.", required=false)
+ /**
+ * If set too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool
+ * may run out of memory. We recommend that you additionally tell Java to use a temp directory with plenty of available
+ * space (by setting java.io.tempdir on the command-line).
+ */
+ @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the output writer", required=false)
protected int MAX_RECORDS_IN_RAM = 500000;
public void initialize() {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
index ac4fba4b4..34c7912d9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
@@ -23,7 +23,10 @@
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
-import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.ArgumentCollection;
+import org.broadinstitute.sting.commandline.Hidden;
+import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@@ -49,16 +52,46 @@ import java.util.*;
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
-
/**
* Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads).
+ *
+ *
+ * Performs physical phasing of SNP calls, based on sequencing reads.
+ *
+ *
+ * Input
+ *
+ * VCF file of SNP calls, BAM file of sequence reads.
+ *
+ *
+ * Output
+ *
+ * Phased VCF file.
+ *
+ *
+ * Examples
+ *
+ * java
+ * -jar GenomeAnalysisTK.jar
+ * -T ReadBackedPhasing
+ * -R reference.fasta
+ * -I reads.bam
+ * --variant:vcf SNPs.vcf
+ * -BTI variant
+ * -BTIMR INTERSECTION
+ * -o phased_SNPs.vcf
+ * --phaseQualityThresh 20.0
+ *
+ *
+ * @author Menachem Fromer
+ * @since July 2010
*/
@Allows(value = {DataSource.READS, DataSource.REFERENCE})
@Requires(value = {DataSource.READS, DataSource.REFERENCE})
@By(DataSource.READS)
-@ReadFilters({MappingQualityZeroReadFilter.class})
// Filter out all reads with zero mapping quality
+@ReadFilters({MappingQualityZeroReadFilter.class})
public class ReadBackedPhasingWalker extends RodWalker {
private static final boolean DEBUG = false;
@@ -73,13 +106,13 @@ public class ReadBackedPhasingWalker extends RodWalker P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9
@Hidden
@@ -87,10 +120,10 @@ public class ReadBackedPhasingWalker extends RodWalker
* Simplest example of a locus walker.
+ *
+ *
+ * Input
+ *
+ * One or more BAM files.
+ *
+ *
+ * Output
+ *
+ * Number of loci traversed.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T CountLoci \
+ * -o output.txt \
+ * -I input.bam \
+ * [-L input.intervals]
+ *
+ *
*/
public class CountLociWalker extends LocusWalker implements TreeReducible {
@Output(doc="Write count to this file instead of STDOUT")
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java
index 26fa9a258..e770418c1 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java
@@ -39,6 +39,26 @@ import java.util.List;
* query name order. Breaks counts down by total pairs and number
* of paired reads.
*
+ *
+ * Input
+ *
+ * One or more bam files.
+ *
+ *
+ * Output
+ *
+ * Number of pairs seen.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T CountPairs \
+ * -o output.txt \
+ * -I input.bam
+ *
+ *
* @author mhanna
*/
public class CountPairsWalker extends ReadPairWalker {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java
similarity index 62%
rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java
rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java
index d1545f159..7c7d6417a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java
@@ -25,7 +25,10 @@
package org.broadinstitute.sting.gatk.walkers.qc;
+import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -33,25 +36,55 @@ import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.collections.Pair;
+import java.util.Collections;
+import java.util.List;
+
/**
- * Prints out counts of the number of reference ordered data objects are
- * each locus for debugging RefWalkers.
+ * Prints out counts of the number of reference ordered data objects encountered.
+ *
+ *
+ * Input
+ *
+ * One or more rod files.
+ *
+ *
+ * Output
+ *
+ * Number of rods seen.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T CountRODsByRef \
+ * -o output.txt \
+ * --rod input.vcf
+ *
+ *
*/
-public class CountRodByRefWalker extends RefWalker, Long>> {
- @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
+public class CountRODsByRefWalker extends RefWalker, Long>> {
+
+ /**
+ * One or more input rod files
+ */
+ @Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false)
+ public List> rods = Collections.emptyList();
+
+ @Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false)
public boolean verbose = false;
- @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
+ @Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false)
public boolean showSkipped = false;
- CountRodWalker crw = new CountRodWalker();
+ CountRODsWalker crw = new CountRODsWalker();
public void initialize() {
crw.verbose = verbose;
crw.showSkipped = showSkipped;
}
- public CountRodWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ public CountRODsWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
return crw.map(tracker, ref, context);
}
@@ -59,7 +92,7 @@ public class CountRodByRefWalker extends RefWalker, Long> reduce(CountRodWalker.Datum point, Pair, Long> sum) {
+ public Pair, Long> reduce(CountRODsWalker.Datum point, Pair, Long> sum) {
return crw.reduce(point, sum);
}
}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java
similarity index 87%
rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java
rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java
index 8a03dea44..edbd5ff75 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java
@@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.qc;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
+import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -41,23 +44,46 @@ import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.collections.Pair;
import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.LinkedList;
-import java.util.List;
+import java.util.*;
/**
- * Prints out counts of the number of reference ordered data objects are
- * each locus for debugging RodWalkers.
+ * Prints out counts of the number of reference ordered data objects encountered.
+ *
+ *
+ * Input
+ *
+ * One or more rod files.
+ *
+ *
+ * Output
+ *
+ * Number of rods seen.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T CountRODs \
+ * -o output.txt \
+ * --rod input.vcf
+ *
+ *
*/
-public class CountRodWalker extends RodWalker, Long>> implements TreeReducible, Long>> {
+public class CountRODsWalker extends RodWalker, Long>> implements TreeReducible, Long>> {
@Output
public PrintStream out;
- @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
+ /**
+ * One or more input rod files
+ */
+ @Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false)
+ public List> rods = Collections.emptyList();
+
+ @Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false)
public boolean verbose = false;
- @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
+ @Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false)
public boolean showSkipped = false;
@Override
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java
index 87c0409b9..9ce9c4eec 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java
@@ -9,8 +9,32 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
/**
* Walks over the input data set, calculating the number of reads seen for diagnostic purposes.
+ *
+ *
* Can also count the number of reads matching a given criterion using read filters (see the
* --read-filter command line argument). Simplest example of a read-backed analysis.
+ *
+ *
+ *
Input
+ *
+ * One or more BAM files.
+ *
+ *
+ * Output
+ *
+ * Number of reads seen.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T CountReads \
+ * -o output.txt \
+ * -I input.bam \
+ * [-L input.intervals]
+ *
+ *
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CountReadsWalker extends ReadWalker {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
index 633c21320..d1fa3f4df 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -36,20 +36,61 @@ import java.util.*;
/**
* General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more)
+ *
+ *
+ * Given a variant callset, it is common to calculate various quality control metrics. These metrics include the number of
+ * raw or filtered SNP counts; ratio of transition mutations to transversions; concordance of a particular sample's calls
+ * to a genotyping chip; number of singletons per sample; etc. Furthermore, it is often useful to stratify these metrics
+ * by various criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the amino acid
+ * degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in
+ * evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation
+ * and stratification modules.
+ *
+ *
Input
+ *
+ * One or more variant sets to evaluate plus any number of comparison sets.
+ *
+ *
+ * Output
+ *
+ * Evaluation tables.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T VariantEval \
+ * -o output.eval.gatkreport \
+ * --eval:set1 set1.vcf \
+ * --eval:set2 set2.vcf \
+ * [--comp comp.vcf]
+ *
+ *
*/
@Reference(window=@Window(start=-50, stop=50))
public class VariantEvalWalker extends RodWalker implements TreeReducible {
- // Output arguments
+
@Output
protected PrintStream out;
+ /**
+ * The variant file(s) to evaluate.
+ */
@Input(fullName="eval", shortName = "eval", doc="Input evaluation file(s)", required=true)
public List> evals;
+ /**
+ * The variant file(s) to compare against.
+ */
@Input(fullName="comp", shortName = "comp", doc="Input comparison file(s)", required=false)
public List> compsProvided = Collections.emptyList();
private List> comps = new ArrayList>();
+ /**
+ * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants.
+ * Other sets can be specified with the -knownName (--known_names) argument.
+ */
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
@@ -67,6 +108,9 @@ public class VariantEvalWalker extends RodWalker implements Tr
@Argument(fullName="sample", shortName="sn", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false)
protected Set SAMPLE_EXPRESSIONS;
+ /**
+ * List of rod tracks to be used for specifying "known" variants other than dbSNP.
+ */
@Argument(shortName="knownName", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
protected HashSet KNOWN_NAMES = new HashSet();
List> knowns = new ArrayList>();
@@ -81,7 +125,9 @@ public class VariantEvalWalker extends RodWalker implements Tr
@Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false)
protected Set typesToUse = null;
- // Evaluator arguments
+ /**
+ * See the -list argument to view available modules.
+ */
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false)
protected String[] MODULES_TO_USE = {};
@@ -95,7 +141,10 @@ public class VariantEvalWalker extends RodWalker implements Tr
@Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false)
protected double MIN_PHASE_QUALITY = 10.0;
- @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
+ /**
+ * This argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined.
+ */
+ @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations", required=false)
protected String FAMILY_STRUCTURE;
@Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java
index c47a015c6..9fae71e4e 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java
@@ -46,6 +46,31 @@ import java.util.*;
/**
* Left-aligns indels from a variants file.
+ *
+ *
+ * LeftAlignVariants is a tool that takes a VCF file and left-aligns any indels inside it. The same indel can often be
+ * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to
+ * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
+ *
+ *
Input
+ *
+ * A variant set to left-align.
+ *
+ *
+ * Output
+ *
+ * A left-aligned VCF.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T LeftAlignVariants \
+ * --variant input.vcf \
+ * -o output.vcf
+ *
+ *
*/
@Reference(window=@Window(start=-200,stop=200))
public class LeftAlignVariants extends RodWalker {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
index 5c7fb268c..01a6e2f70 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
@@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
-import org.broad.tribble.Feature;
import org.broad.tribble.TribbleException;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.commandline.*;
@@ -34,7 +33,6 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
@@ -48,7 +46,32 @@ import java.util.Set;
/**
- * Validates a variants file.
+ * Strictly validates a variants file.
+ *
+ *
+ * ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it.
+ * Checks include the correctness of the reference base(s), accuracy of AC & AN values, tests against rsIDs
+ * when a dbSNP file is provided, and that all alternate alleles are present in at least one sample.
+ *
+ *
Input
+ *
+ * A variant set to filter.
+ *
+ *
+ * Output
+ *
+ * A filtered VCF.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T ValidateVariants \
+ * --variant input.vcf \
+ * --dbsnp dbsnp.vcf
+ *
+ *
*/
@Reference(window=@Window(start=0,stop=100))
public class ValidateVariants extends RodWalker {
@@ -67,10 +90,13 @@ public class ValidateVariants extends RodWalker {
@Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false)
protected ValidationType type = ValidationType.ALL;
- @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "should we skip validation on filtered records?", required = false)
+ /**
+ * By default, even filtered records are validated.
+ */
+ @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false)
protected Boolean DO_NOT_VALIDATE_FILTERED = false;
- @Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "should we just emit warnings on errors instead of terminating the run?", required = false)
+ @Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false)
protected Boolean WARN_ON_ERROR = false;
private long numErrors = 0;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
index 6ed0bbd16..b98646270 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
@@ -25,10 +25,8 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Input;
-import org.broadinstitute.sting.commandline.Output;
-import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -43,21 +41,57 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
/**
- * Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes)
+ * Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes)
+ *
+ *
+ * The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes).
+ * The tool produces a VCF that is annotated with information pertaining to plate quality control and by
+ * default is soft-filtered by high no-call rate or low Hardy-Weinberg probability.
+ * If you have .ped files, please first convert them to VCF format
+ * (see http://www.broadinstitute.org/gsa/wiki/index.php/Converting_ped_to_vcf).
+ *
+ *
Input
+ *
+ * A validation VCF to annotate.
+ *
+ *
+ * Output
+ *
+ * An annotated VCF.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T VariantValidationAssessor \
+ * --variant input.vcf \
+ * -o output.vcf
+ *
+ *
*/
@Reference(window=@Window(start=0,stop=40))
public class VariantValidationAssessor extends RodWalker {
- @Input(fullName="variants", shortName = "V", doc="Input VCF file", required=true)
- public RodBinding variants;
+
+ @ArgumentCollection
+ protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfwriter = null;
- @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false)
+ @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid", required=false)
protected double maxHardy = 20.0;
- @Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid [default:0.05]", required=false)
+
+ /**
+ * To disable, set to a value greater than 1.
+ */
+ @Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid", required=false)
protected double maxNoCall = 0.05;
- @Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false)
+
+ /**
+ * To disable, set to a value greater than 1.
+ */
+ @Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid", required=false)
protected double maxHomNonref = 1.1;
//@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+
@@ -93,7 +127,7 @@ public class VariantValidationAssessor extends RodWalker
if ( tracker == null )
return null;
- VariantContext vc = tracker.getFirstValue(variants, ref.getLocus());
+ VariantContext vc = tracker.getFirstValue(variantCollection.variants, ref.getLocus());
// ignore places where we don't have a variant
if ( vc == null )
return null;
@@ -101,7 +135,7 @@ public class VariantValidationAssessor extends RodWalker
if ( sampleNames == null )
sampleNames = new TreeSet(vc.getSampleNames());
- return addVariantInformationToCall(ref, vc);
+ return addVariantInformationToCall(vc);
}
public Integer reduce(VariantContext call, Integer numVariants) {
@@ -113,7 +147,7 @@ public class VariantValidationAssessor extends RodWalker
}
public void onTraversalDone(Integer finalReduce) {
- final List inputNames = Arrays.asList(variants.getName());
+ final List inputNames = Arrays.asList(variantCollection.variants.getName());
// setup the header fields
Set hInfo = new HashSet();
@@ -159,7 +193,7 @@ public class VariantValidationAssessor extends RodWalker
}
- private VariantContext addVariantInformationToCall(ReferenceContext ref, VariantContext vContext) {
+ private VariantContext addVariantInformationToCall(VariantContext vContext) {
// check possible filters
double hwPvalue = hardyWeinbergCalculation(vContext);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
index af3593ce4..b3fd540bc 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
@@ -40,95 +40,105 @@ import java.io.PrintStream;
import java.util.*;
/**
- * Emits specific fields as dictated by the user from one or more VCF files.
+ * Emits specific fields from a VCF file to a tab-deliminated table
+ *
+ *
+ * This walker accepts a single VCF file and writes out user-selected fields from the
+ * VCF as a header-containing, tab-deliminated file. The user specifies one or more
+ * fields to print with the -F NAME, each of which appears as a single column in
+ * the output file, with a header named NAME, and the value of this field in the VCF
+ * one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding
+ * in the INFO field (AC=10). Note that this tool does not support capturing any
+ * GENOTYPE field values. If a VCF record is missing a value, then the tool by
+ * default throws an error, but the special value NA can be emitted instead with
+ * appropriate tool arguments.
+ *
+ *
+ *
+ * Input
+ *
+ *
+ * - A VCF file
+ * - A list of -F fields to write
+ *
+ *
+ *
+ * Output
+ *
+ * A table deliminated file containing the values of the requested fields in the VCF file
+ *
+ *
+ * Examples
+ *
+ * -T $WalkerName \
+ * -V file.vcf \
+ * -F CHROM -F POS -F ID -F QUAL -F AC \
+ * -o results.table
+ *
+ * would produce a file that looks like:
+ *
+ * CHROM POS ID QUAL AC
+ * 1 10 . 50 1
+ * 1 20 rs10 99 10
+ * et cetera...
+ *
+ *
+ * @author Mark DePristo
+ * @since 2010
*/
public class VariantsToTable extends RodWalker {
-
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@Output(doc="File to which results should be written",required=true)
protected PrintStream out;
- @Argument(fullName="fields", shortName="F", doc="Fields to emit from the VCF, allows any VCF field, any info field, and some meta fields like nHets", required=true)
- public ArrayList fieldsToTake = new ArrayList();
+ /**
+ * -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10).
+ * Note that this tool does not support capturing any GENOTYPE field values. Note this argument
+ * accepts any number of inputs. So -F CHROM -F POS is allowed.
+ */
+ @Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true)
+ public List fieldsToTake = new ArrayList();
- @Argument(fullName="showFiltered", shortName="raw", doc="Include filtered records")
+ /**
+ * By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
+ * Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value.
+ */
+ @Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false)
public boolean showFiltered = false;
- @Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false)
+ /**
+ * If provided, then this tool will exit with success after this number of records have been emitted to the file.
+ */
+ @Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false)
public int MAX_RECORDS = -1;
int nRecords = 0;
+ /**
+ * By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then
+ * VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this
+ * can make your resulting file unreadable and malformated according to tools like R, as the representation of
+ * multi-allelic INFO field values can be lists of values.
+ */
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
public boolean keepMultiAllelic = false;
+ /**
+ * By default, this tool throws a UserException when it encounters a field without a value in some record. This
+ * is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being
+ * found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such
+ * fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument
+ * will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
+ */
@Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false)
public boolean ALLOW_MISSING_DATA = false;
public void initialize() {
+ // print out the header
out.println(Utils.join("\t", fieldsToTake));
}
- public static abstract class Getter { public abstract String get(VariantContext vc); }
- public static Map getters = new HashMap();
-
- static {
- // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
- getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
- getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
- getters.put("REF", new Getter() {
- public String get(VariantContext vc) {
- String x = "";
- if ( vc.hasReferenceBaseForIndel() ) {
- Byte refByte = vc.getReferenceBaseForIndel();
- x=x+new String(new byte[]{refByte});
- }
- return x+vc.getReference().getDisplayString();
- }
- });
- getters.put("ALT", new Getter() {
- public String get(VariantContext vc) {
- StringBuilder x = new StringBuilder();
- int n = vc.getAlternateAlleles().size();
- if ( n == 0 ) return ".";
- if ( vc.hasReferenceBaseForIndel() ) {
- Byte refByte = vc.getReferenceBaseForIndel();
- x.append(new String(new byte[]{refByte}));
- }
-
- for ( int i = 0; i < n; i++ ) {
- if ( i != 0 ) x.append(",");
- x.append(vc.getAlternateAllele(i).getDisplayString());
- }
- return x.toString();
- }
- });
- getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
- getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
- if ( vc.isSNP() && vc.isBiallelic() )
- return VariantContextUtils.isTransition(vc) ? "1" : "0";
- else
- return "-1";
- }});
- getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
- return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
- });
-
- getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
- getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
- getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
- getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
- getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
- getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
- getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
- getters.put("GQ", new Getter() { public String get(VariantContext vc) {
- if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
- return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError());
- }});
- }
-
-
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null ) // RodWalkers can make funky map calls
return 0;
@@ -155,6 +165,15 @@ public class VariantsToTable extends RodWalker {
return s.endsWith("*");
}
+ /**
+ * Utility function that returns the list of values for each field in fields from vc.
+ *
+ * @param vc the VariantContext whose field values we can to capture
+ * @param fields a non-null list of fields to capture from VC
+ * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise
+ * provides a value of NA
+ * @return
+ */
public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) {
List vals = new ArrayList();
@@ -213,13 +232,75 @@ public class VariantsToTable extends RodWalker {
return vals;
}
- public Integer reduceInit() {
- return 0;
- }
-
- public Integer reduce(Integer counter, Integer sum) {
- return counter + sum;
- }
-
+ //
+ // default reduce -- doesn't do anything at all
+ //
+ public Integer reduceInit() { return 0; }
+ public Integer reduce(Integer counter, Integer sum) { return counter + sum; }
public void onTraversalDone(Integer sum) {}
+
+ // ----------------------------------------------------------------------------------------------------
+ //
+ // static system for getting values from VC by name.
+ //
+ // ----------------------------------------------------------------------------------------------------
+
+ public static abstract class Getter { public abstract String get(VariantContext vc); }
+ public static Map getters = new HashMap();
+
+ static {
+ // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
+ getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
+ getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
+ getters.put("REF", new Getter() {
+ public String get(VariantContext vc) {
+ String x = "";
+ if ( vc.hasReferenceBaseForIndel() ) {
+ Byte refByte = vc.getReferenceBaseForIndel();
+ x=x+new String(new byte[]{refByte});
+ }
+ return x+vc.getReference().getDisplayString();
+ }
+ });
+ getters.put("ALT", new Getter() {
+ public String get(VariantContext vc) {
+ StringBuilder x = new StringBuilder();
+ int n = vc.getAlternateAlleles().size();
+ if ( n == 0 ) return ".";
+ if ( vc.hasReferenceBaseForIndel() ) {
+ Byte refByte = vc.getReferenceBaseForIndel();
+ x.append(new String(new byte[]{refByte}));
+ }
+
+ for ( int i = 0; i < n; i++ ) {
+ if ( i != 0 ) x.append(",");
+ x.append(vc.getAlternateAllele(i).getDisplayString());
+ }
+ return x.toString();
+ }
+ });
+ getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
+ getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
+ if ( vc.isSNP() && vc.isBiallelic() )
+ return VariantContextUtils.isTransition(vc) ? "1" : "0";
+ else
+ return "-1";
+ }});
+ getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
+ return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
+ });
+
+ getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
+ getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
+ getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
+ getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
+ getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
+ getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
+ getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
+ getters.put("GQ", new Getter() { public String get(VariantContext vc) {
+ if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
+ return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError());
+ }});
+ }
+
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
index 1684dccfb..61851abe2 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
@@ -53,6 +53,30 @@ import java.util.*;
/**
* Converts variants from other file formats to VCF format.
+ *
+ *
+ * Note that there must be a Tribble feature/codec for the file format as well as an adaptor.
+ *
+ *
Input
+ *
+ * A variant file to filter.
+ *
+ *
+ * Output
+ *
+ * A VCF file.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T VariantsToVCF \
+ * -o output.vcf \
+ * --variant:RawHapMap input.hapmap \
+ * --dbsnp dbsnp.vcf
+ *
+ *
*/
@Reference(window=@Window(start=-40,stop=40))
public class VariantsToVCF extends RodWalker {
@@ -61,15 +85,24 @@ public class VariantsToVCF extends RodWalker {
protected VCFWriter baseWriter = null;
private SortingVCFWriter vcfwriter; // needed because hapmap/dbsnp indel records move
+ /**
+ * Variants from this input file are used by this tool as input.
+ */
@Input(fullName="variant", shortName = "V", doc="Input variant file", required=true)
public RodBinding variants;
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
- @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false)
+ /**
+ * This argument is used for data (like GELI) with genotypes but no sample names encoded within.
+ */
+ @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod", required=false)
protected String sampleName = null;
+ /**
+ * This argument is useful for fixing input VCFs with bad reference bases (the output will be a fixed version of the VCF).
+ */
@Argument(fullName="fixRef", shortName="fixRef", doc="Fix common reference base in case there's an indel without padding", required=false)
protected boolean fixReferenceBase = false;
diff --git a/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java
index f3e868474..3a242cb13 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java
@@ -88,14 +88,6 @@ public class GATKArgumentCollectionUnitTest extends BaseTest {
collect.intervals.add("intervals".toLowerCase());
collect.excludeIntervals = new ArrayList();
collect.numberOfThreads = 1;
-
- // make some rod bindings up
- ArrayList fakeBindings = new ArrayList();
- fakeBindings.add("Bind1");
- fakeBindings.add("Bind2");
- fakeBindings.add("Bind3");
-
- collect.RODBindings = fakeBindings;
}