diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java index 1b2129f3d..dcf7ed737 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java @@ -158,24 +158,28 @@ public class RecalibrationArgumentCollection implements Cloneable { /** * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is off] */ + @Advanced @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false) public byte MISMATCHES_DEFAULT_QUALITY = -1; /** * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on] */ + @Advanced @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false) public byte INSERTIONS_DEFAULT_QUALITY = 45; /** * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. 
[default is on] */ + @Advanced @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) public byte DELETIONS_DEFAULT_QUALITY = 45; /** * Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality */ + @Advanced @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false) public byte LOW_QUAL_TAIL = 2; @@ -183,17 +187,19 @@ public class RecalibrationArgumentCollection implements Cloneable { * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options. * This parameter tells BQSR the number of levels of quantization to use to build the quantization table. 
*/ + @Advanced @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") public int QUANTIZING_LEVELS = 16; /** * The tag name for the binary tag covariate (if using it) */ + @Advanced @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") public String BINARY_TAG_NAME = null; - /* - * whether GATK report tables should have rows in sorted order, starting from leftmost column + /** + * Whether GATK report tables should have rows in sorted order, starting from leftmost column */ @Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false) public Boolean SORT_BY_ALL_COLUMNS = false; @@ -219,7 +225,7 @@ public class RecalibrationArgumentCollection implements Cloneable { public PrintStream RECAL_TABLE_UPDATE_LOG = null; /** - * The repeat covariate will use a context of this size to calculate it's covariate value for base insertions and deletions + * The repeat covariate will use a context of this size to calculate its covariate value for base insertions and deletions */ @Hidden @Argument(fullName = "max_str_unit_length", shortName = "maxstr", doc = "Max size of the k-mer context to be used for repeat covariates", required = false) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java index 5fe0509cf..f66390fc1 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java @@ -75,7 +75,7 @@ import java.util.List; /** * Allele count and frequency 
expectation per sample * - * Needs documentation + *
This annotation calculates the maximum likelihood (ML) number and frequency of alternate alleles for each individual sample at a site. In essence, it is equivalent to calculating the sum of "1"s in a genotype (for a biallelic site).
* */ @SuppressWarnings("unused") diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java index 8033b554d..7bdc365f1 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java @@ -63,7 +63,7 @@ import java.util.*; /** - * Rank Sum Test of REF vs. ALT base quality scores + * Rank Sum Test of REF versus ALT base quality scores * *This variant-level annotation tests compares the base qualities of the data supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.
* diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java index a3034e658..3e70eea57 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java @@ -60,7 +60,7 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; /** - * Rank Sum Test for hard-clipped bases on REF vs. ALT reads + * Rank Sum Test for hard-clipped bases on REF versus ALT reads * *This variant-level annotation tests whether the data supporting the reference allele shows more or less base clipping (hard clips) than those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have more hard-clipped bases than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have fewer hard-clipped bases than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.
* @@ -68,7 +68,7 @@ import java.util.*; *The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
* *The clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
+ *The clipping rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
* */ public class ClippingRankSumTest extends RankSumTest { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java index 8a0777245..f8404800e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java @@ -70,7 +70,7 @@ import java.util.List; import java.util.Map; /** - * Total depth of coverage per sample (in FORMAT) and over all samples (in INFO). + * Total depth of coverage per sample and over all samples. * *This annotation is used to provide counts of read depth at two different levels, with some important differences. At the sample level (FORMAT), the DP value is the count of reads that passed the caller's internal quality control metrics (such as MAPQ > 17, for example). At the site level (INFO), the DP value is the unfiltered depth over all samples.
* @@ -78,7 +78,7 @@ import java.util.Map; * *This annotation collects several genotype-level statistics from all samples and summarizes them in the INFO field. The following statistics are collected:
+ *These summaries can all be recomputed from the genotypes on the fly, but it is a lot faster to add them here as INFO field annotations.
*/ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java index 7a9a123ed..458a1b696 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java @@ -82,7 +82,11 @@ import java.util.*; *The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is a Phred-scaled p-value derived from running the HW test for disequilibrium with PL values. See the method document on statistical tests for a more detailed explanation of this statistical test.
* *This variant-level annotation compares the likelihoods of reads to their best haplotype match, between reads that support the reference allele and those that support the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower likelihoods to their best haplotype match than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher likelihoods to their best haplotype match than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.
* diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java index b957619f3..1cc87240b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java @@ -72,7 +72,7 @@ import java.util.*; /** * Likelihood of being a Mendelian Violation * - *This annotation uses the likelihoods of the genotype calls to assess whether a site is transmitted from parents to offspring according to Mendelian rules. The output is the likelihood of the site being a Mendelian violation, which can be tentatively interpreted either as an indication of error (in the genotype calls) or as a possible
This annotation uses the likelihoods of the genotype calls to assess whether a site is transmitted from parents to offspring according to Mendelian rules. The output is the likelihood of the site being a Mendelian violation, which can be tentatively interpreted either as an indication of error (in the genotype calls) or as a possible
This annotation considers all possible combinations of all possible genotypes (homozygous-reference, heterozygous, and homozygous-variant) for each member of a trio, which amounts to 27 possible combinations. Using the Phred-scaled genotype likelihoods (PL values) from each individual, the likelihood of each combination is calculated, and the result contributes to the likelihood of the corresponding case (mendelian violation or non-violation) depending on which set it belongs to. See the method document on statistical tests for a more detailed explanation of this statistical test.
@@ -82,7 +82,7 @@ import java.util.*; *This variant-level annotation compares the mapping qualities of the reads supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower mapping quality scores than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher mapping quality scores than those supporting the reference allele.
*This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants. diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java index b3027a695..d9ae3289a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java @@ -78,6 +78,9 @@ import java.util.Map; * *
This anotation gives you the count of all reads that have MAPQ = 0 across all samples. The count of reads with MAPQ0 can be used for quality control; high counts typically indicate regions where it is difficult to make confident calls.
* + *This annotation is excluded by HaplotypeCaller because HC filters out all reads with MQ0 upfront, so the annotation would always return a value of 0 anyway.
+ * *The calculation only takes into account coverage from samples genotyped as having the variant allele(s). This removes the influence of any homozygous-reference samples that might be present in the same cohort, which would otherwise penalize the call unfairly.
* - *This annotation can only be calculated for sites for which at least one sample was genotyped as carrying a variant allele.
* *This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and alternate alleles. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported.
* diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java index 7b156f335..c1c226a81 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java @@ -69,9 +69,9 @@ import java.util.List; import java.util.Map; /** - * List of samples that are polymorphic at a given site + * List samples that are non-reference at a given site * - *The output is a list of the samples that are genotyped as having one or more variant alleles. This allows you to easily determine which samples are polymorphic and compare them to samples that are homozygous-reference.
+ *The output is a list of the samples that are genotyped as having one or more variant alleles. This allows you to easily determine which samples are non-reference (heterozygous or homozygous-variant) and compare them to samples that are homozygous-reference.
*/ public class SampleList extends InfoFieldAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandAlleleCountsBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandAlleleCountsBySample.java index a843f9f05..21632b5eb 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandAlleleCountsBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandAlleleCountsBySample.java @@ -73,9 +73,9 @@ import java.util.List; import java.util.Map; /** - * Number of forward and reverse reads that support each allele (including REF) + * Number of forward and reverse reads that support each allele * - *The StrandAlleleCountsBySample annotation produces read counts per allele and per strand. Note that, as with the AD annotation, the allele counts here should not be used to make assumptions about the called genotype.
+ *The StrandAlleleCountsBySample annotation produces read counts per allele (including REF) and per strand. Note that, as with the AD annotation, the allele counts here should not be used to make assumptions about the called genotype.
* *This annotation produces 2 values per allele at each site, corresponding to the number of reads that support the following (in that order):
*Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandOddsRatio annotation is one of several methods that aims to evaluate whether there is strand bias in the data. It is an updated form of the Fisher Strand Test that is better at taking into account large amounts of data in high coverage situations. It is used to determine if there is strand bias between forward and reverse strands for the reference or alternate allele.
* *Odds Ratios in the 2x2 contingency table below are + *
Odds Ratios in the 2x2 contingency table below are
* * $$ R = \frac{X[0][0] * X[1][1]}{X[0][1] * X[1][0]} $$ * - * and its inverse: + *and its inverse:
* *| + strand | - strand | |
| ALT; | X[1][0] | X[1][1] |
The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT (the sum makes it symmetric). A high value is indicative of large difference where one entry is very small compared to the others. A scale factor of refRatio/altRatio where
+ * * $$ refRatio = \frac{max(X[0][0], X[0][1])}{min(X[0][0], X[0][1} $$ - * and + * + *and
+ * * $$ altRatio = \frac{max(X[1][0], X[1][1])}{min(X[1][0], X[1][1]} $$ - * ensures that the annotation value is large only. - * + * + *ensures that the annotation value is large only.
+ * *See the method document on statistical tests for a more detailed explanation of this statistical test.
* *The calculation is based on the derivation described in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT.
* - *Note that this annotation requires a valid ped file.
- * *The tool can take up to three different sets of recalibration tables. * The resulting plots will be overlaid on top of each other to make - * comparisons easy. + * comparisons easy.
* + *| Set | Argument | Label | Color | Description |
|---|---|---|---|---|
| Original | -before | BEFORE | Maroon1 | + *|
| Original | -before | BEFORE | Pink | *First pass recalibration - * tables obtained from applying {@link BaseRecalibration} + * tables obtained from applying BaseRecalibration * on the original alignment. |
| Recalibrated | -after | AFTER | Blue | *Second pass recalibration tables - * results from the application of {@link BaseRecalibration} + * results from the application of BaseRecalibration * on the alignment recalibrated using the first pass tables |
| Input | -BQSR | BQSR | Black | *Any recalibration table without a specific role |
You need to specify at least one set. Multiple sets need to have the same values for the following parameters:
*
* covariate (order is not important), no_standard_covs, run_without_dbsnp, solid_recal_mode,
* solid_nocall_strategy, mismatches_context_size, mismatches_default_quality, deletions_default_quality,
* insertions_default_quality, maximum_cycle_value, low_quality_tail, default_platform, force_platform,
* quantizing_levels and binary_tag_name
+ *
A pdf document with plots that show the quality of the recalibration, and an optional csv file that contains a table with all the data required to generate those plots.
* - ** java -jar GenomeAnalysisTK.jar \ @@ -157,8 +147,8 @@ import java.util.Map; * ** - * # You can ignore the before/after semantics completely if you like (if you do add -ignoreLMT - * # to avoid a possible warning), but all tables should have been generated using the same parameters. + * # You can ignore the before/after semantics completely if you like (if you do, add -ignoreLMT + * # to avoid a possible warning), but all tables must have been generated using the same parameters. * * java -jar GenomeAnalysisTK.jar \ * -T AnalyzeCovariates \ @@ -173,31 +163,29 @@ import java.util.Map; *Full BQSR quality assessment pipeline
* *- * # Generate the first pass recalibration table file. + * # Generate the first pass recalibration table file * java -jar GenomeAnalysisTK.jar \ * -T BaseRecalibrator \ - * -R myreference.fasta \ + * -R reference.fasta \ * -I myinput.bam \ - * -knownSites bundle/my-trusted-snps.vcf \ # optional but recommendable - * -knownSites bundle/my-trusted-indels.vcf \ # optional but recommendable - * ... other options + * -knownSites bundle/my-trusted-snps.vcf \ # optional but recommended + * -knownSites bundle/my-trusted-indels.vcf \ # optional but recommended * -o firstpass.table * - * # Generate the second pass recalibration table file. + * # Generate the second pass recalibration table file * java -jar GenomeAnalysisTK.jar \ * -T BaseRecalibrator \ - * -BQSR firstpass.table \ - * -R myreference.fasta \ + * -R reference.fasta \ * -I myinput.bam \ * -knownSites bundle/my-trusted-snps.vcf \ * -knownSites bundle/my-trusted-indels.vcf \ - * ... other options \ + * -BQSR firstpass.table \ * -o secondpass.table * - * # Finally generate the plots and also keep a copy of the csv (optional). + * # Finally generate the plots and also keep a copy of the csv (optional) * java -jar GenomeAnalysisTK.jar \ * -T AnalyzeCovariates \ - * -R myrefernce.fasta \ + * -R reference.fasta \ * -before firstpass.table \ * -after secondpass.table \ * -csv BQSR.csv \ # optional @@ -251,14 +239,14 @@ public final class AnalyzeCovariates extends RodWalker+ * * This field value is resolved by {@link #initialize()}. */ protected File bqsrFile = null; /** * Checks inputs and argument values. - * + * * Notice that this routine will not validate the content of files. It may have some minor side effects as * the output of warning messages back to the user. * @@ -370,7 +358,6 @@ public final class AnalyzeCovariates extends RodWalker * If plotsFileisnull, it does not perform any plotting. * * @param csvFile the intermediary csv file. 
@@ -453,9 +440,9 @@ public final class AnalyzeCovariates extends RodWalker+ * * The key is the role and the value the corresponding report file. - * + * * Roles: "Before" (recalibration), "After" (recalibration), "BQSR" (the tool standard argument recalibration file) * * @return never null@@ -523,7 +510,7 @@ public final class AnalyzeCovariates extends RodWalker+ * * This is the the one specified by the user if any or a temporary file * that will be deleted as soon as the VM exists by default. * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java index d29f8931c..2c6744f97 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java @@ -86,49 +86,49 @@ import java.util.Arrays; import java.util.List; /** - * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context). + * Generate base recalibration table to compensate for systematic errors * * - * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating + * This tool is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative - * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group, - * reported quality score, cycle, and context). 
Since there is a large amount of data one can then calculate an empirical + * of poor base quality. This tool generates tables based on various user-specified covariates (such as read group, + * reported quality score, cycle, and context). Since there is a large amount of data, one can then calculate an empirical * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations. * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score). + *
*- * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified. - * - *
+ * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added regardless of whether + * or not they were specified. + *
* *Input
*- * The input read data whose base quality scores need to be assessed. + * A BAM file containing data that needs to be recalibrated. *
- * A database of known polymorphic sites to skip over. + * A database of known polymorphic sites to mask out. *
* *Output
- *- * A GATK Report file with many tables: - *
+ *
- * - * The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table. + * + *A GATKReport file with many tables:
+ **
- The list of arguments
*- The quantized qualities table
*- The recalibration table by read group
*- The recalibration table by quality score
*- The recalibration table for all the optional covariates
- *+ * The GATKReport table format is intended to be easy to read by both humans and computer languages (especially R). + * Check out the documentation of the GATKReport (in the FAQs) to learn how to manipulate this table. *
* - *Examples
+ *Usage example
*- * java -Xmx4g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T BaseRecalibrator \ + * -R reference.fasta \ * -I my_reads.bam \ - * -R resources/Homo_sapiens_assembly18.fasta \ - * -knownSites bundle/hg18/dbsnp_132.hg18.vcf \ - * -knownSites another/optional/setOfSitesToMask.vcf \ + * -knownSites latest_dbsnp.vcf \ * -o recal_data.table **/ @@ -139,16 +139,16 @@ import java.util.List; @PartitionBy(PartitionType.READ) public class BaseRecalibrator extends ReadWalkerimplements NanoSchedulable { /** - * all the command line arguments for BQSR and it's covariates + * all the command line arguments for BQSR and its covariates */ @ArgumentCollection private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); /** - * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency + * When you use nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency * purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag * to safely access only one table. There may be some CPU cost, but as long as the table is really big - * there should be relatively little CPU costs. + * the cost should be relatively reasonable. */ @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false) public boolean lowMemoryMode = false; @@ -171,7 +171,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche private int minimumQToUse; - private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."; + private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to mask out known variant sites. 
Please provide a VCF file containing known sites of genetic variation."; private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java index d5683504c..81c080d17 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java @@ -71,17 +71,16 @@ import java.util.LinkedList; import java.util.Map; /** - * Simple walker to plot the coverage distribution per base + * Evaluate coverage distribution per base * * - * Features of this walker: - *
includes a smart counting of uncovered bases without visiting the uncovered loci - *includes reads with deletions in the loci (optionally can be turned off) + * This tool reports the distribution of coverage per base. It includes reads with deletions in the counts unless + * otherwise specified. Quality filters can be applied before the coverage is calculated. * * *Input
*- * The BAM file and an optional interval list (works for WGS as well) + * The BAM file and an optional interval list *
* *Output
@@ -89,13 +88,13 @@ import java.util.Map; * A GATK Report with the coverage distribution per base * * - *Examples
+ *Usage example
*- * java -Xmx4g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ + * -R reference.fasta \ * -T BaseCoverageDistribution \ * -I myData.bam \ - * -L interesting.intervals \ + * -L intervals.list \ * -fd \ * -o report.grp *@@ -106,34 +105,34 @@ import java.util.Map; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class BaseCoverageDistribution extends LocusWalker, Map >> { /** - * The output GATK Report table + * The name of the file to output the GATK Report table. See the FAQs for more information on the GATK Report format. */ - @Output(doc = "The output GATK Report table") + @Output(doc = "Output filename") private PrintStream out; /** * Whether or not a deletion should be counted towards the coverage of a site */ - @Argument(required = false, shortName="del", fullName = "include_deletions", doc ="whether or not to include reads with deletions on the loci in the pileup") + @Argument(required = false, shortName="del", fullName = "include_deletions", doc ="Include reads with deletions") private boolean includeDeletions = true; /** - * Whether or not to calculate and output a filtered coverage distribution. Bases will be filtered according to the + * Whether or not to apply quality filters before calculating coverage distribution. Filtering will use the * minimum_mapping_quality and minimum_base_quality parameters below. 
*/ - @Argument(required = false, shortName="fd", fullName = "filtered_distribution", doc ="calculate and report the filtered coverage distribution of bases") + @Argument(required = false, shortName="fd", fullName = "filtered_distribution", doc ="Apply quality filters") private boolean calculateFilteredDistribution = false; /** * The minimum mapping quality a read must have to be counted towards the filtered coverage of a site */ - @Argument(required = false, shortName="mmq", fullName = "minimum_mapping_quality", doc ="minimum mapping quality of a read to include it in the filtered coverage distribution") + @Argument(required = false, shortName="mmq", fullName = "minimum_mapping_quality", doc ="Minimum read mapping quality of a read to pass filters") private byte minMappingQuality = 20; /** * The minimum base quality a base must have to be counted towards the filtered coverage of a site */ - @Argument(required = false, shortName="mbq", fullName = "minimum_base_quality", doc ="minimum base quality of a base to include it in the filtered coverage distribution") + @Argument(required = false, shortName="mbq", fullName = "minimum_base_quality", doc ="Minimum base quality to pass filters") private byte minBaseQuality = 17; private GenomeLoc previousLocus = null; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java index cb4dda9fb..6f1718430 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java @@ -69,9 +69,10 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import java.io.PrintStream; /** - * Outputs a list of intervals that are covered above a given 
threshold. + * Outputs a list of intervals that are covered above a given threshold * - * The list can be used as an interval list for other walkers. Note that if the -uncovered argument is given, the tool will instead output intervals that fail the coverage threshold.
+ *The output list can be used as an interval list for other tools. Note that if the -uncovered argument is given, the + * logic will be inverted and the tool will instead output intervals that fail the coverage threshold.
* *Input
*@@ -85,9 +86,9 @@ import java.io.PrintStream; * *
Example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T FindCoveredIntervals \ - * -R ref.fasta \ + * -R reference.fasta \ * -I my_file.bam \ * -o output.list *diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java index 5715f3b87..56097e625 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java @@ -77,44 +77,36 @@ import java.io.PrintStream; import java.util.*; /** - * Analyzes coverage distribution and validates read mates for a given interval and sample. - * + * Analyze coverage distribution and validate read mates per interval and per sample + * *- * Used to diagnose regions with bad coverage, mapping, or read mating. Analyzes each sample independently in addition - * to interval wide analysis. + * This tool is useful for diagnosing regions with bad coverage, mapping, or read mate pairs. It analyzes each sample + * independently and aggregates results over intervals of interest. *
- * - * *Input
- **
*
- * - * *- A reference file
*- one or more input BAMs
*- One or more intervals
*Output
*- * A modified VCF detailing each interval by sample and information for each interval according to the thresholds used. - * Interval information includes GC Content, average interval depth, callable status among others. - * - * If you use the --missing option, you can get as a second output a intervals file with the loci that have missing data. + * Interval information includes GC Content, average interval depth, callable status among others. If you use the + * --missing option, you can get as a second output an intervals file with the loci that have missing data. * This file can then be used as input to QualifyMissingIntervals for full qualification and interpretation of why * the data is missing. *
- * - *Examples
+ *Usage example
*- * java - * -jar GenomeAnalysisTK.jar + * java -jar GenomeAnalysisTK.jar \ * -T DiagnoseTargets \ * -R reference.fasta \ - * -o output.vcf \ * -I sample1.bam \ * -I sample2.bam \ * -I sample3.bam \ - * -L intervals.interval_list + * -L intervals.interval_list \ + * -o output.vcf ** * @author Mauricio Carneiro, Roger Zurawicki diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java index d2bca0222..36cf28696 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java @@ -74,9 +74,9 @@ import java.io.PrintStream; import java.util.List; /** - * Walks along reference and calculates a few metrics for each interval. + * Collect quality metrics for a set of intervals * - * Metrics: + *This tool collects the following metrics:
**
* + *- Average Base Quality
*- Average Mapping Quality
@@ -88,9 +88,11 @@ import java.util.List; *- Length of the uncovered interval
*It is meant to be run on a set of intervals that have been identified as problematic in earlier stages of quality control and are considered "missing" from the sequence dataset.
+ * *Input
*- * A reference file (for GC content), the input bam file (for base and mapping quality calculation), the missing intervals (in the -L), the baits/targets used to sequence (in the -targets) and a bed file with the coding sequence intervals of the genome (in the -cds) + * A reference file (for GC content), the input bam file (for base and mapping quality calculation), the missing intervals (in the -L), the baits/targets used to sequence (in the -targets) and a bed file with the coding sequence intervals of the genome (in the -cds). *
* *Output
@@ -98,11 +100,11 @@ import java.util.List; * GC content, distance from the end of the target, coding sequence intersection, mapping and base quality averages and average depth per "missing" interval. * * - *Example
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T QualifyMissingIntervals \ - * -R ref.fasta \ + * -R reference.fasta \ * -I input.bam \ * -o output.grp \ * -L input.intervals \ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java index 31999f522..cb9827e2d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java @@ -87,15 +87,13 @@ import java.io.PrintStream; import java.util.*; /** - * A variant caller which unifies the approaches of several disparate callers -- Works for single-sample and multi-sample data. + * Call SNPs and indels on a per-locus basis * *- * The GATK Unified Genotyper is a multiple-sample, technology-aware SNP and indel caller. It uses a Bayesian genotype - * likelihood model to estimate simultaneously the most likely genotypes and allele frequency in a population of N samples, - * emitting an accurate posterior probability of there being a segregating variant allele at each locus as well as for the - * genotype of each sample. The system can either emit just the variant sites or complete genotypes (which includes - * homozygous reference calls) satisfying some phred-scaled confidence value. The genotyper can make accurate calls on - * both single sample data and multi-sample data. + * This tool uses a Bayesian genotype likelihood model to estimate simultaneously the most likely genotypes and + * allele frequency in a population of N samples, emitting a genotype for each sample. 
The system can either emit + * just the variant sites or complete genotypes (which includes homozygous reference calls) satisfying some + * phred-scaled confidence value. *
* *Input
@@ -108,48 +106,43 @@ import java.util.*; * A raw, unfiltered, highly sensitive callset in VCF format. * * - *Example generic command for multi-sample SNP calling
+ *Usage examples
+ *Multi-sample SNP calling
** java -jar GenomeAnalysisTK.jar \ - * -R resources/Homo_sapiens_assembly18.fasta \ * -T UnifiedGenotyper \ + * -R reference.fasta \ * -I sample1.bam [-I sample2.bam ...] \ * --dbsnp dbSNP.vcf \ * -o snps.raw.vcf \ * -stand_call_conf [50.0] \ * -stand_emit_conf 10.0 \ - * -dcov [50 for 4x, 200 for >30x WGS or Whole exome] \ * [-L targets.interval_list] ** - *- * The above command will call all of the samples in your provided BAM files [-I arguments] together and produce a VCF file - * with sites and genotypes for all samples. The easiest way to get the dbSNP file is from the GATK resource bundle (see Guide FAQs for details). Several - * arguments have parameters that should be chosen based on the average coverage per sample in your data. See the detailed - * argument descriptions below. - *
- * - *Example command for generating calls at all sites
+ *Generate calls at all sites
*- * java -jar /path/to/GenomeAnalysisTK.jar \ - * -l INFO \ - * -R resources/Homo_sapiens_assembly18.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T UnifiedGenotyper \ - * -I /DCC/ftp/pilot_data/data/NA12878/alignment/NA12878.SLX.maq.SRP000031.2009_08.bam \ - * -o my.vcf \ + * -R reference.fasta \ + * -I input.bam \ + * -o raw_variants.vcf \ * --output_mode EMIT_ALL_SITES ** *Caveats
*- *
* + *- The system is under active and continuous development. All outputs, the underlying likelihood model, arguments, and - * file formats are likely to change.
- *- The system can be very aggressive in calling variants. In the 1000 genomes project for pilot 2 (deep coverage of ~35x) - * we expect the raw Qscore > 50 variants to contain at least ~10% FP calls. We use extensive post-calling filters to eliminate - * most of these FPs. Variant Quality Score Recalibration is a tool to perform this filtering.
- *- The generalized ploidy model can be used to handle non-diploid or pooled samples (see the -ploidy argument in the table below).
+ *- The caller can be very aggressive in calling variants in order to be very sensitive, so the raw output will + * contain many false positives. We use extensive post-calling filters to eliminate most of these FPs. See the documentation on filtering (especially by Variant Quality Score Recalibration) for more details.
+ *- This tool has been deprecated in favor of HaplotypeCaller, a much more sophisticated variant caller that + * produces much better calls, especially on indels, and includes features that allow it to scale to much larger + * cohort sizes.
*Special note on ploidy
+ *This tool is able to handle almost any ploidy (except very high ploidies in large pooled experiments); the ploidy + * can be specified using the -ploidy argument for non-diploid organisms.
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 81ebace46..7fceeefff 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -139,8 +139,6 @@ import java.util.*; * read data to calculate the likelihoods of each genotype per sample given the read data observed for that * sample. The most likely genotype is then assigned to the sample. * - * - *
*Input
** Input bam file(s) from which to make calls @@ -148,13 +146,13 @@ import java.util.*; * *
Output
*- * VCF file with raw, unfiltered SNP and indel calls. These must be filtered either by variant recalibration (best) - * or hard-filtering before use in downstream analyses. If using the reference-confidence model workflow for cohort - * analysis, the output is a GVCF file that must first be run through GenotypeGVCFs and then filtering before further - * analysis. + * Either a VCF or gVCF file with raw, unfiltered SNP and indel calls. Regular VCFs must be filtered either by variant + * recalibration (best) or hard-filtering before use in downstream analyses. If using the reference-confidence model + * workflow for cohort analysis, the output is a GVCF file that must first be run through GenotypeGVCFs and then + * filtering before further analysis. *
* - *Examples
+ *Usage examples
* *These are example commands that show how to run HaplotypeCaller for typical use cases. Square brackets ("[ ]") * indicate optional arguments. Note that parameter values shown here may not be the latest recommended; see the @@ -162,12 +160,11 @@ import java.util.*; * *
*Single-sample all-sites calling on DNAseq (for `-ERC GVCF` cohort analysis workflow)
- **
* java * -jar GenomeAnalysisTK.jar * -T HaplotypeCaller - * -R reference/human_g1k_v37.fasta + * -R reference.fasta * -I sample1.bam \ * --emitRefConfidence GVCF \ * --variant_index_type LINEAR \ @@ -176,15 +173,13 @@ import java.util.*; * [-L targets.interval_list] \ * -o output.raw.snps.indels.g.vcf *- * * *Variant-only calling on DNAseq
- **
* java * -jar GenomeAnalysisTK.jar * -T HaplotypeCaller - * -R reference/human_g1k_v37.fasta + * -R reference.fasta * -I sample1.bam [-I sample2.bam ...] \ * [--dbsnp dbSNP.vcf] \ * [-stand_call_conf 30] \ @@ -192,23 +187,19 @@ import java.util.*; * [-L targets.interval_list] \ * -o output.raw.snps.indels.vcf *- * * *Variant-only calling on RNAseq
- **
* java * -jar GenomeAnalysisTK.jar * -T HaplotypeCaller - * -R reference/human_g1k_v37.fasta + * -R reference.fasta * -I sample1.bam \ - * -dontUseSoftClippedBases \ * [--dbsnp dbSNP.vcf] \ * -stand_call_conf 20 \ * -stand_emit_conf 20 \ * -o output.raw.snps.indels.vcf *- * * *Caveats
*@@ -218,6 +209,10 @@ import java.util.*; * parallelize HaplotypeCaller instead of multithreading. *
* + *Special note on ploidy
+ *This tool is able to handle almost any ploidy (except very high ploidies in large pooled experiments); the ploidy + * can be specified using the -ploidy argument for non-diploid organisms.
+ * *Additional Notes
**
- When working with PCR-free data, be sure to set `-pcr_indel_model NONE` (see argument below).
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java index 3dae25427..8bf383240 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java @@ -82,16 +82,17 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; import java.util.*; /** - * Haplotype-based resolution of variants in 2 different eval files. + * Haplotype-based resolution of variants in separate callsets. * *- * HaplotypeResolver is a tool that takes 2 VCF files and constructs haplotypes based on the variants inside them. + * HaplotypeResolver is a tool that takes two VCF files and constructs haplotypes based on the variants inside them. * From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants. * Records are annotated with the set and status attributes. + *
* *Input
*- * 2 variant files to resolve. + * Two variant files to resolve. *
* *Output
@@ -99,11 +100,11 @@ import java.util.*; * A single consensus VCF. * * - *Examples
+ *Usage example
*- * java -Xmx1g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T HaplotypeResolver \ + * -R reference.fasta \ * -V:v1 input1.vcf \ * -V:v2 input2.vcf \ * -o output.vcf diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java index 6cd78be45..75df30550 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java @@ -91,10 +91,10 @@ import java.io.IOException; import java.util.*; /** - * Performs local realignment of reads to correct misalignments due to the presence of indels. + * Perform local realignment of reads around indels * *- * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * The local realignment process is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, @@ -105,12 +105,13 @@ import java.util.*; * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and * specifically identify indels. *
- *There are 2 steps to the realignment process: + *
There are 2 steps to the realignment process:
+ **
*- Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
*- Running the realigner over those intervals (IndelRealigner)
*- * For more details, see http://www.broadinstitute.org/gatk/guide/article?id=38 + * For more details, see the indel realignment method documentation. *
* *Input
@@ -123,26 +124,24 @@ import java.util.*; * A realigned version of your input BAM file(s). * * - *Example
+ *Usage example
*- * java -Xmx4g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T IndelRealigner \ - * -R ref.fasta \ + * -R reference.fasta \ * -I input.bam \ + * --known indels.vcf \ * -targetIntervals intervalListFromRTC.intervals \ - * -o realignedBam.bam \ - * [-known /path/to/indels.vcf] \ - * [-compress 0] (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value) + * -o realignedBam.bam ** *Caveats
- * - *+ *
- - * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. - *
- - * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them - * (or with reads from similar technologies). - *
+ *
* * @author ebanks */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java index e6fd8d13b..a6afa812a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java @@ -66,30 +66,35 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; /** - * Left-aligns indels from reads in a bam file. + * Left-align indels within reads in a bam file * - *- The input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step.
+ *- Because reads produced from the 454 technology inherently contain false indels, the realigner will not work with them + * (or with reads from similar technologies).
+ *- This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string.
+ *- * LeftAlignIndels is a tool that takes a bam file and left-aligns any indels inside it. The same indel can often be - * placed at multiple positions and still represent the same haplotype. While a standard convention is to place an - * indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. + *
This tool left-aligns any indels within read cigars in order to standardize representation when there are multiple valid + * representations possible (i.e. where the same indel can be placed at multiple positions and still represent the same haplotype). + * The standard convention is to place an indel at the left-most position possible, but this is not always followed, so + * this tool can be used to correct the representation of indels.
+ * + *Note
*This is only really needed when calling variants with legacy locus-based tools such as UnifiedGenotyper. With more + * sophisticated tools (like HaplotypeCaller) that involve reconstructing haplotypes (e.g. through reassembly), the problem + * of multiple valid representations is handled internally and does not need to be corrected explicitly.
* *Input
*- * A bam file to left-align. + * A bam file with mapped reads. *
* *Output
*- * A left-aligned bam. + * A bam file in which indels have been left-aligned where appropriate. *
* - *Examples
+ *Usage example
*- * java -Xmx3g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ + * -R reference.fasta \ * -T LeftAlignIndels \ - * -I input.bam \ - * -o output.vcf + * -I reads.bam \ + * -o output_with_leftaligned_indels.bam ** */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java index 186296b1e..a81af2e5a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java @@ -77,54 +77,59 @@ import java.util.List; import java.util.TreeSet; /** - * Emits intervals for the Local Indel Realigner to target for realignment. + * Define intervals to target for local realignment * *- * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * The local realignment process is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, * it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. 
Consequently, even when some reads are * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus - * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an - * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and - * specifically identify indels. - *
- *
There are 2 steps to the realignment process: + * indel suitable for standard variant discovery approaches. Unlike most mappers, this tool uses the full alignment context to determine whether an + * appropriate alternate reference (i.e. indel) exists. + * + *
There are 2 steps to the realignment process:
+ **
*- Determining (small) suspicious intervals which are likely in need of realignment (RealignerTargetCreator)
*- Running the realigner over those intervals (see the IndelRealigner tool)
*- * Important note 1: the input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. *
- * Important note 2: when multiple potential indels are found by the tool in the same general region, the tool will choose the most likely - * one for realignment to the exclusion of the others. This is a known limitation of the tool. - *
- * Important note 3: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them - * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. + * For more details, see the indel realignment method documentation. + *
* - *Input
+ *Inputs
*- * One or more aligned BAM files and optionally one or more lists of known indels. + * One or more aligned BAM files and optionally, one or more lists of known indels. *
* *Output
*- * A list of target intervals to pass to the Indel Realigner. + * A list of target intervals to pass to the IndelRealigner. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T RealignerTargetCreator \ - * -R ref.fasta \ + * -R reference.fasta \ * -I input.bam \ - * -o forIndelRealigner.intervals \ - * [--known /path/to/indels.vcf] + * --known indels.vcf \ + * -o forIndelRealigner.intervals ** - * @author ebanks + *Notes
+ *+ *
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, BadMateFilter.class, Platform454Filter.class}) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java index a2bd36ad3..709711064 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java @@ -78,17 +78,25 @@ import java.io.PrintStream; import java.util.*; /** - * Computes the most likely genotype combination and phases trios and parent/child pairs + * Compute the most likely genotype combination and phasing for trios and parent/child pairs * *- The input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step.
+ *- When multiple potential indels are found by the tool in the same general region, the tool will choose the most likely + * one for realignment to the exclusion of the others. This is a known limitation of the tool.
+ *- Because reads produced from the 454 technology inherently contain false indels, the realigner will not work with them + * (or with reads from similar technologies).
+ *- This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string.
+ *- * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases - * all sites were parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability. - * Ambiguous sites are: + * This tool performs two functions: + *
+ *+ *
+ * + *- Compute the most likely genotype combination of trios and parent/child pairs given their genotype likelihoods and a mutation prior;
+ *- Phase all sites where parent/child transmission can be inferred unambiguously.
+ *The tool ultimately reports the genotype combination (and hence phasing) probability.
+ * + *Ambiguous sites are:
**
- * Missing genotypes are handled as follows: + * + *- Sites where all individuals are heterozygous
*- Sites where there is a Mendelian violation
*Missing genotypes are handled as follows:
**
* * - *- In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
*- In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
@@ -104,26 +112,26 @@ import java.util.*; *Options
- *+ *
Important options
*- *
- * * *- MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype - * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability, - * and each individual genotype, depth, allelic depth and likelihoods.
+ *- MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that + * remain in mendelian violation after being assigned the most likely genotype combination will be reported + * there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission + * probability, and each individual genotype, depth, allelic depth and likelihoods.
*- DeNovoPrior: Mutation prio; default is 1e-8
*Output
*- * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non ambiguous.. + * A VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent (where non + * ambiguous). *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T PhaseByTransmission \ + * -R reference.fasta \ * -V input.vcf \ * -ped input.ped \ * -o output.vcf diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java index c501d842a..ad46e191b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java @@ -87,13 +87,17 @@ import java.util.*; import static org.broadinstitute.gatk.engine.GATKVCFUtils.getVCFHeadersFromRods; /** - * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). + * Annotate physical phasing information * - * The current implementation works for diploid SNPs, and will transparently (but properly) ignore other sites. + *This tool identifies haplotypes based on the overlap between reads and uses this information to generate physical + * phasing information for variants within these haplotypes.
* - * The underlying algorithm is based on building up 2^n local haplotypes, - * where n is the number of heterozygous SNPs in the local region we expected to find phase-informative reads (and assumes a maximum value of maxPhaseSites, a user parameter). - * Then, these 2^n haplotypes are used to determine, with sufficient certainty (the assigned PQ score), to which haplotype the alleles of a genotype at a particular locus belong (denoted by the HP tag). + *It operates by walking along all variant ROD loci, caching a user-defined window of VariantContext sites, and + * then finishes phasing them when they go out of range (using upstream and downstream reads). The underlying algorithm + * is based on building up 2^n local haplotypes, where n is the number of heterozygous SNPs in the local region we + * expected to find phase-informative reads (and assumes a maximum value of maxPhaseSites, a user parameter). Then, + * these 2^n haplotypes are used to determine, with sufficient certainty (the assigned PQ score), to which haplotype + * the alleles of a genotype at a particular locus belong (denoted by the HP tag).
* ** Performs physical phasing of SNP calls, based on sequencing reads. @@ -109,19 +113,21 @@ import static org.broadinstitute.gatk.engine.GATKVCFUtils.getVCFHeadersFromRods; * Phased VCF file. *
* - *Examples
+ *Usage example
*- * java - * -jar GenomeAnalysisTK.jar - * -T ReadBackedPhasing - * -R reference.fasta - * -I reads.bam - * --variant SNPs.vcf - * -L SNPs.vcf - * -o phased_SNPs.vcf + * java -jar GenomeAnalysisTK.jar \ + * -T ReadBackedPhasing \ + * -R reference.fasta \ + * -I reads.bam \ + * --variant SNPs.vcf \ + * -L SNPs.vcf \ + * -o phased_SNPs.vcf \ * --phaseQualityThresh 20.0 ** + *Caveat
+ *The current implementation works for diploid SNPs, and will transparently (but properly) ignore other sites.
+ * * @author Menachem Fromer * @since July 2010 */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java index 9958a4f7b..4aaaf9c77 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java @@ -82,16 +82,37 @@ import java.util.ArrayList; /** * - * Splits reads that contain Ns in their cigar string (e.g. spanning splicing events). + * Splits reads that contain Ns in their CIGAR string * - * Identifies all N cigar elements and creates k+1 new reads (where k is the number of N cigar elements). - * The first read includes the bases that are to the left of the first N element, while the part of the read that is to the right of the N - * (including the Ns) is hard clipped and so on for the rest of the new reads. + *This tool identifies all N cigar elements in sequence reads, and creates k+1 new reads + * (where k is the number of N cigar elements) that correspond to the segments of the original read beside/between + * the splicing events represented by the Ns in the original CIGAR. The first read includes the bases that are to the + * left of the first N element, while the part of the read that is to the right of the N (including the Ns) is hard + * clipped, and so on for the rest of the new reads.
* + *Input
+ *+ * One or more bam files. + *
+ * + *Output
+ *+ * A single processed bam file. + *
+ * + *Usage example
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T SplitNCigarReads \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.bam \ + * -U ALLOW_N_CIGARS + * + *Note
+ *When this tool is used as part of the RNAseq best practices, the command should include mapping quality + * reassignment. See the Best Practices documentation for details.
* - * User: ami - * Date: 11/14/13 - * Time: 11:52 AM */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java index d8350d9e4..d2ad91e97 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java @@ -79,22 +79,24 @@ import htsjdk.variant.vcf.VCFConstants; import java.util.*; /** - * Generates simulated reads for variants + * Generate simulated reads for variants * *Given a set of variants, this tool will generate simulated reads that support the input variants.
* - *Caveats
- *For practical reasons, only bi-allelic variants that are not too close to the ends of contigs (< 1/2 read length) are supported; all others will simply be ignored.
+ *Caveat
+ *For practical reasons, only bi-allelic variants that are not too close to the ends of contigs + * (< 1/2 read length) are supported; all others will simply be ignored.
* *Input
*A VCF file containing variants.
* *Output
- *A BAM file containing simulated sequence reads that support the input variants, with the requested error rate and coverage depth.
+ *A BAM file containing simulated sequence reads that support the input variants, with the requested error rate + * and coverage depth.
* - *Example
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T SimulateReadsForVariants \ * -R reference.fasta \ * -V input_variants.vcf \ @@ -105,7 +107,6 @@ import java.util.*; * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class}, gotoDev = HelpConstants.EB) - @Reference(window=@Window(start=-200,stop=200)) public class SimulateReadsForVariants extends RodWalker{ private static Logger logger = Logger.getLogger(SimulateReadsForVariants.class); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java index ddf0760f5..09346919a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java @@ -78,7 +78,7 @@ import java.util.Set; import static org.broadinstitute.gatk.utils.IndelUtils.isInsideExtendedIndel; /** - * Genotypes a dataset and validates the calls of another dataset using the Unified Genotyper. + * Genotype and validate a dataset and the calls of another dataset using the Unified Genotyper * * Note that this is an old tool that makes use of the UnifiedGenotyper, which has since been * deprecated in favor of the HaplotypeCaller.
@@ -182,36 +182,30 @@ import static org.broadinstitute.gatk.utils.IndelUtils.isInsideExtendedIndel; * * * - *Examples
- *- *
- - * Genotypes BAM file from new technology using the VCF as a truth dataset: - *
- * + *Usage examples
+ *Genotypes BAM file from new technology using the VCF as a truth dataset
** java - * -jar /GenomeAnalysisTK.jar - * -T GenotypeAndValidate - * -R human_g1k_v37.fasta - * -I myNewTechReads.bam - * -alleles handAnnotatedVCF.vcf - * -L handAnnotatedVCF.vcf + * -jar GenomeAnalysisTK.jar \ + * -T GenotypeAndValidate \ + * -R reference.fasta \ + * -I myNewTechReads.bam \ + * -alleles handAnnotatedVCF.vcf \ + * -L handAnnotatedVCF.vcf \ + * -o output.vcf ** - *- - * Using a BAM file as the truth dataset: - *
- * + *Genotypes BAM file from new technology using a BAM file as the truth dataset
** java - * -jar /GenomeAnalysisTK.jar - * -T GenotypeAndValidate - * -R human_g1k_v37.fasta - * -I myTruthDataset.bam - * -alleles callsToValidate.vcf - * -L callsToValidate.vcf - * -bt - * -o gav.vcf + * -jar GenomeAnalysisTK.jar \ + * -T GenotypeAndValidate \ + * -R reference.fasta \ + * -I myTruthDataset.bam \ + * -alleles callsToValidate.vcf \ + * -L callsToValidate.vcf \ + * -bt \ + * -o output.vcf ** */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java index e31188beb..01572780d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java @@ -73,22 +73,28 @@ import java.util.*; /** - * Randomly selects VCF records according to specified options. + * Randomly select variant records according to specified options * *- * ValidationSiteSelectorWalker is intended for use in experiments where we sample data randomly from a set of variants, for example - * in order to choose sites for a follow-up validation study. + * This tool is intended for use in experiments where we sample data randomly from a set of variants, for example + * in order to choose sites for a follow-up validation study.
* - * Sites are selected randomly but within certain restrictions. There are two main sources of restrictions - * a) Sample restrictions. A user can specify a set of samples, and we will only consider sites which are polymorphic within such given sample subset. - * These sample restrictions can be given as a set of individual samples, a text file (each line containing a sample name), or a regular expression. - * A user can additionally specify whether samples will be considered based on their genotypes (a non-reference genotype means that such sample is polymorphic in that variant, - * and hence that variant will be considered for inclusion in set), or based on their PLs. - * b) A user can additionally specify a sampling method based on allele frequency. Two sampling methods are currently supported. - * 1. Uniform sampling will just sample uniformly from variants polymorphic in selected samples. - * 2. Sampling based on Allele Frequency spectrum will ensure that output sites have the same AF distribution as the input set. - * - * User can additionally restrict output to a particular type of variant (SNP, Indel, etc.) + *Sites are selected randomly but within certain restrictions. There are two main sources of restrictions:
+ *+ *
* *- Sample restrictions: A user can specify a set of samples, and we will only consider sites which are + * polymorphic within the given sample subset. These sample restrictions can be given as a set of individual + * samples, a text file (each line containing a sample name), or a regular expression. A user can additionally + * specify whether samples will be considered based on their genotypes (a non-reference genotype means that the + * sample is polymorphic in that variant, and hence that variant will be considered for inclusion in set), or + * based on their PLs.
+ *- Sampling methods: + *
+ *+ *
+ *- Uniform sampling will just sample uniformly from variants that are polymorphic in selected samples
+ *- Sampling based on Allele Frequency spectrum will ensure that output sites have the same AF distribution as the input set
+ *- Variant type (SNP, Indel, etc.)
+ *Input
*@@ -100,29 +106,30 @@ import java.util.*; * A sites-only VCF with the desired number of randomly selected sites. *
* - *Examples
+ *Usage examples
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T ValidationSiteSelectorWalker \ - * --variant input1.vcf \ - * --variant input2.vcf \ + * -R reference.fasta \ + * -V input1.vcf \ + * -V input2.vcf \ * -sn NA12878 \ * -o output.vcf \ * --numValidationSites 200 \ - * -sampleMode POLY_BASED_ON_GT \ + * -sampleMode POLY_BASED_ON_GT \ * -freqMode KEEP_AF_SPECTRUM - * - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *+ *+ * java -jar GenomeAnalysisTK.jar \ * -T ValidationSiteSelectorWalker \ - * --variant:foo input1.vcf \ - * --variant:bar input2.vcf \ + * -R reference.fasta \ + * -V:foo input1.vcf \ + * -V:bar input2.vcf \ * --numValidationSites 200 \ * -sf samples.txt \ * -o output.vcf \ * -sampleMode POLY_BASED_ON_GT \ - * -freqMode UNIFORM + * -freqMode UNIFORM \ * -selectType INDEL ** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java index bb063e956..2361ca64e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java @@ -113,11 +113,11 @@ import java.util.*; *- A recalibrated VCF file in which each variant of the requested type is annotated with its VQSLOD and marked as filtered if the score is below the desired quality level.
* * - *Example for filtering SNPs
+ *Usage example for filtering SNPs
** java -Xmx3g -jar GenomeAnalysisTK.jar \ * -T ApplyRecalibration \ - * -R reference/human_g1k_v37.fasta \ + * -R reference.fasta \ * -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \ * --ts_filter_level 99.0 \ * -tranchesFile path/to/output.tranches \ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java index dd0a4acff..8021db111 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java @@ -111,7 +111,7 @@ import java.util.*; *Inputs
**
* *- The input raw variants to be recalibrated.
- *- Known, truth, and training sets to be used by the algorithm.
+ *- Known, truth, and training sets to be used by the algorithm. See the method documentation for more details.
*Output
@@ -120,21 +120,22 @@ import java.util.*; *- A tranches file which shows various metrics of the recalibration callset for slices of the data.
* * - *Example for recalibrating SNPs in exome data
+ *Usage example
+ *Recalibrating SNPs in exome data:
** java -Xmx4g -jar GenomeAnalysisTK.jar \ * -T VariantRecalibrator \ - * -R reference/human_g1k_v37.fasta \ - * -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \ + * -R reference.fasta \ + * -input raw_variants.vcf \ * -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \ * -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \ * -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.vcf * -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \ * -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an InbreedingCoeff \ * -mode SNP \ - * -recalFile path/to/output.recal \ - * -tranchesFile path/to/output.tranches \ - * -rscriptFile path/to/output.plots.R + * -recalFile output.recal \ + * -tranchesFile output.tranches \ + * -rscriptFile output.plots.R ** *Caveats
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriors.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriors.java index 421cb8386..15b0ed0e8 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriors.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriors.java @@ -77,26 +77,20 @@ import htsjdk.variant.vcf.*; import java.util.*; /** - * Calculates genotype posterior likelihoods given panel data + * Calculate genotype posterior likelihoods given panel data * ** Given a VCF with genotype likelihoods from the HaplotypeCaller, UnifiedGenotyper, or another source which provides - * -unbiased- GLs, calculate the posterior genotype state and likelihood given allele frequency information from - * both the samples themselves and input VCFs describing allele frequencies in related populations. + * -unbiased- genotype likelihoods, calculate the posterior genotype state and likelihood given allele frequency + * information from both the samples themselves and input VCFs describing allele frequencies in related populations.
* - * VCFs to use for informing the genotype likelihoods (e.g. a population-specific VCF from 1000 genomes) should have - * at least one of: - * - AC field and AN field - * - MLEAC field and AN field - * - genotypes - * - * The AF field will not be used in this calculation as it does not provide a way to estimate the confidence interval + *The AF field will not be used in this calculation as it does not provide a way to estimate the confidence interval * or uncertainty around the allele frequency, while AN provides this necessary information. This uncertainty is * modeled by a Dirichlet distribution: that is, the frequency is known up to a Dirichlet distribution with * parameters AC1+q,AC2+q,...,(AN-AC1-AC2-...)+q, where "q" is the global frequency prior (typically q << 1). The * genotype priors applied then follow a Dirichlet-Multinomial distribution, where 2 alleles per sample are drawn * independently. This assumption of independent draws is the assumption Hardy-Weinberg Equilibrium. Thus, HWE is - * imposed on the likelihoods as a result of CalculateGenotypePosteriors. + * imposed on the likelihoods as a result of CalculateGenotypePosteriors.
* *Input
*@@ -104,26 +98,28 @@ import java.util.*; *
- A VCF with genotype likelihoods, and optionally genotypes, AC/AN fields, or MLEAC/AN fields
*- (Optional) A PED pedigree file containing the description of the individuals relationships.
* - * * * ** A collection of VCFs to use for informing allele frequency priors. Each VCF must have one of - * - AC field and AN field - * - MLEAC field and AN field - * - genotypes + *
+ *+ *
* * *- AC field and AN field
+ *- MLEAC field and AN field
+ *- genotypes
+ *Output
- *- * A new VCF with: - * 1) Genotype posteriors added to the genotype fields ("PP") - * 2) Genotypes and GQ assigned according to these posteriors - * 3) Per-site genotype priors added to the INFO field ("PG") - * 4) (Optional) Per-site, per-trio joint likelihoods (JL) and joint posteriors (JL) given as Phred-scaled probability + *
A new VCF with:
+ *+ *
* *- Genotype posteriors added to the genotype fields ("PP")
+ *- Genotypes and GQ assigned according to these posteriors
+ *- Per-site genotype priors added to the INFO field ("PG")
+ *- (Optional) Per-site, per-trio joint likelihoods (JL) and joint posteriors (JP) given as Phred-scaled probability * of all genotypes in the trio being correct based on the PLs for JL and the PPs for JP. These annotations are added to - * the genotype fields. - * + * the genotype fields.
+ *Notes
*@@ -135,51 +131,57 @@ import java.util.*; * the input callset. *
* - *Examples
+ *Usage examples
+ *Inform the genotype assignment of NA12878 using the 1000G Euro panel
*- * Inform the genotype assignment of NA12878 using the 1000G Euro panel - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T CalculateGenotypePosteriors \ + * -R reference.fasta \ * -V NA12878.wgs.HC.vcf \ * -supporting 1000G_EUR.genotypes.combined.vcf \ * -o NA12878.wgs.HC.posteriors.vcf \ * - * Refine the genotypes of a large panel based on the discovered allele frequency - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Refine the genotypes of a large panel based on the discovered allele frequency
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T CalculateGenotypePosteriors \ + * -R reference.fasta \ * -V input.vcf \ * -o output.withPosteriors.vcf + ** - * Apply frequency and HWE-based priors to the genotypes of a family without including the family allele counts - * in the allele frequency estimates - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Apply frequency and HWE-based priors to the genotypes of a family without including the family allele counts + * in the allele frequency estimates
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T CalculateGenotypePosteriors \ + * -R reference.fasta \ * -V input.vcf \ * -o output.withPosteriors.vcf \ * --ignoreInputSamples + ** - * Calculate the posterior genotypes of a callset, and impose that a variant *not seen* in the external panel - * is tantamount to being AC=0, AN=100 within that panel - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Calculate the posterior genotypes of a callset, and impose that a variant *not seen* in the external panel + * is tantamount to being AC=0, AN=100 within that panel
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T CalculateGenotypePosteriors \ + * -R reference.fasta \ * -supporting external.panel.vcf \ * -V input.vcf \ - * -o output.withPosteriors.vcf + * -o output.withPosteriors.vcf \ * --numRefSamplesIfNoCall 100 - * - * Apply only family priors to a callset - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ - * -T CalculateGenotypePosteriors \ - * -V input.vcf \ - * --skipPopulationPriors - * -ped family.ped - * -o output.withPosteriors.vcf + ** + *Apply only family priors to a callset
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T CalculateGenotypePosteriors \ + * -R reference.fasta \ + * -V input.vcf \ + * --skipPopulationPriors \ + * -ped family.ped \ + * -o output.withPosteriors.vcf ** */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java index 7e7926f3a..443c8d6b9 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java @@ -74,38 +74,39 @@ import htsjdk.variant.vcf.*; import java.util.*; /** - * Combines any number of gVCF files that were produced by the Haplotype Caller into a single joint gVCF file. + * Combine per-sample gVCF files produced by HaplotypeCaller into a multi-sample gVCF file * ** CombineGVCFs is meant to be used for hierarchical merging of gVCFs that will eventually be input into GenotypeGVCFs. * One would use this tool when needing to genotype too large a number of individual gVCFs; instead of passing them * all in to GenotypeGVCFs, one would first use CombineGVCFs on smaller batches of samples and then pass these combined - * gVCFs to GenotypeGVCFs. - * - * Note that this tool cannot work with just any gVCF files - they must have been produced with the Haplotype Caller - * as part of the "single sample discovery" pipeline using the '-ERC GVCF' mode, which uses a sophisticated reference - * model to produce accurate genotype likelihoods for every position in the target. + * gVCFs to GenotypeGVCFs.
* *Input
*- * One or more Haplotype Caller gVCFs to combine. + * Two or more Haplotype Caller gVCFs to combine. *
* *Output
*- * A combined VCF. + * A combined multisample gVCF. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T CombineGVCFs \ - * --variant gvcf1.vcf \ - * --variant gvcf2.vcf \ - * -o mergeGvcf.vcf + * -R reference.fasta \ + * --variant sample1.g.vcf \ + * --variant sample2.g.vcf \ + * -o cohort.g.vcf ** + *Caveat
+ *Only gVCF files produced by HaplotypeCaller (or CombineGVCFs) can be used as input for this tool. Some other + * programs produce files that they call gVCFs but those lack some important information (accurate genotype likelihoods + * for every position) that GenotypeGVCFs requires for its operation.
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=0,stop=1)) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index ea7c4671c..032ee5f0c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -85,22 +85,19 @@ import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import java.util.*; /** - * Genotypes any number of gVCF files that were produced by the Haplotype Caller into a single joint VCF file. + * Perform joint genotyping on gVCF files produced by HaplotypeCaller * *- * GenotypeGVCFs merges gVCF records that were produced as part of the reference model-based variant discovery pipeline (see documentation for more details) using - * the '-ERC GVCF' or '-ERC BP_RESOLUTION' mode of the HaplotypeCaller. This tool performs the multi-sample joint aggregation - * step and merges the records together in a sophisticated manner. - * - * At all positions of the target, this tool will combine all spanning records, produce correct genotype likelihoods, - * re-genotype the newly merged record, and then re-annotate it. - * - * Note that this tool cannot work with just any gVCF files - they must have been produced with the HaplotypeCaller, - * which uses a sophisticated reference model to produce accurate genotype likelihoods for every position in the target. 
+ * GenotypeGVCFs merges gVCF records that were produced as part of the Best Practices workflow for variant discovery + * (see Best Practices documentation for more details) using the '-ERC GVCF' or '-ERC BP_RESOLUTION' mode of the + * HaplotypeCaller, or result from combining such gVCF files using CombineGVCFs. This tool performs the multi-sample + * joint aggregation step and merges the records together in a sophisticated manner: at each position of the input + * gVCFs, this tool will combine all spanning records, produce correct genotype likelihoods, re-genotype the newly + * merged record, and then re-annotate it.
* *Input
*- * One or more Haplotype Caller gVCFs to genotype. + * One or more HaplotypeCaller gVCFs to genotype. *
* *Output
@@ -108,16 +105,25 @@ import java.util.*; * A combined, genotyped VCF. * * - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T GenotypeGVCFs \ - * --variant gvcf1.vcf \ - * --variant gvcf2.vcf \ + * -R reference.fasta \ + * --variant sample1.g.vcf \ + * --variant sample2.g.vcf \ * -o output.vcf ** + *Caveat
+ *Only gVCF files produced by HaplotypeCaller (or CombineGVCFs) can be used as input for this tool. Some other + * programs produce files that they call gVCFs but those lack some important information (accurate genotype likelihoods + * for every position) that GenotypeGVCFs requires for its operation.
+ * + *Special note on ploidy
+ *This tool is able to handle any ploidy (or mix of ploidies) intelligently; there is no need to specify ploidy + * for non-diploid organisms.
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=-10,stop=10)) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java index 8b94a56a6..134f5e514 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java @@ -84,13 +84,14 @@ import java.util.HashSet; import java.util.Set; /** - * Regenotypes the variants from a VCF. VCF records must contain PLs or GLs. + * Regenotypes the variants from a VCF containing PLs or GLs. * *- * This tool triggers re-genotyping of the samples through the Exact Allele Frequency calculation model. Note that this is truly the - * mathematically correct way to select samples from a larger set (especially when calls were generated from low coverage sequencing data); - * using the hard genotypes to select (i.e. the default mode of SelectVariants) can lead to false positives when errors are confused for - * variants in the original genotyping. This functionality used to comprise the --regenotype option in SelectVariants but we pulled it out + * This tool triggers re-genotyping of the samples through the Exact Allele Frequency calculation model. Note that + * this is truly the mathematically correct way to select samples from a larger set (especially when calls were + * generated from low coverage sequencing data); using the hard genotypes to select (i.e. the default mode of + * SelectVariants) can lead to false positives when errors are confused for variants in the original genotyping. 
+ * This functionality used to comprise the --regenotype option in SelectVariants but we pulled it out * into its own tool for technical purposes. * *
Input
@@ -103,11 +104,11 @@ import java.util.Set; * A re-genotyped VCF. * * - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T RegenotypeVariants \ + * -R reference.fasta \ * --variant input.vcf \ * -o output.vcf *diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java index 05b770005..328960390 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java @@ -43,7 +43,7 @@ import java.util.*; /** * All command line parameters accepted by all tools in the GATK. * - *Info for general users
+ *Info for end users
* *This is a list of options and parameters that are generally available to all tools in the GATK.
* @@ -51,7 +51,7 @@ import java.util.*; * argument is only meant to be used with a subset of tools, and the -pedigree argument will only be effectively used * by a subset of tools as well. Some arguments conflict with others, and some conversely are dependent on others. This * is all indicated in the detailed argument descriptions, so be sure to read those in their entirety rather than just - * skimming the one-line summaey in the table. + * skimming the one-line summary in the table. * *Info for developers
* diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java index ddcf373e1..4fec3e240 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java @@ -63,10 +63,11 @@ public class GATKArgumentCollection { @Input(fullName = "input_file", shortName = "I", doc = "Input file containing sequence data (SAM or BAM)", required = false) public ListsamFiles = new ArrayList<>(); - @Hidden + @Advanced @Argument(fullName = "showFullBamList",doc="Emit a log entry (level INFO) containing the full list of sequence data files to be included in the analysis (including files inside .bam.list files).") public Boolean showFullBamList = false; + @Advanced @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false, minValue = 0) public Integer readBufferSize = null; @@ -79,11 +80,11 @@ public class GATKArgumentCollection { /** * By default, GATK generates a run report that is uploaded to a cloud-based service. This report contains basic * statistics about the run (which tool was used, whether the run was successful etc.) that help us for debugging - * and development. Up to version 3.2-2 the run report contains a record of the username and hostname associated + * and development. Up to version 3.3-0 the run report contains a record of the username and hostname associated * with the run, but it does **NOT** contain any information that could be used to identify patient data. 
* Nevertheless, if your data is subject to stringent confidentiality clauses (no outside communication) or if your * run environment is not connected to the internet, you can disable the reporting system by seeting this option to - * "NO_ET". You will also need to request a key using the online request form on our website (se FAQs). + * "NO_ET". You will also need to request a key using the online request form on our website (see FAQs). */ @Argument(fullName = "phone_home", shortName = "et", doc="Run reporting mode", required = false) public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.AWS; @@ -95,9 +96,10 @@ public class GATKArgumentCollection { public File gatkKeyFile = null; /** - * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary tag that can be - * used to group together runs during later analysis. One use of this capability is to tag runs as GATK - * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. + * The GATKRunReport supports tagging GATK runs with an arbitrary tag that can be + * used to group together runs during later analysis (as of GATK 2.2) . One use of this capability is to tag + * runs as GATK performance tests, so that the performance of the GATK over time can be assessed from the logs + * directly. * * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find * meaningful. @@ -164,9 +166,9 @@ public class GATKArgumentCollection { // // -------------------------------------------------------------------------------------------------------------- /** - * There are several ways to downsample reads, i.e. to removed reads from the pile of reads that will be used for analysis. - * See the documentation of the individual downsampling options for details on how they work. 
Note that Many GATK tools - * specify a default downsampling type and target, but this behavior can be overridden from command line using the + * There are several ways to downsample reads, i.e. to remove reads from the pile of reads that will be used for analysis. + * See the documentation of the individual downsampling options for details on how they work. Note that many GATK tools + * specify a default downsampling type and target, but this behavior can be overridden from the command line using the * downsampling arguments. */ @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of read downsampling to employ at a given locus", required = false) @@ -233,11 +235,13 @@ public class GATKArgumentCollection { // BAQ arguments // // -------------------------------------------------------------------------------------------------------------- + @Advanced @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF; /** * Phred-scaled gap open penalty for BAQ calculation. Although the default value is 40, a value of 30 may be better for whole genome call sets. */ + @Advanced @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0) public double BAQGOP = BAQ.DEFAULT_GOP; @@ -328,7 +332,7 @@ public class GATKArgumentCollection { * Any value greater than zero will be used to recalculate the quantization using that many levels. * Negative values mean that we should quantize using the recalibration report's quantization level. 
*/ - @Hidden + @Advanced @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) public int quantizationLevels = 0; @@ -352,11 +356,13 @@ public class GATKArgumentCollection { * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, * your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream. */ + @Advanced @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Don't recalibrate bases with quality scores less than this threshold (with -BQSR)", required = false, minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE) public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE; /** * If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score. */ + @Advanced @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "Global Qscore Bayesian prior to use for BQSR", required = false) public double globalQScorePrior = -1.0; @@ -398,16 +404,16 @@ public class GATKArgumentCollection { /** * For expert users only who know what they are doing. We do not support usage of this argument, so we may refuse to help you if you use it and something goes wrong. The one exception to this rule is ALLOW_N_CIGAR_READS, which is necessary for RNAseq analysis. */ + @Advanced @Argument(fullName = "unsafe", shortName = "U", doc = "Enable unsafe operations: nothing will be checked at runtime", required = false) public ValidationExclusion.TYPE unsafe; /** - * UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking + * Not recommended for general use. Disables both auto-generation of index files and index file locking * when reading VCFs and other rods and an index isn't present or is out-of-date. 
The file locking necessary for auto index * generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it * for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general * because it allows reading from index files without first acquiring a lock. */ - @Hidden @Advanced @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", doc = "Disable both auto-generation of index files and index file locking", @@ -451,6 +457,7 @@ public class GATKArgumentCollection { required = false) public boolean simplifyBAM = false; + @Advanced @Argument(fullName = "disable_bam_indexing", doc = "Turn off on-the-fly creation of indices for output BAM files.", required = false) public boolean disableBAMIndexing = false; diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java index 05c02c722..0b7d1a905 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java @@ -33,16 +33,29 @@ import htsjdk.samtools.SAMRecord; import java.util.Iterator; /** - * Filter out reads with wonky cigar strings. + * Filter out reads with wonky CIGAR strings * - * - No reads with a different length and cigar length - * - No reads with Hard/Soft clips in the middle of the cigar - * - No reads starting with deletions (with or without preceding clips) - * - No reads ending in deletions (with or without follow-up clips) - * - No reads that are fully hard or soft clipped - * - No reads that have consecutive indels in the cigar (II, DD, ID or DI) + * This read filter will filter out the following cases:
+ *+ *
* - * ps: apparently an empty cigar is okay... + *- different length and cigar length
+ *- Hard/Soft clips in the middle of the cigar
+ *- starting with deletions (with or without preceding clips)
+ *- ending in deletions (with or without follow-up clips)
+ *- fully hard or soft clipped
+ *- consecutive indels in the cigar (II, DD, ID or DI)
+ *Usage example
+ * + *Enable the bad cigar filter
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf BadCigar + ** * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java index c25d8d9ca..562e50ea9 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java @@ -28,13 +28,40 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out reads whose mate maps to a different contig. + * Filter out reads whose mate maps to a different contig + * + *This filter is intended to ensure that only reads that are likely to be mapped in the right place, and therefore + * to be informative, will be used in analysis. If mates in a pair are mapping to different contigs, it is likely that + * at least one of them is in the wrong place. One exception is if you are using a draft genome assembly in which the + * chromosomes are fragmented into many contigs; then you may legitimately have reads that are correctly mapped but are + * on different contigs than their mate. This read filter can be disabled from the command line using the -drf argument. + *
+ * + *Enable the bad mate filter
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf BadMate + *+ * + *Disable the bad mate filter
+ *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -drf BadMate + ** * @author ebanks * @version 0.1 */ -public class BadMateFilter extends ReadFilter { +public class BadMateFilter extends DisableableReadFilter { public boolean filterOut(final SAMRecord rec) { return hasBadMate(rec); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java index aa45b250a..310f1dee3 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java @@ -53,7 +53,32 @@ import htsjdk.samtools.SAMRecord; */ /** - * Filter out duplicate reads. + * Filter out duplicate reads + * + *This filter recognizes the SAM flag set by MarkDuplicates. It can be disabled from the command line if needed + * using the -drf argument.
+ * + *Usage examples
+ * + *Enable the duplicate read filter
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf DuplicateRead + *+ * + *Disable the duplicate read filter
+ *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -drf DuplicateRead + ** * @author rpoplin * @since Dec 9, 2009 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java index 2cc5e2a8b..fc5cdcb53 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java @@ -28,7 +28,9 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out reads that fail the vendor quality check. + * Filter out reads that fail the vendor quality check + * + *This filter recognizes the SAM flag corresponding to the vendor quality check.
* * @author rpoplin * @since Jul 19, 2010 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java index 8b0f07624..d6e78a616 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java @@ -33,6 +33,22 @@ import org.broadinstitute.gatk.engine.filters.ReadFilter; /** * Only use reads from the specified library * + *This filter is useful for running on only a subset of the data as identified by a read group property. + * In the case of the library filter, the goal is usually to run quality control checks on a particular library.
+ * + *Usage example
+ * + *Enable the library read filter
+ *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf LibraryRead \ + * -library library_name + *+ * * @author kcibul * @since Aug 15, 2012 * diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java index 6488a857a..05c6f564e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java @@ -34,7 +34,26 @@ import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; import org.broadinstitute.gatk.utils.exceptions.UserException; /** - * Filter out malformed reads. + * Filter out malformed reads + * + *This filter is applied automatically by all GATK tools in order to protect them from crashing on reads that are + * grossly malformed. There are a few issues (such as the absence of sequence bases) that will cause the run to fail with an + * error, but these cases can be preempted by setting flags that cause the problem reads to also be filtered.
+ * + *Usage example
+ * + *Set the malformed read filter to filter out reads that have no sequence bases
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -filterNoBases + *+ * + *Note that the MalformedRead filter itself does not need to be specified in the command line because it is set + * automatically.
* * @author mhanna * @version 0.1 @@ -46,14 +65,14 @@ public class MalformedReadFilter extends ReadFilter { private SAMFileHeader header; - @Argument(fullName = FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME, shortName = "filterRNC", doc = "filter out reads with CIGAR containing the N operator, instead of stop processing and report an error.", required = false) + @Argument(fullName = FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME, shortName = "filterRNC", doc = "Filter out reads with CIGAR containing the N operator, instead of failing with an error", required = false) boolean filterReadsWithNCigar = false; - @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false) + @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error", required = false) boolean filterMismatchingBaseAndQuals = false; - @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "if a read has no stored bases (i.e. a '*'), filter out the read instead of blowing up.", required = false) + @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "Filter out reads with no stored bases (i.e. 
'*' where the sequence should be), instead of failing with an error", required = false) boolean filterBasesNotStored = false; /** diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java index 67c62b975..58ec76660 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java @@ -29,7 +29,23 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Filter out reads with low mapping qualities. + * Filter out reads with low mapping qualities + * + *This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.
+ * + *Usage example
+ * + *Set the mapping quality filter to filter out reads that have MAPQ < 15
+ *+ * java -jar GenomeAnalysisTk.jar \ + * -T HaplotypeCaller \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.vcf \ + * -rf MappingQuality \ + * -mmq 15 + ** * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java index 05df7fb0d..ff1542e41 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java @@ -29,7 +29,22 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.QualityUtils; /** - * Filter out mapping quality zero reads. + * Filter out reads with no mapping quality information + * + * + *This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.
+ * + *Usage example
+ * + *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf MappingQualityUnavailable + ** * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java index f3f703278..b0d40c074 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java @@ -28,7 +28,21 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out mapping quality zero reads. + * Filter out reads with mapping quality zero + * + *This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.
+ * + *Usage example
+ * + *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf MappingQualityZero + ** * @author hanna * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java index c6a79e1a3..20dda5427 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java @@ -28,7 +28,30 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out reads that are not paired, have their mate unmapped, are duplicates, fail vendor quality check or both mate and read are in the same strand. + * Filter out reads with bad pairing (and related) properties + * + *This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis. + * The following cases will be filtered out: + *
+ *+ *
+ * + *- is not paired
+ *- mate is unmapped
+ *- is duplicate
+ *- fails vendor quality check
+ *- both mate and read are in the same strand orientation
+ *Usage example
+ * + *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf MateSameStrand + ** * @author chartl * @since 5/18/11 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java index cca05ebc7..c7b512f2b 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java @@ -29,13 +29,28 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Filter out reads that exceed a given max insert size + * Filter out reads that exceed a given insert size + * + *This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.
+ * + *Usage example
+ * + *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf MaxInsertSize \ + * -maxInsert 10000 + ** * @author chartl * @since 5/2/11 */ public class MaxInsertSizeFilter extends ReadFilter { - @Argument(fullName = "maxInsertSize", shortName = "maxInsert", doc="Discard reads with insert size greater than the specified value, defaults to 1000000", required=false) + @Argument(fullName = "maxInsertSize", shortName = "maxInsert", doc="Insert size cutoff", required=false) private int maxInsertSize = 1000000; public boolean filterOut(SAMRecord record) { diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java index 21b291bb3..0a7a2cdbf 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java @@ -28,7 +28,21 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out reads without read groups. + * Filter out reads without read group information + * + *Many GATK tools are dependent on having read group information in order to operate correctly. This filter excludes + * any reads that have not been appropriately identified.
+ * + *Usage example
+ * + *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf MissingReadGroup + ** * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java index 8297903d8..4e8a1dc2b 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java @@ -27,33 +27,22 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ /** - * Filter out reads that don't have base an original quality quality score tag (usually added by BQSR) + * Filter out reads that do not have an original quality score (OQ) tag + * + *The OQ tag can be added during the base recalibration process in order to preserve original information.
+ * + *Usage example
+ * + *+ * java -jar GenomeAnalysisTK.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf NoOriginalQualityScores + ** * @author rpoplin * @since Nov 19, 2009 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java index 4c8f412e2..55a697d3e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java @@ -28,7 +28,22 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out reads that are secondary alignments (not one of the best alignments) + * Filter out reads that are secondary alignments + * + *This filter recognizes the SAM flag that identifies secondary alignments (i.e. not the best alignment). + * It is intended to ensure that only reads that are likely to be mapped in the right place, and therefore to be + * informative, will be used in analysis.
+ * + *Usage example
+ * + *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf NotPrimaryAlignment + ** * @author rpoplin * @since Dec 9, 2009 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java index 79f16a5fc..f1b375835 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java @@ -30,7 +30,21 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; /** - * Filter out 454 reads. + * Filter out reads produced by 454 technology + * + *Reads produced by 454 technology should not be processed by the GATK's indel realignment tools. This filter is + * applied by those tools to enforce that rule.
+ * + *Usage example
+ * + *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf Platform454 + ** * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java index 8236cc219..7ca07d35d 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java @@ -31,7 +31,23 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; /** - * Filter out PL matching reads. + * Filter out reads that were generated by a specific sequencing platform + * + *This filter is useful for running on only a subset of the data as identified by a read group property. + * In the case of the platform filter, the goal is usually to blacklist certain sequencing technologies at certain processing steps + * if we know there is an incompatibility problem (like 454 and indel realignment, which is special-cased).
+ * + *Usage example
+ * + *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf Platform \ + * -PLFilterName platform_name + ** * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java index 4a6781ff5..b0e0bbebb 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java @@ -1,28 +1,28 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMReadGroupRecord; @@ -33,7 +33,11 @@ import java.util.HashSet; import java.util.Set; /** - * Filter out reads that have blacklisted platform unit tags. (See code documentation for how to create the blacklist). + * Filter out reads with blacklisted platform unit tags + * + *This filter is useful for running on only a subset of the data as identified by a read group property. + * In the case of the platform unit filter, the goal is usually to blacklist certain runs if we know there was a problem with + * a particular sequencing machine.
* * @author asivache * @since Sep 21, 2009 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java index 7c6bfb0e3..9f815cf72 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java @@ -36,10 +36,23 @@ import java.util.*; import java.util.Map.Entry; /** - * Removes records matching the read group tag and exact match string. - * For example, this filter value: - * PU:1000G-mpimg-080821-1_1 - * would filter out a read with the read group PU:1000G-mpimg-080821-1_1 + * Filter out reads matching a read group tag value + * + *This filter is useful for running on only a subset of the data as identified by a read group property, + * using expression matching against the read group tags.
+ * + *Usage example
+ * + *Set the read group filter to blacklist read groups that have the PU tag "1000G-mpimg-080821-1_1"
+ *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf ReadGroupBlackList \ + * -rgbl PU:1000G-mpimg-080821-1_1 + **/ public class ReadGroupBlackListFilter extends ReadFilter { private Set>> filterEntries; diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java index 1e44df806..f9a6fab57 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java @@ -29,7 +29,22 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Filters out reads whose length is >= some value or < some value. + * Filter out reads based on length + * + * This filter is useful for running on only reads that are longer (or shorter) than the given threshold sizes.
+ * + *Usage example
+ * + *+ * java -jar GenomeAnalysisTK.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf ReadLength \ + * -minRead 50 \ + * -maxRead 101 + ** * @author mhanna * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java index 23a5151de..cdee7e14b 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java @@ -29,13 +29,28 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Filter out all reads except those with this read name + * Only use reads with this read name + * + *This filter is useful for isolating a particular read, pair of reads or set of alignments for a given read + * when troubleshooting issues where the error message provided a culprit name.
+ * + *Usage example
+ * + *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf ReadName \ + * -rn read_name + ** * @author chartl * @since 9/19/11 */ public class ReadNameFilter extends ReadFilter { - @Argument(fullName = "readName", shortName = "rn", doc="Filter out all reads except those with this read name", required=true) + @Argument(fullName = "readName", shortName = "rn", doc="Read name to whitelist", required=true) private String readName; public boolean filterOut(final SAMRecord rec) { diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java index fd2876654..292803d1c 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java @@ -29,7 +29,23 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Filters out reads whose strand is negative or positive + * Filter out reads based on strand orientation + * + *This filter is useful for isolating reads from only forward or reverse strands. By default, it filters out reads + * from the negative (reverse) strand. This logic can be reversed by using the -filterPositive flag.
+ * + *Usage example
+ * + *Set the read strand filter to filter out positive (forward) strand reads
+ *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf ReadStrand \ + * -filterPositive + ** * @author chartl * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java index 0c8a93a83..89be38db7 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java @@ -29,7 +29,7 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * A read filter (transformer) that sets all reads mapping quality to a given value. + * Set the mapping quality of all reads to a given value. * ** If a BAM file contains erroneous or missing mapping qualities (MAPQ), this read transformer will set all your @@ -55,16 +55,18 @@ import org.broadinstitute.gatk.utils.commandline.Argument; * BAM file(s) *
* - * *Output
*- * BAM file(s) with all reads mapping qualities reassigned + * BAM file(s) with the mapping qualities of all reads reassigned to the specified value *
* - *Examples
+ *Usage example
** java -jar GenomeAnalysisTK.jar \ * -T PrintReads \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ * -rf ReassignMappingQuality \ * -DMQ 35 *diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java index f07f197c6..2ff1d5a4e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java @@ -29,7 +29,7 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * A read filter (transformer) that changes a given read mapping quality to a different value. + * Set the mapping quality of reads with a given value to another given value. * ** This read transformer will change a certain read mapping quality to a different value without affecting reads that @@ -57,12 +57,15 @@ import org.broadinstitute.gatk.utils.commandline.Argument; * BAM file(s) with one read mapping quality selectively reassigned as desired *
* - *Examples
+ *Usage example
*- * java -jar GenomeAnalysisTK.jar - * -T PrintReads - * -rf ReassignOneMappingQuality - * -RMQF 255 + * java -jar GenomeAnalysisTK.jar \ + * -T PrintReads \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf ReassignOneMappingQuality \ + * -RMQF 255 \ * -RMQT 60 ** diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java index 2ec0112ab..ab63e1e00 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java @@ -32,7 +32,23 @@ import org.broadinstitute.gatk.utils.commandline.Argument; import java.util.Set; /** - * Filter out all reads except those with this sample + * Only use reads belonging to a specific sample + * + *This filter is useful for isolating data from one particular sample in a multisample file.
+ * + *Usage example
+ * + *Use only reads from the sample named NA12878
+ *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf Sample \ + * -goodSM NA12878 + *+ * */ public class SampleFilter extends ReadFilter { @Argument(fullName = "sample_to_keep", shortName = "goodSM", doc="The name of the sample(s) to keep, filtering out all others", required=true) diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java index 5a9d21476..58cf9183d 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java @@ -30,7 +30,22 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Only use reads from the specified read group. + * Only use reads from the specified read group + * + *This filter is useful for isolating data from one particular read group (usually a single lane).
+ * + *Usage example
+ * + *Use only reads from the read group with ID "read_group_1"
+ *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf SingleReadGroup \ + * -goodRG read_group_1 + ** * @author rpoplin * @since Nov 27, 2009 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java index e9cc30276..d5f8d30ff 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java @@ -28,7 +28,22 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out unmapped reads. + * Filter out unmapped reads + * + * + *This filter recognizes the SAM flag corresponding to being unmapped. It is intended to ensure that only + * reads that are likely to be mapped in the right place, and therefore to be informative, will be used in analysis.
+ * + *Usage example
+ * + *+ * java -jar GenomeAnalysisTk.jar \ + * -T ToolName \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.file \ + * -rf UnmappedRead + ** * @author rpoplin * @since Dec 9, 2009 diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java index 155566aea..391b0202f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java @@ -56,24 +56,23 @@ import java.util.*; /** * - * Concatenates VCF files of non-overlapped genome intervals, all with the same set of samples + * Concatenate VCF files of non-overlapping genome intervals, all with the same set of samples * ** The main purpose of this tool is to speed up the gather function when using scatter-gather parallelization. * This tool concatenates the scattered output VCF files. It assumes that: - * - All the input VCFs (or BCFs) contain the same samples in the same order. - * - The variants in each input file are from non-overlapping (scattered) intervals. - * - * When the input files are already sorted based on the intervals start positions, use -assumeSorted. - * - * Note: Currently the tool is more efficient when working with VCFs; we will work to make it as efficient for BCFs. - * + *
+ *
* + *- All the input VCFs (or BCFs) contain the same samples in the same order.
+ *- The variants in each input file are from non-overlapping (scattered) intervals.
+ *When the input files are already sorted based on the intervals' start positions, use -assumeSorted.
* *Input
*- * One or more variant sets to combine. They should be of non-overlapping genome intervals and with the same samples (in the same order). - * If the files are ordered according to the appearance of intervals in the ref genome, then one can use the -assumeSorted flag. + * Two or more variant sets to combine. They should be of non-overlapping genome intervals and with the same + * samples (sorted in the same order). If the files are ordered according to the appearance of intervals in the ref + * genome, then one can use the -assumeSorted flag. *
* *Output
@@ -86,16 +85,19 @@ import java.util.*; * invoke it is a little different from other GATK tools (see example below), and it does not accept any of the * classic "CommandLineGATK" arguments. * - *Example
+ *Usage example
** java -cp GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants \ - * -R ref.fasta \ + * -R reference.fasta \ * -V input1.vcf \ * -V input2.vcf \ * -out output.vcf \ * -assumeSorted ** + *Caveat
+ *Currently the tool is more efficient when working with VCFs than with BCFs.
+ * * @author Ami Levy Moonshine * @since Jan 2012 */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java index 19c0d2697..1c99fa8fc 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java @@ -65,7 +65,7 @@ import java.util.Set; **
*- This annotation will only work properly for biallelic heterozygous calls.
*- This annotation cannot currently be calculated for indels.
- *- tThe reasoning underlying this annotation only applies to germline variants in DNA sequencing data. In somatic/cancer analysis, divergent ratios are expected due to tumor heterogeneity. In RNAseq analysis, divergent ratios may indicate differential allele expression.
+ *- The reasoning underlying this annotation only applies to germline variants in DNA sequencing data. In somatic/cancer analysis, divergent ratios are expected due to tumor heterogeneity. In RNAseq analysis, divergent ratios may indicate differential allele expression.
*- As stated above, this annotation is experimental and should be interpreted with caution as we cannot guarantee that it is appropriate. Basically, use it at your own risk.
*Related annotations
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java index 995279927..1d4b7a002 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java @@ -49,8 +49,7 @@ import java.util.Map; *This annotation tells you what fraction of reads have a mapping quality of less than the given threshold of 10 (including 0). Note that certain tools may impose a different minimum mapping quality threshold. For example, HaplotypeCaller excludes reads with MAPQ<20.
* *Calculation
- *$$ LowMQ = \frac{# reads with MAPQ=0 + # reads with MAPQ<10}{total # reads} $$ - *
+ * $$ LowMQ = \frac{# reads with MAPQ=0 + # reads with MAPQ<10}{total # reads} $$ * *Related annotations
*diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java index 5e632dc7c..13057381e 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java @@ -49,6 +49,9 @@ import java.util.List; * *
This annotation gives you the count of all reads that have MAPQ = 0 for each sample. The count of reads with MAPQ0 can be used for quality control; high counts typically indicate regions where it is difficult to make confident calls.
* + *Caveat
+ *This annotation is excluded by HaplotypeCaller because HC filters out all reads with MQ0 upfront, so the annotation would always return a value of 0 anyway.
+ * *Related annotations
**
- MappingQualityZero gives the count of reads with MAPQ=0 across all samples.
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java index 5ec474119..465fa285f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java @@ -48,7 +48,8 @@ import java.util.Map; * *N occurs in a sequence when the sequencer does not have enough information to determine which base it should call. The presence of many Ns at the same site lowers our confidence in any calls made there, because it suggests that there was some kind of technical difficulty that interfered with the sequencing process.
* - *Note that in GATK versions 3.2 and earlier, this annotation only counted N bases from reads generated with SOLiD technology. This functionality was generalized for all sequencing platforms in GATK version 3.3.
+ *Note
+ *In GATK versions 3.2 and earlier, this annotation only counted N bases from reads generated with SOLiD technology. This functionality was generalized for all sequencing platforms in GATK version 3.3.
* *Related annotations
*diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java index 90cd9ec47..9da3de861 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java @@ -46,9 +46,9 @@ import java.util.regex.Pattern; /** * Top effect from SnpEff functional predictions * - *
This annotation processes the output of the SnpEff functional prediction tool to select only the predicted effect with the highest biological impact. The SnpEff output must be provided on the command line by specifying "--snpEffFile filename.vcf". See http://snpeff.sourceforge.net/ for more information about the SnpEff tool
. + *This annotation processes the output of the SnpEff functional prediction tool to select only the predicted effect with the highest biological impact. The SnpEff output must be provided on the command line by specifying "--snpEffFile filename.vcf". See http://snpeff.sourceforge.net/ for more information about the SnpEff tool.
* - *Caveats
+ *Caveat
* ** diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java index f2d60bb02..c748f75ce 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java @@ -50,11 +50,13 @@ import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; import java.util.*; /** - * Annotates variant calls with context information. + * Annotate variant calls with context information * *
- This annotation currently only supports output from SnpEff version 2.0.5.
- * VariantAnnotator is a GATK tool for annotating variant calls based on their context. - * The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself. + * This tool is designed to annotate variant calls based on their context (as opposed to functional annotation). + * Various annotation modules are available; see the + * documentation + * for a complete list. * *
Input
*@@ -66,15 +68,15 @@ import java.util.*; * An annotated VCF. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ + * -R reference.fasta \ * -T VariantAnnotator \ * -I input.bam \ * -o output.vcf \ * -A Coverage \ - * --variant input.vcf \ + * -V input.vcf \ * -L input.vcf \ * --dbsnp dbsnp.vcf *diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java index 332486b1a..cc12172a1 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java @@ -47,25 +47,25 @@ import java.io.PrintStream; /** - * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome - * + * Collect statistics on callable, uncallable, poorly mapped, and other parts of the genome + * *- * A very common question about a NGS set of reads is what areas of the genome are considered callable. The system + * A very common question about a NGS set of reads is what areas of the genome are considered callable. This tool * considers the coverage at each locus and emits either a per base state or a summary interval BED file that * partitions the genomic intervals into the following callable states: *
*
* * @@ -76,22 +76,19 @@ import java.io.PrintStream; * *- REF_N
- *- the reference base was an N, which is not considered callable the GATK
+ *- The reference base was an N, which is not considered callable the GATK
*- PASS
- *- the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
+ *- The base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
*- NO_COVERAGE
- *- absolutely no reads were seen at this locus, regardless of the filtering parameters
+ *- Absolutely no reads were seen at this locus, regardless of the filtering parameters
*- LOW_COVERAGE
- *- there were less than min. depth bases at the locus, after applying filters
+ *- There were fewer than min. depth bases at the locus, after applying filters
*- EXCESSIVE_COVERAGE
- *- more than -maxDepth read at the locus, indicating some sort of mapping problem
+ *- More than -maxDepth read at the locus, indicating some sort of mapping problem
*- POOR_MAPPING_QUALITY
- *- more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
+ *- More than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
*Output
*- *
- *
+ * A file with the callable status covering each base and a table of callable status x count of all examined bases * - * - *- -o: a OutputFormatted (recommended BED) file with the callable status covering each base
- *- -summary: a table of callable status x count of all examined bases
- *Examples
+ *Usage example
** java -jar GenomeAnalysisTK.jar \ * -T CallableLoci \ - * -I my.bam \ - * -summary my.summary \ - * -o my.bed + * -R reference.fasta \ + * -I myreads.bam \ + * -summary table.txt \ + * -o callable_status.bed ** - * would produce a BED file (my.bed) that looks like: + * would produce a BED file that looks like: * ** 20 10000000 10000864 PASS @@ -107,14 +104,13 @@ import java.io.PrintStream; * 20 10012552 10012554 PASS * 20 10012555 10012557 LOW_COVERAGE * 20 10012558 10012558 PASS - * et cetera... ** as well as a summary table that looks like: * ** state nBases * REF_N 0 - * PASS 996046 + * PASS 996046 * NO_COVERAGE 121 * LOW_COVERAGE 928 * EXCESSIVE_COVERAGE 0 @@ -131,7 +127,7 @@ public class CallableLoci extends LocusWalkerThis tool can be used to evaluate how different sequence datasets compare in terms of "callability" + * based on the output of the CallableLoci tool. + * + * + * Input
+ *+ * Two files to compare, output by two runs of CallableLoci + *
+ * + *Output
+ *+ * A table showing the callability status of each interval of interest in the two comparison sets and whether they match. + *
+ * + *Usage example
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -R reference.fasta \ + * -T CompareCallableLoci \ + * -comp1 callable_loci_1.bed \ + * -comp2 callable_loci_2.bed \ + * [-L input.intervals \] + * -o comparison.table + *+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class CompareCallableLoci extends RodWalker, long[][]> { @@ -103,7 +129,7 @@ public class CompareCallableLoci extends RodWalker
bindings = tracker.getValues(rodBinding); if ( bindings.size() != 1 ) { - throw new UserException.MalformedFile(String.format("%s track isn't a properly formated CallableBases object!", rodBinding.getName())); + throw new UserException.MalformedFile(String.format("%s track isn't a properly formatted CallableBases object!", rodBinding.getName())); } BEDFeature bed = bindings.get(0); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java index 92395a4f9..098e81c94 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java @@ -62,46 +62,37 @@ import java.util.*; * This tool processes a set of bam files to determine coverage at different levels of partitioning and * aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by * sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, - * and/or percentage of bases covered to or beyond a threshold. - * Additionally, reads and bases can be filtered by mapping or base quality score. + * and/or percentage of bases covered to or beyond a threshold. Additionally, reads and bases can be filtered by + * mapping or base quality score. + * * *
Input
- *- * One or more bam files (with proper headers) to be analyzed for coverage statistics - *
- *- *(Optional) A REFSEQ Rod to aggregate coverage to the gene level - *
- * (for information about creating the REFSEQ Rod, please consult the online documentation) - *
+ *+ *
+ *- One or more bam files (with proper headers) to be analyzed for coverage statistics
+ *- (Optional) A REFSEQ file to aggregate coverage to the gene level (for information about creating the REFSEQ Rod, please consult the online documentation)
+ *Output
** Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: - *
- * - no suffix: per locus coverage - *
- * - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases - *
- * - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases - *
- * - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval - *
- * - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples - *
- * - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene - *
- * - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples - *
- * - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases - *
- * - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases *
+ *+ *
* - *- no suffix: per locus coverage
+ *- _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
+ *- _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
+ *- _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
+ *- _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
+ *- _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
+ *- _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
+ *- _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
+ *- _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases
+ *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T DepthOfCoverage \ + * -R reference.fasta \ * -o file_name_base \ * -I input_bams.list * [-geneList refSeq.sorted.txt] \ @@ -180,7 +171,7 @@ public class DepthOfCoverage extends LocusWalker* * - *Examples
+ *Usage example
*- * java - * -jar GenomeAnalysisTK.jar - * -T ReadGroupProperties - * -I example1.bam -I example2.bam etc - * -R reference.fasta - * -o example.gatkreport.txt + * java -jar GenomeAnalysisTK.jar \ + * -T ReadGroupProperties \ + * -R reference.fasta \ + * -I example1.bam \ + * -I example2.bam \ + * -o readgroup_report.grp ** * @author Mark DePristo diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java index 438b38e36..cfebdd29a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java @@ -44,10 +44,10 @@ import java.util.Map; import java.util.TreeMap; /** - * Outputs the read lengths of all the reads in a file. + * Collect read length statistics * *- * Generates a table with the read lengths categorized per sample. If the file has no sample information + * This tool generates a table with the read lengths categorized per sample. If the file has no sample information * (no read groups) it considers all reads to come from the same sample. *
* @@ -59,16 +59,15 @@ import java.util.TreeMap; * *Output
*- * A human/R readable table of tab separated values with one column per sample and one row per read. + * A human/R-readable table of tab-separated values with one column per sample and one row per read. *
* - *Examples
+ *Usage example
*- * java - * -jar GenomeAnalysisTK.jar - * -T ReadLengthDistribution - * -I example.bam - * -R reference.fasta + * java -jar GenomeAnalysisTK.jar \ + * -T ReadLengthDistribution \ + * -R reference.fasta \ + * -I example.bam \ * -o example.tbl ** diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java index c6e451cd2..8ec22e5ea 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java @@ -51,41 +51,41 @@ import java.util.Set; /** - * Generates an alternative reference sequence over the specified interval. + * Generate an alternative reference sequence over the specified interval * - *- * Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). - * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + *
Given a variant callset, this tool replaces the reference bases at variation sites with the bases supplied in the + * corresponding callset records. Additionally, it allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.
* - * The output format can be partially controlled using the provided command-line arguments. + *The output format can be partially controlled using the provided command-line arguments. * Specify intervals with the usual -L argument to output only the reference bases within your intervals. * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a - * separate fasta sequence (named numerically in order). - * - * Several important notes: - * 1) if there are multiple variants that start at a site, it chooses one of them randomly. - * 2) when there are overlapping indels (but with different start positions) only the first will be chosen. - * 3) this tool works only for SNPs and for simple indels (but not for things like complex substitutions). - * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). + * separate fasta sequence (named numerically in order).
* + *Caveats
+ *+ *
+ *- If there are multiple variants that start at a site, it chooses one of them randomly.
+ *- When there are overlapping indels (but with different start positions) only the first will be chosen.
+ *- This tool works only for SNPs and for simple indels (but not for things like complex substitutions).
+ *Input
*- * The reference, requested intervals, and any number of variant rod files. + * The reference, requested intervals, and any number of variant ROD files. *
* *Output
*- * A fasta file representing the requested intervals. + * A FASTA file representing the requested intervals. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T FastaAlternateReferenceMaker \ + * -R reference.fasta \ * -o output.fasta \ * -L input.intervals \ - * --variant input.vcf \ + * -V input.vcf \ * [--snpmask mask.vcf] ** diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java index 562f00bf4..08ab3019a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java @@ -40,13 +40,13 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import java.io.PrintStream; /** - * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. + * Create a subset of a FASTA reference sequence * - *- * The output format can be partially controlled using the provided command-line arguments. - * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + *
This tool creates a new reference in FASTA format consisting of only those positions or intervals + * provided in the input data set. The output format can be partially controlled using the provided command-line + * arguments. Specify intervals with the usual -L argument to output only the reference bases within your intervals. * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a - * separate fasta sequence (named numerically in order). + * separate fasta sequence (named numerically in order).
* *Input
*@@ -58,11 +58,11 @@ import java.io.PrintStream; * A fasta file representing the requested intervals. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T FastaReferenceMaker \ + * -R reference.fasta \ * -o output.fasta \ * -L input.intervals *diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java index e5178dd74..7215e2b7b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java @@ -52,11 +52,11 @@ import java.io.PrintStream; * Base counts are written to file if an output file name is given (with -o), otherwise output to stdout. * * - *Example
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T FastaStats \ - * -R ref.fasta \ + * -R reference.fasta \ * [-o output.txt] **/ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java index 33009b5b6..42276599f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java @@ -49,11 +49,12 @@ import java.util.*; /** - * Filters variant calls using a number of user-selectable, parameterizable criteria. + * Filter variant calls based on INFO and FORMAT annotations * *- * VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria. - * Records are hard-filtered by changing the value in the FILTER field to something other than PASS. + * This tool is designed for hard-filtering variant calls based on certain criteria. + * Records are hard-filtered by changing the value in the FILTER field to something other than PASS. Filtered records + * will be preserved in the output unless their removal is requested in the command line.
* *Input
*@@ -65,11 +66,11 @@ import java.util.*; * A filtered VCF. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T VariantFiltration \ + * -R reference.fasta \ * -o output.vcf \ * --variant input.vcf \ * --filterExpression "AB < 0.2 || MQ0 > 50" \ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCMappingQualityFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCMappingQualityFilter.java index ce6fe0633..f82985fd0 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCMappingQualityFilter.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCMappingQualityFilter.java @@ -32,7 +32,27 @@ import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.filters.ReadFilter; /** - * Filter out reads with low mapping qualities. + * Filter out reads with low mapping qualities for HaplotypeCaller + * + *This filter is applied by default for HaplotypeCaller and is designed to ensure that only reads that are likely + * to be informative will be used in the reassembly process. It performs the same basic function as the regular + * MappingQualityFilter, but it is used at specific points in the operation of HC where it is helpful + * to be able to apply a different quality threshold from the general case.
+ * + *Usage example
+ * + *Set the HC-specific mapping quality filter to filter out reads with MAPQ < 10
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T HaplotypeCaller \ + * -R reference.fasta \ + * -I input.bam \ + * -o output.vcf \ + * -mmq 10 + *+ * + *Note that the HCMappingQuality filter itself does not need to be specified in the command line because it is set + * automatically for HaplotypeCaller.
* * @author mdepristo */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDocumentationHandler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDocumentationHandler.java index bd31f09d8..b4e586fb3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDocumentationHandler.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDocumentationHandler.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.tools.walkers.help; import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang.StringEscapeUtils; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; @@ -38,10 +39,9 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import java.lang.annotation.Annotation; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public class WalkerDocumentationHandler extends GenericDocumentationHandler { private final static String CMDLINE_GATK_URL = HelpConstants.GATK_DOCS_URL + "org_broadinstitute_gatk_engine_CommandLineGATK.php"; @@ -122,7 +122,15 @@ public class WalkerDocumentationHandler extends GenericDocumentationHandler { for (Method classMethod : myClass.getMethods()) { if (classMethod.toString().contains("getDescriptions") && classMethod.toString().contains("annotator")) { try { - return classMethod.invoke(instance); + String headerLine = (classMethod.invoke(instance)).toString(); + Pattern p = Pattern.compile("(INFO=<.*?>|FORMAT=<.*?>)"); + Matcher m = p.matcher(headerLine); + ListannotLines = new ArrayList<>(); 
+ while (m.find()) { + annotLines.add(StringEscapeUtils.escapeHtml(m.group())); + System.out.println("found "+m.group()); + } + return annotLines; } catch (IllegalArgumentException e) { } catch (IllegalAccessException e) { } catch (InvocationTargetException e) { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java index fd876991f..b76b9ff9c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java @@ -118,17 +118,17 @@ import java.util.Arrays; * * * Input
- *A BAM file conatining your aligned sequence data and a pileup file generated by Samtools covering the region you + *
A BAM file containing your aligned sequence data and a pileup file generated by Samtools covering the region you * want to examine.
* *Output
*A text file listing mismatches between the input pileup and the GATK's internal pileup. If there are no mismatches, the output file is empty.
* - *Example
+ *Usage example
** java -jar GenomeAnalysisTK.jar \ * -T CheckPileup \ - * -R ref.fasta \ + * -R reference.fasta \ * -I your_data.bam \ * --pileup:SAMPileup pileup_file.txt \ * -L chr1:257-275 \ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java index 8ba387ca5..023d103e7 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java @@ -36,7 +36,7 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; /** - * Walks over the input data set, calculating the number of bases seen for diagnostic purposes. + * Count the number of bases in a set of reads * *Input
*@@ -45,13 +45,14 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; * *
Output
*- * Number of bases seen. + * Number of bases seen. If an output file name is provided, then the result will be written to that file. + * Otherwise it will be sent to standard console output. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ + * -R reference.fasta \ * -T CountBases \ * -I input.bam \ * [-L input.intervals] diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java index 096ce70dc..443196cff 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java @@ -45,7 +45,7 @@ import java.util.Collections; import java.util.List; /** - * Count contiguous regions in an interval list. + * Count contiguous regions in an interval list * *When the GATK reads in intervals from an intervals list, any intervals that overlap each other get merged into * a single interval spanning the original ones. For example, if you have the following intervals: @@ -63,7 +63,7 @@ import java.util.List; * *
Input
*- * One or more rod files containing intervals to check. + * One or more ROD files containing intervals to check. *
* *Output
@@ -73,12 +73,12 @@ import java.util.List; * * You can use the -numOverlaps argument to find out how many cases you have of a specific number of overlaps. * - *Example
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T CountIntervals \ - * -R ref.fasta \ - * -0 output.txt \ + * -R reference.fasta \ + * -o output.txt \ * -check intervals.list **/ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java index 5987199b1..5a0ec3370 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java @@ -39,10 +39,10 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import java.io.PrintStream; /** - * Walks over the input data set, calculating the total number of covered loci for diagnostic purposes. + * Count the total number of covered loci * *- * This is the simplest example of a locus walker. + * This tool counts the number of loci (positions in the reference) that are covered by sequence data. *
* *Input
@@ -56,11 +56,11 @@ import java.io.PrintStream; * Otherwise it will be sent to standard console output. * * - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T CountLoci \ - * -R ref.fasta \ + * -R reference.fasta \ * -I input.bam \ * -o output.txt \ * [-L input.intervals] diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java index 293cfd0ed..55424f67b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java @@ -41,7 +41,9 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.io.PrintStream; /** - * Walks over the input data set, calculating the number of reads seen from male samples for diagnostic purposes. + * Count the number of reads seen from male samples + * + *This tool counts the number of sequence reads seen from samples that are male according to the sample metadata.
* *Input
*@@ -50,14 +52,15 @@ import java.io.PrintStream; * *
Output
*- * Number of reads seen from male samples. + * Number of reads seen from male samples. If an output file name is provided, then the result will be written to that file. + * Otherwise it will be sent to standard console output. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T CountMales \ - * -R ref.fasta \ + * -R reference.fasta \ * -I samples.bam \ * -o output.txt *diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java index 3e9e9db39..c81f7b9ac 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java @@ -51,9 +51,9 @@ import java.io.PrintStream; import java.util.*; /** - * Prints out counts of the number of reference ordered data objects encountered. + * Count the number of ROD objects encountered * - *CountRods is a RODWalker, and so traverses the data by ROD. For example if the ROD passed to it is a VCF file, + *
CountRods is a RODWalker, and so traverses the data by ROD (reference ordered data). For example if the ROD passed to it is a VCF file, * it will count the variants in the file.
* *Note that this tool is different from CountRodsByRef which is a RefWalker, and so traverses the data by @@ -66,19 +66,19 @@ import java.util.*; * *
Input
*- * One or more rod files. + * One or more ROD files. *
* *Output
*- * Number of rods seen. + * Number of RODs seen. *
* - *Example
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T CountRODs \ - * -R ref.fasta \ + * -R reference.fasta \ * -o output.txt \ * --rod input.vcf *diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java index 8161d4387..c359bf1c3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java @@ -43,7 +43,7 @@ import java.util.Collections; import java.util.List; /** - * Prints out counts of the number of reference ordered data objects encountered along the reference. + * Count the number of ROD objects encountered along the reference * *CountRodsByRef is a RefWalker, and so traverses the data by position along the reference. It counts ROD * elements (such as, but not limited to, variants) found at each position or within specific intervals if you use @@ -58,19 +58,19 @@ import java.util.List; * *
Input
*- * One or more rod files. + * One or more ROD files. *
* *Output
*- * Number of rods seen. + * Number of RODs seen. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T CountRODsByRef \ - * -R ref.fasta \ + * -R reference.fasta \ * -o output.txt \ * --rod input.vcf *diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java index ccb714b45..d5f424b6f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java @@ -45,7 +45,9 @@ import java.util.HashMap; import java.util.Map; /** - * Walks over the input data set, counting the number of read events (from the CIGAR operator) + * Count the number of read events + * + *This tool counts the number of "events" (I, D, M etc) encountered in the CIGAR strings of the sequence reads.
* *Input
*@@ -55,12 +57,13 @@ import java.util.Map; *
Output
** Number of read events for each category, formatted as a GATKReport table. + *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T CountReadEvents \ - * -R ref.fasta \ + * -R reference.fasta \ * -I input.bam \ * -o output.grp \ * [-L input.intervals] diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java index 6503766b6..369a5878e 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java @@ -37,11 +37,12 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; /** - * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. + * Count the number of reads * *- * Can also count the number of reads matching a given criterion using read filters (see the - * --read-filter command line argument). Simplest example of a read-backed analysis. + * This is especially useful in combination with read filters (see the --read-filter command line argument) which + * allow you to count reads matching specific criteria (e.g. read group tags or quality parameters). + *
* * *Input
@@ -51,13 +52,13 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; * *Output
*- * Number of reads seen. + * Number of reads seen. This is output to the terminal/stdout. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ + * -R reference.fasta \ * -T CountReads \ * -I input.bam \ * [-L input.intervals] diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java index 10094ac6a..b569a0a6c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java @@ -44,7 +44,9 @@ import java.io.PrintStream; import java.util.List; /** - * Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips + * Count the number of reads ending in insertions, deletions or soft-clips + * + ** * - *This tool reports the number of reads where the end bases do not map to the reference sequence.
* *Input
*@@ -56,13 +58,13 @@ import java.util.List; * Number of reads ending in each category. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T CountTerminusEvent \ - * -o output.txt \ + * -R reference.fasta \ * -I input.bam \ + * -o output.txt \ * [-L input.intervals] **/ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java index 7bd51249a..57cf4d59a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java @@ -42,9 +42,9 @@ import java.text.DecimalFormat; import java.text.NumberFormat; /** - * A reimplementation of the 'samtools flagstat' subcommand in the GATK + * Collect statistics about sequence reads based on their SAM flags * - *This tool walks over all input data, accumulating statistics such as total number of reads, + *
This tool emulates the behavior of 'samtools flagstat'. It collects statistics such as total number of reads, * reads with QC failure flag set, number of duplicates, percentage mapped, etc.
* *Input
@@ -57,11 +57,11 @@ import java.text.NumberFormat; * Resulting stats are written to file if an output file name is given (with -o), otherwise output to stdout. * * - *Example
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T FlagStat \ - * -R ref.fasta \ + * -R reference.fasta \ * -I reads.bam \ * [-o output.txt] *diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java index 8b59812bb..db6199951 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java @@ -46,9 +46,10 @@ import java.util.Collections; import java.util.List; /** - * Emulates the samtools pileup command to print aligned reads + * Print read alignments in Pileup-style format * - *Prints the alignment in something similar to the Samtools pileup format (see the + *
This tool emulates the 'samtools pileup' command. It prints the alignment in a format that is very similar to + * the Samtools pileup format (see the * Pileup format documentation for more details about * the original format). There is one line per genomic position, listing the chromosome name, coordinate, reference * base, read bases, and read qualities. In addition to these default fields, additional information can be added to @@ -58,7 +59,6 @@ import java.util.List; *
* samtools pileup -f in.ref.fasta -l in.site_list input.bam *- * *Input
*@@ -70,12 +70,12 @@ import java.util.List; * Alignment of reads formatted in the Pileup style. *
* - *Example
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ * -T Pileup \ - * -R exampleFASTA.fasta \ - * -I exampleBAM.bam \ + * -R reference.fasta \ + * -I my_reads.bam \ * -L chr1:257-267 * -o output.txt *diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java index ca9a76ab8..9c71f0934 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java @@ -40,8 +40,11 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import java.io.PrintStream; /** - * Prints out all of the RODs in the input data set. Data is rendered using the toString() method - * of the given ROD. + * Print out all of the RODs in the input data set + * + *This tool reports what RODs (reference ordered data sets) are contained in a given input.
+ * + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class PrintRODs extends RodWalker{ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java index 0b325e6a4..0a0b9b6cd 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java @@ -51,14 +51,15 @@ import java.io.PrintStream; * * Output
*- * If ok, nothing, else will throw an exception at the site where there's been a problem + * If the reference is fully valid, the run will complete successfully. If not, an error message will be produced + * at the site where the program encountered a problem. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ - * -T QCRef + * java -jar GenomeAnalysisTK.jar \ + * -T QCRef \ + * -R reference.fasta ** */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java index 23ea65b9d..a2c3f796e 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java @@ -48,23 +48,21 @@ import java.io.PrintStream; import java.util.Arrays; /** - * Read clipping statistics for all reads. + * Collect read clipping statistics * - * Walks over the input reads, printing out statistics about the read length, number of clipping events, and length - * of the clipping to the output stream. - * - * Note: Ignores N's in the Cigar string. + *This tool collects statistics about the read length, number of clipping events, and length + * of the clipping in all reads in the dataset.
* *Input
- * One or more BAM files + * One or more BAM files. * *Output
- * A simple tabulated text file with read length and clipping statistics for every read (or every N reads if the "skip" - * option is used) + * A simple tabulated text file with read length and clipping statistics for every read (or every given number of reads + * if the "skip" option is used). + * + *Caveat
+ *This tool ignores "N" events in the CIGAR string.
* - * User: depristo - * Date: May 5, 2010 - * Time: 12:16:41 PM */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java index 1bbc3a2d6..cafaa82c5 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java @@ -111,7 +111,7 @@ import java.util.regex.Pattern; *Example
+ *Usage example
** java -jar GenomeAnalysisTK.jar \ * -T ClipReads \ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java index 008a14842..2f609facf 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java @@ -52,17 +52,18 @@ import java.io.File; import java.util.*; /** - * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file. + * Write out sequence read data (for filtering, merging, subsetting etc) * *- * PrintReads can dynamically merge the contents of multiple input BAM files, resulting - * in merged output sorted in coordinate order. Can also optionally filter reads based on the - * --read_filter command line argument. + * PrintReads is a generic utility tool for manipulating sequencing data in SAM/BAM format. It can dynamically + * merge the contents of multiple input BAM files, resulting in merged output sorted in coordinate order. It can + * also optionally filter reads based on various read properties such as read group tags using the `--read_filter/-rf` + * command line argument (see documentation on read filters for more information). *
* ** Note that when PrintReads is used as part of the Base Quality Score Recalibration workflow, - * it takes the --BQSR engine argument, which is listed under Inherited Arguments > CommandLineGATK below. + * it takes the `--BQSR` engine argument, which is listed under Inherited Arguments > CommandLineGATK below. *
* *Input
@@ -75,30 +76,31 @@ import java.util.*; * A single processed bam file. * * - *Examples
+ *Usage examples
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * // Prints all reads that have a mapping quality above zero + * java -jar GenomeAnalysisTK.jar \ * -T PrintReads \ - * -o output.bam \ + * -R reference.fasta \ * -I input1.bam \ * -I input2.bam \ + * -o output.bam \ * --read_filter MappingQualityZero * * // Prints the first 2000 reads in the BAM file - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T PrintReads \ - * -o output.bam \ + * -R reference.fasta \ * -I input.bam \ + * -o output.bam \ * -n 2000 * * // Downsamples BAM file to 25% - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T PrintReads \ - * -o output.bam \ + * -R reference.fasta \ * -I input.bam \ + * -o output.bam \ * -dfrac 0.25 ** @@ -142,11 +144,11 @@ public class PrintReads extends ReadWalkerimpleme /** * Erase all extra attributes in the read but keep the read group information */ - @Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false) + @Argument(fullName="simplify", shortName="s", doc="Simplify all reads", required=false) public boolean simplifyReads = false; @Hidden - @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) + @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="Don't output a program tag", required = false) public boolean NO_PG_TAG = false; List readTransformers = Collections.emptyList(); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java index 7e05a10c4..0e23fac95 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java @@ -50,47
+50,49 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** - * Utility tool to blindly strip base adaptors. Main application is for FASTQ/unaligned BAM pre-processing where libraries - * have very short inserts, and hence a substantial part of the sequencing data will have adaptor sequence present. - * - * By design, tool will only work for Illumina-like library constructs, where the typical library architecture is: - * [Adaptor 1]-[Genomic Insert]-[Adaptor 2 (index/barcode)] - *
- * It is assumed that when data is paired, one read will span the forward strand and one read will span the reverse strand. - * Hence, when specifying adaptors they should be specified as both forward and reverse-complement to make sure they're removed in all cases. + * Utility tool to blindly strip base adaptors + * + *
This tool is mainly intended to be applied to FASTQ/unaligned BAM pre-processing where libraries + * have very short inserts, and hence a substantial part of the sequencing data will have adaptor sequence present. By + * design, the tool will only work for Illumina-like library constructs, where the typical library architecture is: + * [Adaptor 1]-[Genomic Insert]-[Adaptor 2 (index/barcode)]
+ * + *We assume that when data is paired, one read will span the forward strand and one read will span the reverse strand. + * Hence, adaptors should be specified as both forward and reverse-complement to ensure they are removed in all cases. * By design, as well, "circular" constructions where a read can have an insert, then adaptor, then more genomic insert, are not supported. * When an adaptor is detected, all bases downstream from it (i.e. in the 3' direction) will be removed. * Adaptor detection is carried out by looking for overlaps between forward and reverse reads in a pair. * If a sufficiently high overlap is found, the insert size is computed and if insert size < read lengths adaptor bases are removed from reads. + *
* - * Advantages over ReadClipper: - * - No previous knowledge of adaptors or library structure is necessary + *Advantage over ReadClipper: No previous knowledge of adaptors or library structure is necessary.
* - * Advantages over 3rd party tools like SeqPrep: - * - Can do BAM streaming instead of having to convert to fastq - * - No need to merge reads - merging reads can have some advantages, but complicates downstream processing and loses information that can be used, - * e.g. in variant calling - *+ *
Advantages over 3rd party tools like SeqPrep:
+ *+ *
* - *- Can do BAM streaming instead of having to convert to fastq
+ *- No need to merge reads; merging reads can have some advantages, but complicates downstream processing and loses information that can be used, + * e.g. in variant calling
+ *Input
+ *Input
** The input read data in BAM format. Read data MUST be in query name ordering as produced, for example with Picard's FastqToBam + *
* - *Output
+ *Output
** A merged BAM file with unaligned reads *
* - *Examples
+ ** - * java -Xmx4g -jar GenomeAnalysisTK.jar \ + * java -jar GenomeAnalysisTK.jar \ + * -R reference.fasta \ * -T ReadAdaptorTrimmer \ * -I my_reads.bam \ - * -R resources/Homo_sapiens_assembly18.fasta \ * -o trimmed_Reads.bam **/ - @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.READ) public class ReadAdaptorTrimmer extends ReadWalker, SAMFileWriter> implements NanoSchedulable { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java index 500ce20d2..b015e6dc8 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java @@ -51,8 +51,29 @@ import java.util.List; import java.util.Map; /** - * Divides the input data set into separate BAM files, one for each sample in the input data set. The split - * files are named concatenating the sample name to the end of the provided outputRoot command-line argument. + * Split a BAM file by sample + * + *
This tool divides the input data set into separate BAM files, one for each sample in the input data set. The split + * files are named by concatenating the sample name to the end of the provided outputRoot command-line argument.
+ * + *Input
+ *+ * A single bam file. + *
+ * + *Output
+ *+ * A separate bam file for each sample. + *
+ * + *Usage example
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T SplitSamFile \ + * -R reference.fasta \ + * -I input.bam \ + * --outputRoot myproject_ + **/ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) @WalkerName("SplitSamFile") diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounter.java index d47170514..9812d1001 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounter.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounter.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.tools.walkers.rnaseq; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.filters.DuplicateReadFilter; import org.broadinstitute.gatk.engine.walkers.DisabledReadFilters; import org.broadinstitute.gatk.engine.walkers.Downsample; @@ -36,6 +37,8 @@ import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.downsampling.DownsampleType; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; +import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; @@ -44,33 +47,21 @@ import java.io.PrintStream; import java.util.List; /** - * Calculate allele counts for allele-specific expression analysis + * Calculate read counts per allele for allele-specific expression analysis * *- * This tool calculates allele counts at a set of given loci after 
applying filters that are tuned for enabling + * This tool calculates allele counts at a set of positions after applying filters that are tuned for enabling * allele-specific expression (ASE) analysis. The filters operate on mapping quality, base quality, depth of coverage, * overlapping paired reads and deletions overlapping the position. All thresholds and options are controlled by * command-line arguments. *
* - *Notes
- *- *
- *- Like most GATK tools, this tools filters out duplicate reads by default. However, some ASE methods - * recommend including duplicate reads in the analysis, so the DuplicateReads filter can be disabled using the - * `-drf DuplicateReads` flag in the command-line.
- *Caveats
- *- *
*- This tool will only process biallelic sites. If your callset contains multiallelic sites, they will be ignored. - * Optionally, you can subset your callset to just biallelic variants using e.g. - * SelectVariants - * with the option `-restrictAllelesTo BIALLELIC`.
- *Input
**
+ * *- BAM files (with proper headers) to be analyzed for ASE
*- A VCF file with specific sites to process.
- * + *Output
** A table of allele counts at the given sites. By default, it is formatted as a tab-delimited text file @@ -78,12 +69,12 @@ import java.util.List; * a downstream tool developed for allele-specific expression analysis. *
* - *Examples
+ *Usage example
** java -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * -R reference.fasta \ * -T ASEReadCounter \ - * -o file_name \ + * -o file_name.csv \ * -I input.bam \ * -sites sites.vcf \ * -U ALLOW_N_CIGAR_READS \ @@ -92,7 +83,23 @@ import java.util.List; * [--minBaseQuality 2] \ * [-drf DuplicateRead] *+ * + *Note
+ *+ *
+ *- Like most GATK tools, this tool filters out duplicate reads by default. However, some ASE methods + * recommend including duplicate reads in the analysis, so the DuplicateRead filter can be disabled using the + * "-drf DuplicateRead" flag in the command-line.
+ *Caveat
+ *+ *
+ * */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 10000) //@DisabledReadFilters({DuplicateReadFilter.class}) //currently can be disabled using the command line argument -drf DuplicateRead public class ASEReadCounter extends LocusWalker- This tool will only process biallelic sites. If your callset contains multiallelic sites, they will be ignored. + * Optionally, you can subset your callset to just biallelic variants using e.g. + * SelectVariants + * with the option "-restrictAllelesTo BIALLELIC".
+ *{ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java index 38027af47..dea80d112 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java @@ -76,6 +76,7 @@ import java.util.*; * degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in * evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation * and stratification modules. + * * * Input
*@@ -86,8 +87,9 @@ import java.util.*; *
* Evaluation tables detailing the results of the eval modules which were applied. * For example: + *
*- * output.eval.gatkreport: + * output.eval.grp: * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample * CountVariants CompRod CpG EvalRod JexlExpression Novelty nProcessedLoci nCalledLoci nRefLoci nVariantLoci variantRate ... * CountVariants dbsnp CpG eval none all 65900028 135770 0 135770 0.00206024 ... @@ -103,12 +105,12 @@ import java.util.*; ** * - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T VariantEval \ - * -o output.eval.gatkreport \ + * -R reference.fasta \ + * -o output.eval.grp \ * --eval:set1 set1.vcf \ * --eval:set2 set2.vcf \ * [--comp comp.vcf] @@ -116,9 +118,11 @@ import java.util.*; * *Caveat
* - *Some stratifications and evaluators are incompatible with each other due to their respective memory requirements, such as AlleleCount and VariantSummary, or Sample and VariantSummary. - * If you specify such a combination, the program will output an error message and ask you to disable one of these options. - * We do not currently provide an exhaustive list of incompatible combinations, so we recommend trying out combinations that you are interested in on a dummy command line, to rapidly ascertain whether it will work or not.
+ *Some stratifications and evaluators are incompatible with each other due to their respective memory requirements, + * such as AlleleCount and VariantSummary, or Sample and VariantSummary. If you specify such a combination, the program + * will output an error message and ask you to disable one of these options. We do not currently provide an exhaustive + * list of incompatible combinations, so we recommend trying out combinations that you are interested in on a dummy + * command line, to rapidly ascertain whether it will work or not.
* */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java index 51f0c40bc..b538225ef 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java @@ -89,23 +89,24 @@ import java.util.*; * A combined VCF. * * - *Examples
+ *Usage examples
* *Merge two separate callsets
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T CombineVariants \ + * -R reference.fasta \ * --variant input1.vcf \ * --variant input2.vcf \ * -o output.vcf \ * -genotypeMergeOptions UNIQUIFY *+ * *Get the union of calls made on the same samples
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T CombineVariants \ + * -R reference.fasta \ * --variant:foo input1.vcf \ * --variant:bar input2.vcf \ * -o output.vcf \ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java index 6e8a9e7e9..d04e14ceb 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java @@ -47,13 +47,36 @@ import htsjdk.variant.variantcontext.VariantContext; import java.util.*; /** - * Filters a lifted-over VCF file for ref bases that have been changed. + * Filters a lifted-over VCF file for reference bases that have been changed * - * "Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process adjusts the position of the call to match the corresponding position on the target reference. - * For example, if you have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on the b37 reference, you need to liftover one of the callsets to the other reference. + *- + * + *"Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process + * adjusts the position of the call to match the corresponding position on the target reference. For example, if you + * have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on + * the b37 reference, you need to liftover one of the callsets to the other reference.
* - * FilteredLiftedVariants is intended to be the second of two processing steps for the liftover process. The first step is to run LiftoverVariants on your VCF file. - * The second step is to run FilterLiftedVariants on the output of LiftoverVariants. This will produce valid well-behaved VCF files, where you'll see that the contig names in the header have all been correctly replaced. + *This tool is intended to be the second of two processing steps for the liftover process. The first step is to + * run LiftoverVariants on your VCF file. The second step is to run FilterLiftedVariants on the output of + * LiftoverVariants. This will produce valid well-behaved VCF files, where you'll see that the contig names in the + * header have all been correctly replaced.
+ * + *Input
+ *+ * A lifted-over variant call set to filter. + *
+ * + *Output
+ *+ * The filtered call set. + *
+ * + *Usage example
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T FilterLiftedVariants \ + * -R reference.fasta \ + * -V liftedover_input.vcf \ + * -o filtered_output.vcf + ** */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java index a9e578058..5597a4c67 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java @@ -44,10 +44,10 @@ import java.io.PrintStream; import java.util.*; /** - * Genotype concordance (per-sample and aggregate counts and frequencies, NRD/NRS and site allele overlaps) between two callsets + * Genotype concordance between two callsets * *- * GenotypeConcordance takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, + * This tool takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, * and for each sample, the genotype-by-genotype counts (e.g. the number of sites at which a sample was * called homozygous-reference in the EVAL callset, but homozygous-variant in the COMP callset). It outputs these * counts as well as convenient proportions (such as the proportion of het calls in the EVAL which were called REF in @@ -192,7 +192,17 @@ import java.util.*; * NA12891 NO_CALL_HOM_VAR 0.000 * (...) *
Usage example
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T GenotypeConcordance \ + * -R reference.fasta \ + * -eval test_set.vcf \ + * -comp truth_set.vcf \ + * -o output.grp + *+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class GenotypeConcordance extends RodWalker>,ConcordanceMetrics> { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java index b80c1c4d9..099293cc2 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -57,18 +57,18 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; import java.util.*; /** - * Left-aligns indels from a variants file. + * Left-align indels in a variant callset * *
* LeftAlignAndTrimVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. * Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged. - * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation. + * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation.
* *Input
*- * A variant set to left-align and trim. + * A variant call set to left-align and trim. *
* *Output
@@ -76,11 +76,11 @@ import java.util.*; * A left-aligned VCF. * * - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T LeftAlignAndTrimVariants \ + * -R reference.fasta \ * --variant input.vcf \ * -o output.vcf *diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java index f66daf254..15981d19a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java @@ -58,16 +58,43 @@ import java.io.File; import java.util.*; /** - * Lifts a VCF file over from one build to another. + * Lifts a VCF file over from one build to another * - * "Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process adjusts the position of the call to match the corresponding position on the target reference. - * For example, if you have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on the b37 reference, you need to liftover one of the callsets to the other reference. + *"Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process + * adjusts the position of the call to match the corresponding position on the target reference. For example, if you + * have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on + * the b37 reference, you need to liftover one of the callsets to the other reference.
* - * LiftoverVariants is intended to be the first of two processing steps for the liftover process. - * The second step is to run FilterLiftedVariants on the output of LiftoverVariants. This will produce valid well-behaved VCF files, where you'll see that the contig names in the header have all been correctly replaced. + *LiftoverVariants is intended to be the first of two processing steps for the liftover process. + * The second step is to run FilterLiftedVariants on the output of LiftoverVariants. This will produce valid + * well-behaved VCF files, where you'll see that the contig names in the header have all been correctly replaced.
+ * + *Caveat
+ *To be clear, the VCF resulting from the LiftoverVariants run is not guaranteed to be valid according to the official specification. The file could + * possibly be mis-sorted and the header may not be complete. That is why you need to run FilterLiftedVariants on it.
+ * + *Input
+ *+ * A variant call set to lift over, the sequence dictionary of the new reference build and the appropriate liftover + * chain file. + *
+ * + *Output
+ *+ * The lifted-over call set. + *
+ * + *Usage example
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T LiftoverVariants \ + * -R reference_hg19.fasta \ + * -V input_hg19.vcf \ + * -chain liftover_hg19_to_b37.txt \ + * -dict reference_b37.dict \ + * -o liftedover_output_b37.vcf + ** - * To be clear, the VCF resulting from the LiftoverVariants run is not guaranteed to be valid according to the official specification. The file could - * possibly be mis-sorted and the header may not be complete. That is why you need to run FilterLiftedVariants on it. */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class LiftoverVariants extends RodWalker{ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java index b14d2f5b3..7b08bef53 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java @@ -50,7 +50,33 @@ import java.io.File; import java.util.*; /** - * Takes a VCF file, randomly splits variants into two different sets, and outputs 2 new VCFs with the results. + * Randomly split variants into different sets + * + * This tool takes a VCF file, randomly splits variants into different sets, and writes the + * results to separate files. By default the tool splits the input into two new sets, but it can be made to output + * more than two separate call sets.
+ * + *Input
+ *+ * A variant call set to split. + *
+ * + *Output
+ *+ * The new callsets. + *
+ * + *Usage example
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T RandomlySplitVariants \ + * -R reference.fasta \ + * -V input.vcf \ + * -o1 output_1.vcf \ + * -o2 output_2.vcf + *+ * + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class RandomlySplitVariants extends RodWalker{ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java index 75f297c10..a55c2215e 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java @@ -51,8 +51,8 @@ import java.io.File; import java.util.*; /** - * Selects headers from a VCF source. - * + * Selects headers from a VCF source + * * * Often, a VCF containing many headers will need to be subset in order to facilitate certain formatting guidelines. * SelectHeaders can be used for this purpose. Given a single VCF file, one or more headers can be extracted from the @@ -65,44 +65,49 @@ import java.util.*; *
*Output
*- * A header selected VCF. + * A VCF with the selected headers. *
- * - *Examples
+ * + *Usage examples
+ *Select only the FILTER, FORMAT, and INFO headers
*- * Select only the FILTER, FORMAT, and INFO headers: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T SelectHeaders \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -hn FILTER \ * -hn FORMAT \ * -hn INFO + ** - * Select only the FILTER, FORMAT, and INFO headers and add in the reference file names: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Select only the FILTER, FORMAT, and INFO headers and add in the reference file names
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectHeaders \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -hn FILTER \ * -hn FORMAT \ * -hn INFO \ * -irn \ * -iln + ** - * Select only the FILTER, FORMAT, and INFO headers, plus any headers with SnpEff: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Select only the FILTER, FORMAT, and INFO headers, plus any headers with "SnpEff"
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectHeaders \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -hn FILTER \ * -hn FORMAT \ * -hn INFO \ * -he '.*SnpEff.*' *+ * */ @SuppressWarnings("unused") @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index 7658f042c..9b9738164 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -86,121 +86,141 @@ import java.util.*; * *Output
*- * The name of the VCF file to which to write the selected subset of variants. + * A new VCF file containing the selected subset of variants. *
* - *Examples
+ *Usage examples
+ *Select two samples out of a VCF with many samples
*- * Select two samples out of a VCF with many samples: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -sn SAMPLE_A_PARC \ * -sn SAMPLE_B_ACTG + ** - * Select two samples and any sample that matches a regular expression: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Select two samples and any sample that matches a regular expression
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -sn SAMPLE_1_PARC \ * -sn SAMPLE_1_ACTG \ * -se 'SAMPLE.+PARC' + ** - * Select any sample that matches a regular expression and sites where the QD annotation is more than 10: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Select any sample that matches a regular expression and sites where the QD annotation is more than 10
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -se 'SAMPLE.+PARC' \ * -select "QD > 10.0" + ** - * Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default): - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default)
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -sn SAMPLE_1_ACTG \ * -env \ * -ef + ** - * Select a sample, subset remaining alleles, but don't trim: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Select a sample, subset remaining alleles, but don't trim
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -sn SAMPLE_1_ACTG \ * -env \ * -noTrim + ** - * Select a sample and restrict the output vcf to a set of intervals: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Select a sample and restrict the output vcf to a set of intervals
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -L /path/to/my.interval_list \ * -sn SAMPLE_1_ACTG + ** - * Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called in my dataset): - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called in my dataset)
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant hapmap.vcf \ + * -R reference.fasta \ + * -V hapmap.vcf \ * --discordance myCalls.vcf \ * -o output.vcf \ * -sn mySample + ** - * Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers): - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers)
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant myCalls.vcf \ - * --concordance hisCalls.vcf \ + * -R reference.fasta \ + * -V myCalls.vcf \ + * --concordance theirCalls.vcf \ * -o output.vcf \ * -sn mySample + ** - * Generating a VCF of all the variants that are mendelian violations. The optional argument `-mvq` restricts the selection to sites that have a QUAL score of 50 or more: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Generating a VCF of all the variants that are mendelian violations. The optional argument `-mvq` restricts the selection to sites that have a QUAL score of 50 or more
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -ped family.ped \ - * -mv \ - * -mvq 50 \ + * -mv -mvq 50 \ * -o violations.vcf + ** - * Creating a set with 50% of the total number of variants in the variant VCF: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Create a set with 50% of the total number of variants in the variant VCF
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -fraction 0.5 + ** - * Select only indels from a VCF: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Select only indels from a VCF
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -selectType INDEL + ** - * Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column): - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column)
+ *+ * java -jar GenomeAnalysisTK.jar \ * -T SelectVariants \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf \ * -selectType SNP -selectType MNP \ * -restrictAllelesTo MULTIALLELIC - * ** */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java index 6142bc08d..82a201091 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java @@ -49,10 +49,10 @@ import java.util.*; /** - * Validates a VCF file with an extra strict set of criteria. + * Validate a VCF file with an extra strict set of criteria * *- * ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it. + * This tool is designed to validate much of the information inside a VCF file. * In addition to standard adherence to the VCF specification, this tool performs extra strict validations to ensure * the information contained within the file is correct. These include: *
@@ -80,37 +80,33 @@ import java.util.*; * A variant set to validate using
* - *-Vor--variantas shown below. *Examples
- * - *To perform VCF format and all strict validations:
+ *Usage examples
* + *To perform VCF format tests and all strict validations
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T ValidateVariants \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * --dbsnp dbsnp.vcf ** - *To perform only VCF format tests:
- * + *To perform only VCF format tests
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T ValidateVariants \ - * --validationTypeToExclude ALL \ - * --variant input.vcf + * -R reference.fasta \ + * -V input.vcf \ + * --validationTypeToExclude ALL ** - *To perform all validations except the strict ALLELE validation:
- * + *To perform all validations except the strict ALLELE validation
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T ValidateVariants \ + * -R reference.fasta \ + * -V input.vcf \ * --validationTypeToExclude ALLELES - * --variant input.vcf \ - * --dbsnp dbsnp.vcf ** */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java index c52c408a2..307b78289 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java @@ -49,13 +49,13 @@ import htsjdk.variant.variantcontext.VariantContextBuilder; import java.util.*; /** - * Annotates a validation (from Sequenom for example) VCF with QC metrics (HW-equilibrium, % failed probes) + * Annotate a validation VCF with QC metrics * *- * The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes). + * This tool is intended for vetting/assessing validation data (containing genotypes). * The tool produces a VCF that is annotated with information pertaining to plate quality control and by * default is soft-filtered by high no-call rate or low Hardy-Weinberg probability. - * If you have .ped files, please first convert them to VCF format. + * If you have .ped files, please first convert them to VCF format.
* *Input
*@@ -65,6 +65,7 @@ import java.util.*; *
Output
** An annotated VCF. Additionally, a table like the following will be output: + *
** Total number of samples assayed: 185 * Total number of records processed: 152 @@ -74,14 +75,13 @@ import java.util.*; * Number of records passing all filters: 106 (69%) * Number of passing records that are polymorphic: 98 (92%) *- * * - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T VariantValidationAssessor \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf ** diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java index b9954221f..0873d5b94 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java @@ -48,35 +48,37 @@ import htsjdk.variant.vcf.VCFHeaderLine; import java.util.*; /** - * Takes alleles from a variants file and breaks them up (if possible) into more basic/primitive alleles. + * Simplify multi-nucleotide variants (MNPs) into more basic/primitive alleles. * - *- * For now this tool modifies only multi-nucleotide polymorphisms (MNPs) and leaves SNPs, indels, and complex substitutions as is, - * although one day it may be extended to handle the complex substitution case. + *
This tool will take an MNP (e.g. ACCCA -> TCCCG) and break it up into separate records for each component + * part (A->T and A->G).
* - * This tool will take an MNP (e.g. ACCCA -> TCCCG) and break it up into separate records for each component part (A-T and A->G). - * - * Note that this tool modifies only bi-allelic variants. - * - *Input
+ *Input
** A variant set with any type of alleles. *
* - *Output
+ *Output
** A VCF with alleles broken into primitive types. *
* - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T VariantsToAllelicPrimitives \ - * --variant input.vcf \ + * -R reference.fasta \ + * -V input.vcf \ * -o output.vcf ** + *Caveats
+ *+ *
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class VariantsToAllelicPrimitives extends RodWalker- For now this tool modifies only multi-nucleotide polymorphisms (MNPs) and leaves SNPs, indels, and + * complex substitutions as is, although one day it may be extended to handle the complex substitution case.
+ *- This tool modifies only bi-allelic variants.
+ *{ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java index d7090235d..b57f187b9 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java @@ -50,7 +50,32 @@ import java.io.*; import java.util.*; /** - * Converts a VCF file to a binary plink Ped file (.bed/.bim/.fam) + * Convert VCF to binary pedigree file + * + * This tool takes a VCF and produces a binary pedigree as used by + * PLINK, consisting of three associated files (.bed/.bim/.fam).
+ * + *Inputs
+ *+ * A VCF file and a metadata file + *
+ * + *Outputs
+ *+ * A binary pedigree in PLINK format, composed of three files (.bed/.bim/.fam) + *
+ * + *Example
+ *+ * java -jar GenomeAnalysisTK.jar \ + * -T VariantsToBinaryPed \ + * -R reference.fasta \ + * -V variants.vcf \ + * -m metadata.fam \ + * -bed output.bed \ + * -bim output.bim \ + * -fam output.fam + **/ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=0,stop=100)) @@ -62,37 +87,35 @@ public class VariantsToBinaryPed extends RodWalker{ protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); /** - * The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This - * is what Plink describes as a fam file. An example fam file is (note that there is no header): - * - * CEUTrio NA12878 NA12891 NA12892 2 -9
- * CEUTrio NA12891 UNKN1 UNKN2 2 -9
- * CEUTrio NA12892 UNKN3 UNKN4 1 -9
- *
- * where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex) + *The metaData file can take two formats, the first of which is the first 6 columns of the standard pedigree file. This + * is what Plink describes as a .fam file. An example .fam file is as follows (note that there is no header):
+ *+ * CEUTrio NA12878 NA12891 NA12892 2 -9 + * CEUTrio NA12891 UNKN1 UNKN2 2 -9 + * CEUTrio NA12892 UNKN3 UNKN4 1 -9 + *+ *where the entries are: FamilyID IndividualID DadID MomID Sex Phenotype.
+ *An alternate format is a two-column key-value file:
+ *+ * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9 + * NA12891 fid=CEUTrio;sex=2;phenotype=-9 + * NA12892 fid=CEUTrio;sex=1;phenotype=-9 + *+ *where unknown parents do not need to be specified. The columns are the individual ID and a list of key-value pairs.
*- * An alternate format is a two-column key-value file - *
- * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9
- * NA12891 fid=CEUTrio;sex=2;phenotype=-9
- * NA12892 fid=CEUTrio;sex=1;phenotype=-9
- *
- * wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs. - *
- * Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the - * command line has "-md [name].fam", the fam file will be subset and reordered to match the sample content and ordering - * of the VCF. However, if a metadata file of the alternate format is passed by "-md [name].txt", the walker will + * Regardless of which file is specified, the tool will output a .fam file alongside the pedigree file. If the + * command line has "-m [name].fam", the fam file will be subset and reordered to match the sample content and ordering + * of the VCF. However, if a metadata file of the alternate format is passed by "-m [name].txt", the tool will * construct a formatted .fam file from the data. *
*/ - @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " + - "(in which case it will be copied to the file you provide as fam output).") + @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file") File metaDataFile; @Input(shortName="mode",fullName="outputMode",required=false,doc="The output file mode (SNP major or individual major)") OutputMode mode = OutputMode.INDIVIDUAL_MAJOR; - @Output(shortName="bed",fullName = "bed",required=true,doc="output ped file") + @Output(shortName="bed",fullName = "bed",required=true,doc="output bed file") PrintStream outBed; @Output(shortName="bim",fullName="bim",required=true,doc="output map file") @@ -208,8 +231,8 @@ public class VariantsToBinaryPed extends RodWalker{ try { validateVariantSite(vc,ref,context); } catch (TribbleException e) { - throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+ - "Please run ValidateVariants for more detailed information. This error is: "+e.getMessage()); + throw new UserException("Input VCF file is invalid. "+ + "Please run ValidateVariants for more detailed information. The error is: "+e.getMessage()); } String refOut; @@ -461,7 +484,7 @@ public class VariantsToBinaryPed extends RodWalker { for ( String line : new XReadLines(metaDataFile) ) { String[] famSplit = line.split("\\s+"); if ( famSplit.length != 6 ) { - throw new UserException("Line of the fam file is malformatted. Expected 6 entries. Line is "+line); + throw new UserException("Line of the fam file is malformed. Expected 6 entries. 
Line is "+line); } String sid = famSplit[1]; String fid = famSplit[0]; @@ -501,7 +524,7 @@ public class VariantsToBinaryPed extends RodWalker { private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) { final Allele reportedRefAllele = vc.getReference(); final int refLength = reportedRefAllele.length(); - if ( refLength > 100 ) { + if ( refLength > 100 ) { //TODO: get rid of this hardcoded limit? logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart())); return; } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java index 081403f35..bd228f323 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java @@ -48,11 +48,13 @@ import java.lang.reflect.Array; import java.util.*; /** - * Emits specific fields from a VCF file to a tab-deliminated table + * Extract specific fields from a VCF file to a tab-delimited table * * - * This walker accepts a single VCF file and writes out user-selected fields from the - * VCF as a header-containing, tab-deliminated file. The user specifies one or more + * This tool is designed to extract fields from the VCF to a table format that is more convenient to work with in + * downstream analyses.
+ * + *The user specifies one or more * fields to print with the -F NAME, each of which appears as a single column in * the output file, with a header named NAME, and the value of this field in the VCF * one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding @@ -62,9 +64,7 @@ import java.util.*; * genotypes), NO-CALL (count of no-call genotypes), TYPE (the type of event), VAR (count of * non-reference genotypes), NSAMPLES (number of samples), NCALLED (number of called samples), * GQ (from the genotype field; works only for a file with a single sample), and MULTI-ALLELIC - * (is the record from a multi-allelic site). Note that if a VCF record is missing a value, then the tool by - * default throws an error, but the special value NA can be emitted instead with - * appropriate tool arguments. + * (is the record from a multi-allelic site).
* * * @@ -81,7 +81,7 @@ import java.util.*; * A tab-delimited file containing the values of the requested fields in the VCF file * * - *Examples
+ *Usage example
** java -jar GenomeAnalysisTK.jar \ * -R reference.fasta @@ -89,15 +89,19 @@ import java.util.*; * -V file.vcf \ * -F CHROM -F POS -F ID -F QUAL -F AC \ * -o results.table - * - * would produce a file that looks like: - * + *+ *would produce a file that looks like:
+ ** CHROM POS ID QUAL AC * 1 10 . 50 1 * 1 20 rs10 99 10 * et cetera... ** + *Caveat
+ *If a VCF record is missing a value, then the tool by default throws an error, but the special value NA can + * be emitted instead if requested at the command line using --allowMissingData.
+ * * @author Mark DePristo * @since 2010 */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java index 2e5b9a7b7..f2386d088 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java @@ -58,14 +58,15 @@ import java.io.File; import java.util.*; /** - * Converts variants from other file formats to VCF format. + * Convert variants from other file formats to VCF format * *- * Note that there must be a Tribble feature/codec for the file format as well as an adaptor. + * Note that there must be a Tribble feature/codec available for the file format as well as an adaptor. + *
* *Input
*- * A variant file to filter. + * A variant file to convert. *
* *Output
@@ -73,14 +74,13 @@ import java.util.*; * A VCF file. * * - *Examples
+ *Usage example
*- * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + * java -jar GenomeAnalysisTK.jar \ * -T VariantsToVCF \ + * -R reference.fasta \ * -o output.vcf \ - * --variant:RawHapMap input.hapmap \ - * --dbsnp dbsnp.vcf + * --variant:RawHapMap input.hapmap ** */ diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjects.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjects.java index 108eb102f..dc71d58ee 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjects.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjects.java @@ -47,7 +47,7 @@ import java.util.List; * A generic engine for comparing tree-structured objects * *- * Compares two record-oriented files, itemizing specific difference between equivalent + * This tool compares two record-oriented files, itemizing specific difference between equivalent * records in the two files. Reports both itemized and summarized differences. *
* @@ -56,8 +56,8 @@ import java.util.List; ** The GATK contains a summarizing difference engine that compares hierarchical data structures to emit: *
- *
* * diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/HelpConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/HelpConstants.java index b72811c00..e84108973 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/HelpConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/HelpConstants.java @@ -28,7 +28,7 @@ package org.broadinstitute.gatk.utils.help; public class HelpConstants { public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; - public final static String GATK_DOCS_URL = BASE_GATK_URL + "/tooldocs/"; + public final static String GATK_DOCS_URL = BASE_GATK_URL + "/guide/tooldocs/"; public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; diff --git a/settings/helpTemplates/generic.index.template.html b/settings/helpTemplates/generic.index.template.html index bd5742f36..794e50dc6 100644 --- a/settings/helpTemplates/generic.index.template.html +++ b/settings/helpTemplates/generic.index.template.html @@ -24,7 +24,7 @@ diff --git a/settings/helpTemplates/generic.template.html b/settings/helpTemplates/generic.template.html index d163eff5f..0141c8673 100644 --- a/settings/helpTemplates/generic.template.html +++ b/settings/helpTemplates/generic.template.html @@ -24,7 +24,7 @@ @@ -172,7 +172,7 @@ #if> <#if annotdescript?has_content > -- A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differences from the value in field A in record 1 in file G. - *
- A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed. + *
- A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differs from the value in field A in record 1 in file G.
+ *- A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed.
*Header info
#if> <#if extradocs?size != 0> -
+Header definition line
<#list annotdescript as line>- @@ -255,10 +255,11 @@
${line}Inherited arguments
-The arguments described in the entries below can be supplied to this tool to modify - its behavior. For example, the -L argument directs the GATK engine restricts processing - to specific genomic intervals (this is an Engine capability and is therefore available to all GATK walkers).
+Engine arguments
+All tools inherit arguments from the GATK Engine's "CommandLineGATK" argument collection, which can be + used to modify various aspects of the tool's function. For example, the -L argument directs the GATK + engine to restrict processing to specific genomic intervals; or the -rf argument allows you to apply + certain read filters to exclude some of the data from the analysis.
<#list extradocs as extradoc>
- ${extradoc.name}