diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java index 1b2129f3d..dcf7ed737 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java @@ -158,24 +158,28 @@ public class RecalibrationArgumentCollection implements Cloneable { /** * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is off] */ + @Advanced @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false) public byte MISMATCHES_DEFAULT_QUALITY = -1; /** * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on] */ + @Advanced @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false) public byte INSERTIONS_DEFAULT_QUALITY = 45; /** * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. 
[default is on] */ + @Advanced @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) public byte DELETIONS_DEFAULT_QUALITY = 45; /** * Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality */ + @Advanced @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false) public byte LOW_QUAL_TAIL = 2; @@ -183,17 +187,19 @@ public class RecalibrationArgumentCollection implements Cloneable { * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options. * This parameter tells BQSR the number of levels of quantization to use to build the quantization table. 
*/ + @Advanced @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") public int QUANTIZING_LEVELS = 16; /** * The tag name for the binary tag covariate (if using it) */ + @Advanced @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") public String BINARY_TAG_NAME = null; - /* - * whether GATK report tables should have rows in sorted order, starting from leftmost column + /** + * Whether GATK report tables should have rows in sorted order, starting from leftmost column */ @Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false) public Boolean SORT_BY_ALL_COLUMNS = false; @@ -219,7 +225,7 @@ public class RecalibrationArgumentCollection implements Cloneable { public PrintStream RECAL_TABLE_UPDATE_LOG = null; /** - * The repeat covariate will use a context of this size to calculate it's covariate value for base insertions and deletions + * The repeat covariate will use a context of this size to calculate its covariate value for base insertions and deletions */ @Hidden @Argument(fullName = "max_str_unit_length", shortName = "maxstr", doc = "Max size of the k-mer context to be used for repeat covariates", required = false) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java index 5fe0509cf..f66390fc1 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java @@ -75,7 +75,7 @@ import java.util.List; /** * Allele count and frequency 
expectation per sample * - * Needs documentation + *

This annotation calculates the maximum likelihood (ML) number and frequency of alternate alleles for each individual sample at a site. In essence, it is equivalent to calculating the sum of "1"s in a genotype (for a biallelic site).

* */ @SuppressWarnings("unused") diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java index 8033b554d..7bdc365f1 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java @@ -63,7 +63,7 @@ import java.util.*; /** - * Rank Sum Test of REF vs. ALT base quality scores + * Rank Sum Test of REF versus ALT base quality scores * *

This variant-level annotation compares the base qualities of the data supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.

* diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java index a3034e658..3e70eea57 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java @@ -60,7 +60,7 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; /** - * Rank Sum Test for hard-clipped bases on REF vs. ALT reads + * Rank Sum Test for hard-clipped bases on REF versus ALT reads * *

This variant-level annotation tests whether the data supporting the reference allele shows more or less base clipping (hard clips) than those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have more hard-clipped bases than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have fewer hard-clipped bases than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.

* @@ -68,7 +68,7 @@ import java.util.*; *

The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

* *

Caveat

- *

The clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ *

The clipping rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

* */ public class ClippingRankSumTest extends RankSumTest { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java index 8a0777245..f8404800e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java @@ -70,7 +70,7 @@ import java.util.List; import java.util.Map; /** - * Total depth of coverage per sample (in FORMAT) and over all samples (in INFO). + * Total depth of coverage per sample and over all samples. * *

This annotation is used to provide counts of read depth at two different levels, with some important differences. At the sample level (FORMAT), the DP value is the count of reads that passed the caller's internal quality control metrics (such as MAPQ > 17, for example). At the site level (INFO), the DP value is the unfiltered depth over all samples.

* @@ -78,7 +78,7 @@ import java.util.Map; * *

Caveats

* * *

Related annotations

diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java index 83c3837b3..7f01f56db 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java @@ -68,8 +68,17 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; /** - * Genotype summary statistics + * Summarize genotype statistics from all samples at the site level * + *

This annotation collects several genotype-level statistics from all samples and summarizes them in the INFO field. The following statistics are collected:

+ * + *

Note

*

These summaries can all be recomputed from the genotypes on the fly but it is a lot faster to add them here as INFO field annotations.

*/ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java index 7a9a123ed..458a1b696 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java @@ -82,7 +82,11 @@ import java.util.*; *

The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is a Phred-scaled p-value derived from running the HW test for disequilibrium with PL values. See the method document on statistical tests for a more detailed explanation of this statistical test.

* *

Caveats

- *

Note that the Inbreeding Coefficient can only be calculated for cohorts containing at least 10 founder samples.

+ * * */ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java index e480bfccc..d18302e25 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java @@ -61,7 +61,7 @@ import java.util.Arrays; import java.util.List; /** - * Rank Sum Test of per-read likelihoods of REF vs. ALT reads + * Rank Sum Test of per-read likelihoods of REF versus ALT reads * *

This variant-level annotation compares the likelihoods of reads to their best haplotype match, between reads that support the reference allele and those that support the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower likelihoods to their best haplotype match than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher likelihoods to their best haplotype match than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.

* diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java index b957619f3..1cc87240b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java @@ -72,7 +72,7 @@ import java.util.*; /** * Likelihood of being a Mendelian Violation * - *

This annotation uses the likelihoods of the genotype calls to assess whether a site is transmitted from parents to offspring according to Mendelian rules. The output is the likelihood of the site being a Mendelian violation, which can be tentatively interpreted either as an indication of error (in the genotype calls) or as a possible mutation. The higher the output value, the more likely there is to be a Mendelian violation. Note that only positive values indicating likely MVs will be annotated; if the value for a given site is negative (indicating that there is no violation) the annotation is not written to the file.

+ *

This annotation uses the likelihoods of the genotype calls to assess whether a site is transmitted from parents to offspring according to Mendelian rules. The output is the likelihood of the site being a Mendelian violation, which can be tentatively interpreted either as an indication of error (in the genotype calls) or as a possible mutation. The higher the output value, the more likely there is to be a Mendelian violation. Note that only positive values indicating likely MVs will be annotated; if the value for a given site is negative (indicating that there is no violation) the annotation is not written to the file.

* *

Statistical notes

*

This annotation considers all possible combinations of all possible genotypes (homozygous-reference, heterozygous, and homozygous-variant) for each member of a trio, which amounts to 27 possible combinations. Using the Phred-scaled genotype likelihoods (PL values) from each individual, the likelihood of each combination is calculated, and the result contributes to the likelihood of the corresponding case (mendelian violation or non-violation) depending on which set it belongs to. See the method document on statistical tests for a more detailed explanation of this statistical test.

@@ -82,7 +82,7 @@ import java.util.*; *
  • The calculation assumes that the organism is diploid.
  • *
  • This annotation requires a valid pedigree file.
  • *
  • When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios.
  • - *
  • This annotation can only be used from the Variant Annotator. If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.
  • + *
  • This annotation can only be used from the VariantAnnotator. If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.
  • * * *

    Related annotations

    diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java index 9eb8924f6..883c878b6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java @@ -62,7 +62,7 @@ import java.util.*; /** - * Rank Sum Test for mapping qualities of REF vs. ALT reads + * Rank Sum Test for mapping qualities of REF versus ALT reads * *

    This variant-level annotation compares the mapping qualities of the reads supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower mapping quality scores than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher mapping quality scores than those supporting the reference allele.

    *

    This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants. diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java index b3027a695..d9ae3289a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java @@ -78,6 +78,9 @@ import java.util.Map; * *

    This annotation gives you the count of all reads that have MAPQ = 0 across all samples. The count of reads with MAPQ0 can be used for quality control; high counts typically indicate regions where it is difficult to make confident calls.

    * + *

    Caveat

    + *

    This annotation is excluded by HaplotypeCaller because HC filters out all reads with MQ0 upfront, so the annotation would always return a value of 0 anyway.

    + * *

    Related annotations

    * * - *

    Example for recalibrating SNPs in exome data

    + *

    Usage example

    + *

    Recalibrating SNPs in exome data:

    *
      * java -Xmx4g -jar GenomeAnalysisTK.jar \
      *   -T VariantRecalibrator \
    - *   -R reference/human_g1k_v37.fasta \
    - *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
    + *   -R reference.fasta \
    + *   -input raw_variants.vcf \
      *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
      *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
      *   -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.vcf \
      *   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
      *   -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an InbreedingCoeff \
      *   -mode SNP \
    - *   -recalFile path/to/output.recal \
    - *   -tranchesFile path/to/output.tranches \
    - *   -rscriptFile path/to/output.plots.R
    + *   -recalFile output.recal \
    + *   -tranchesFile output.tranches \
    + *   -rscriptFile output.plots.R
      * 
    * *

    Caveats

    diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriors.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriors.java index 421cb8386..15b0ed0e8 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriors.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriors.java @@ -77,26 +77,20 @@ import htsjdk.variant.vcf.*; import java.util.*; /** - * Calculates genotype posterior likelihoods given panel data + * Calculate genotype posterior likelihoods given panel data * *

    * Given a VCF with genotype likelihoods from the HaplotypeCaller, UnifiedGenotyper, or another source which provides - * -unbiased- GLs, calculate the posterior genotype state and likelihood given allele frequency information from - * both the samples themselves and input VCFs describing allele frequencies in related populations. + * -unbiased- genotype likelihoods, calculate the posterior genotype state and likelihood given allele frequency + * information from both the samples themselves and input VCFs describing allele frequencies in related populations.

    * - * VCFs to use for informing the genotype likelihoods (e.g. a population-specific VCF from 1000 genomes) should have - * at least one of: - * - AC field and AN field - * - MLEAC field and AN field - * - genotypes - * - * The AF field will not be used in this calculation as it does not provide a way to estimate the confidence interval + *

    The AF field will not be used in this calculation as it does not provide a way to estimate the confidence interval * or uncertainty around the allele frequency, while AN provides this necessary information. This uncertainty is * modeled by a Dirichlet distribution: that is, the frequency is known up to a Dirichlet distribution with * parameters AC1+q,AC2+q,...,(AN-AC1-AC2-...)+q, where "q" is the global frequency prior (typically q << 1). The * genotype priors applied then follow a Dirichlet-Multinomial distribution, where 2 alleles per sample are drawn * independently. This assumption of independent draws is the assumption of Hardy-Weinberg Equilibrium. Thus, HWE is - * imposed on the likelihoods as a result of CalculateGenotypePosteriors. + * imposed on the likelihoods as a result of CalculateGenotypePosteriors.

    * *

    Input

    *

    @@ -104,26 +98,28 @@ import java.util.*; *

  • A VCF with genotype likelihoods, and optionally genotypes, AC/AN fields, or MLEAC/AN fields
  • *
  • (Optional) A PED pedigree file containing the description of the individuals' relationships.
  • * - * *

    * *

    * A collection of VCFs to use for informing allele frequency priors. Each VCF must have one of - * - AC field and AN field - * - MLEAC field and AN field - * - genotypes + *

    + * *

    * *

    Output

    - *

    - * A new VCF with: - * 1) Genotype posteriors added to the genotype fields ("PP") - * 2) Genotypes and GQ assigned according to these posteriors - * 3) Per-site genotype priors added to the INFO field ("PG") - * 4) (Optional) Per-site, per-trio joint likelihoods (JL) and joint posteriors (JL) given as Phred-scaled probability + *

    A new VCF with:

    + * * *

    Notes

    *

    @@ -135,51 +131,57 @@ import java.util.*; * the input callset. *

    * - *

    Examples

    + *

    Usage examples

    + *

    Inform the genotype assignment of NA12878 using the 1000G Euro panel

    *
    - * Inform the genotype assignment of NA12878 using the 1000G Euro panel
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    + * java -jar GenomeAnalysisTK.jar \
      *   -T CalculateGenotypePosteriors \
    + *   -R reference.fasta \
      *   -V NA12878.wgs.HC.vcf \
      *   -supporting 1000G_EUR.genotypes.combined.vcf \
      *   -o NA12878.wgs.HC.posteriors.vcf
      *
    - * Refine the genotypes of a large panel based on the discovered allele frequency
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    + * 

    Refine the genotypes of a large panel based on the discovered allele frequency

    + *
    + * java -jar GenomeAnalysisTK.jar \
      *   -T CalculateGenotypePosteriors \
    + *   -R reference.fasta \
      *   -V input.vcf \
      *   -o output.withPosteriors.vcf
    + * 
    * - * Apply frequency and HWE-based priors to the genotypes of a family without including the family allele counts - * in the allele frequency estimates - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

    Apply frequency and HWE-based priors to the genotypes of a family without including the family allele counts + * in the allele frequency estimates

    + *
    + * java -jar GenomeAnalysisTK.jar \
      *   -T CalculateGenotypePosteriors \
    + *   -R reference.fasta \
      *   -V input.vcf \
      *   -o output.withPosteriors.vcf \
      *   --ignoreInputSamples
    + * 
    * - * Calculate the posterior genotypes of a callset, and impose that a variant *not seen* in the external panel - * is tantamount to being AC=0, AN=100 within that panel - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

    Calculate the posterior genotypes of a callset, and impose that a variant *not seen* in the external panel + * is tantamount to being AC=0, AN=100 within that panel

    + *
    + * java -jar GenomeAnalysisTK.jar \
      *   -T CalculateGenotypePosteriors \
    + *   -R reference.fasta \
      *   -supporting external.panel.vcf \
      *   -V input.vcf \
    - *   -o output.withPosteriors.vcf
    + *   -o output.withPosteriors.vcf \
      *   --numRefSamplesIfNoCall 100
    - *   
    - * Apply only family priors to a callset
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    - *   -T CalculateGenotypePosteriors \
    - *   -V input.vcf \
    - *   --skipPopulationPriors
    - *   -ped family.ped
    - *   -o output.withPosteriors.vcf 
    + * 
    * + *

    Apply only family priors to a callset

    + *
    + * java -jar GenomeAnalysisTK.jar \
    + *   -T CalculateGenotypePosteriors \
    + *   -R reference.fasta \
    + *   -V input.vcf \
    + *   --skipPopulationPriors \
    + *   -ped family.ped \
    + *   -o output.withPosteriors.vcf
      * 
    * */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java index 7e7926f3a..443c8d6b9 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java @@ -74,38 +74,39 @@ import htsjdk.variant.vcf.*; import java.util.*; /** - * Combines any number of gVCF files that were produced by the Haplotype Caller into a single joint gVCF file. + * Combine per-sample gVCF files produced by HaplotypeCaller into a multi-sample gVCF file * *

    * CombineGVCFs is meant to be used for hierarchical merging of gVCFs that will eventually be input into GenotypeGVCFs. * One would use this tool when needing to genotype too large a number of individual gVCFs; instead of passing them * all in to GenotypeGVCFs, one would first use CombineGVCFs on smaller batches of samples and then pass these combined - * gVCFs to GenotypeGVCFs. - * - * Note that this tool cannot work with just any gVCF files - they must have been produced with the Haplotype Caller - * as part of the "single sample discovery" pipeline using the '-ERC GVCF' mode, which uses a sophisticated reference - * model to produce accurate genotype likelihoods for every position in the target. + * gVCFs to GenotypeGVCFs.

    * *

    Input

    *

    - * One or more Haplotype Caller gVCFs to combine. + * Two or more HaplotypeCaller gVCFs to combine. *

    * *

    Output

    *

    - * A combined VCF. + * A combined multisample gVCF. *

    * - *

    Examples

    + *

    Usage example

    *
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    + * java -jar GenomeAnalysisTK.jar \
      *   -T CombineGVCFs \
    - *   --variant gvcf1.vcf \
    - *   --variant gvcf2.vcf \
    - *   -o mergeGvcf.vcf
    + *   -R reference.fasta \
    + *   --variant sample1.g.vcf \
    + *   --variant sample2.g.vcf \
    + *   -o cohort.g.vcf
      * 
    * + *

    Caveat

    + *

    Only gVCF files produced by HaplotypeCaller (or CombineGVCFs) can be used as input for this tool. Some other + * programs produce files that they call gVCFs but those lack some important information (accurate genotype likelihoods + * for every position) that GenotypeGVCFs requires for its operation.

    + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=0,stop=1)) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index ea7c4671c..032ee5f0c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -85,22 +85,19 @@ import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import java.util.*; /** - * Genotypes any number of gVCF files that were produced by the Haplotype Caller into a single joint VCF file. + * Perform joint genotyping on gVCF files produced by HaplotypeCaller * *

    - * GenotypeGVCFs merges gVCF records that were produced as part of the reference model-based variant discovery pipeline (see documentation for more details) using - * the '-ERC GVCF' or '-ERC BP_RESOLUTION' mode of the HaplotypeCaller. This tool performs the multi-sample joint aggregation - * step and merges the records together in a sophisticated manner. - * - * At all positions of the target, this tool will combine all spanning records, produce correct genotype likelihoods, - * re-genotype the newly merged record, and then re-annotate it. - * - * Note that this tool cannot work with just any gVCF files - they must have been produced with the HaplotypeCaller, - * which uses a sophisticated reference model to produce accurate genotype likelihoods for every position in the target. + * GenotypeGVCFs merges gVCF records that were produced as part of the Best Practices workflow for variant discovery + * (see Best Practices documentation for more details) using the '-ERC GVCF' or '-ERC BP_RESOLUTION' mode of the + * HaplotypeCaller, or result from combining such gVCF files using CombineGVCFs. This tool performs the multi-sample + * joint aggregation step and merges the records together in a sophisticated manner: at each position of the input + * gVCFs, this tool will combine all spanning records, produce correct genotype likelihoods, re-genotype the newly + * merged record, and then re-annotate it.

    * *

    Input

    *

    - * One or more Haplotype Caller gVCFs to genotype. + * One or more HaplotypeCaller gVCFs to genotype. *

    * *

    Output

    @@ -108,16 +105,25 @@ import java.util.*; * A combined, genotyped VCF. *

    * - *

    Examples

    + *

    Usage example

    *
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    + * java -jar GenomeAnalysisTK.jar \
      *   -T GenotypeGVCFs \
    - *   --variant gvcf1.vcf \
    - *   --variant gvcf2.vcf \
    + *   -R reference.fasta \
    + *   --variant sample1.g.vcf \
    + *   --variant sample2.g.vcf \
      *   -o output.vcf
      * 
    * + *

    Caveat

    + *

    Only gVCF files produced by HaplotypeCaller (or CombineGVCFs) can be used as input for this tool. Some other + * programs produce files that they call gVCFs but those lack some important information (accurate genotype likelihoods + * for every position) that GenotypeGVCFs requires for its operation.

    + * + *

    Special note on ploidy

    + *

    This tool is able to handle any ploidy (or mix of ploidies) intelligently; there is no need to specify ploidy + * for non-diploid organisms.

    + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=-10,stop=10)) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java index 8b94a56a6..134f5e514 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java @@ -84,13 +84,14 @@ import java.util.HashSet; import java.util.Set; /** - * Regenotypes the variants from a VCF. VCF records must contain PLs or GLs. + * Regenotypes the variants from a VCF containing PLs or GLs. * *

    - * This tool triggers re-genotyping of the samples through the Exact Allele Frequency calculation model. Note that this is truly the - * mathematically correct way to select samples from a larger set (especially when calls were generated from low coverage sequencing data); - * using the hard genotypes to select (i.e. the default mode of SelectVariants) can lead to false positives when errors are confused for - * variants in the original genotyping. This functionality used to comprise the --regenotype option in SelectVariants but we pulled it out + * This tool triggers re-genotyping of the samples through the Exact Allele Frequency calculation model. Note that + * this is truly the mathematically correct way to select samples from a larger set (especially when calls were + * generated from low coverage sequencing data); using the hard genotypes to select (i.e. the default mode of + * SelectVariants) can lead to false positives when errors are confused for variants in the original genotyping. + * This functionality used to comprise the --regenotype option in SelectVariants but we pulled it out * into its own tool for technical purposes. * *

    Input

    @@ -103,11 +104,11 @@ import java.util.Set; * A re-genotyped VCF. *

    * - *

    Examples

    + *

    Usage example

    *
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    + * java -jar GenomeAnalysisTK.jar \
      *   -T RegenotypeVariants \
    + *   -R reference.fasta \
      *   --variant input.vcf \
      *   -o output.vcf
      * 
    diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java index 05b770005..328960390 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java @@ -43,7 +43,7 @@ import java.util.*; /** * All command line parameters accepted by all tools in the GATK. * - *

    Info for general users

    + *

    Info for end users

    * *

    This is a list of options and parameters that are generally available to all tools in the GATK.

    * @@ -51,7 +51,7 @@ import java.util.*; * argument is only meant to be used with a subset of tools, and the -pedigree argument will only be effectively used * by a subset of tools as well. Some arguments conflict with others, and some conversely are dependent on others. This * is all indicated in the detailed argument descriptions, so be sure to read those in their entirety rather than just - * skimming the one-line summaey in the table.

    + * skimming the one-line summary in the table.

    * *

    Info for developers

    * diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java index ddcf373e1..4fec3e240 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java @@ -63,10 +63,11 @@ public class GATKArgumentCollection { @Input(fullName = "input_file", shortName = "I", doc = "Input file containing sequence data (SAM or BAM)", required = false) public List samFiles = new ArrayList<>(); - @Hidden + @Advanced @Argument(fullName = "showFullBamList",doc="Emit a log entry (level INFO) containing the full list of sequence data files to be included in the analysis (including files inside .bam.list files).") public Boolean showFullBamList = false; + @Advanced @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false, minValue = 0) public Integer readBufferSize = null; @@ -79,11 +80,11 @@ public class GATKArgumentCollection { /** * By default, GATK generates a run report that is uploaded to a cloud-based service. This report contains basic * statistics about the run (which tool was used, whether the run was successful etc.) that help us for debugging - * and development. Up to version 3.2-2 the run report contains a record of the username and hostname associated + * and development. Up to version 3.3-0 the run report contains a record of the username and hostname associated * with the run, but it does **NOT** contain any information that could be used to identify patient data. 
* Nevertheless, if your data is subject to stringent confidentiality clauses (no outside communication) or if your * run environment is not connected to the internet, you can disable the reporting system by seeting this option to - * "NO_ET". You will also need to request a key using the online request form on our website (se FAQs). + * "NO_ET". You will also need to request a key using the online request form on our website (see FAQs). */ @Argument(fullName = "phone_home", shortName = "et", doc="Run reporting mode", required = false) public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.AWS; @@ -95,9 +96,10 @@ public class GATKArgumentCollection { public File gatkKeyFile = null; /** - * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary tag that can be - * used to group together runs during later analysis. One use of this capability is to tag runs as GATK - * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. + * The GATKRunReport supports tagging GATK runs with an arbitrary tag that can be + * used to group together runs during later analysis (as of GATK 2.2) . One use of this capability is to tag + * runs as GATK performance tests, so that the performance of the GATK over time can be assessed from the logs + * directly. * * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find * meaningful. @@ -164,9 +166,9 @@ public class GATKArgumentCollection { // // -------------------------------------------------------------------------------------------------------------- /** - * There are several ways to downsample reads, i.e. to removed reads from the pile of reads that will be used for analysis. - * See the documentation of the individual downsampling options for details on how they work. 
Note that Many GATK tools - * specify a default downsampling type and target, but this behavior can be overridden from command line using the + * There are several ways to downsample reads, i.e. to remove reads from the pile of reads that will be used for analysis. + * See the documentation of the individual downsampling options for details on how they work. Note that many GATK tools + * specify a default downsampling type and target, but this behavior can be overridden from the command line using the * downsampling arguments. */ @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of read downsampling to employ at a given locus", required = false) @@ -233,11 +235,13 @@ public class GATKArgumentCollection { // BAQ arguments // // -------------------------------------------------------------------------------------------------------------- + @Advanced @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF; /** * Phred-scaled gap open penalty for BAQ calculation. Although the default value is 40, a value of 30 may be better for whole genome call sets. */ + @Advanced @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0) public double BAQGOP = BAQ.DEFAULT_GOP; @@ -328,7 +332,7 @@ public class GATKArgumentCollection { * Any value greater than zero will be used to recalculate the quantization using that many levels. * Negative values mean that we should quantize using the recalibration report's quantization level. 
*/ - @Hidden + @Advanced @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) public int quantizationLevels = 0; @@ -352,11 +356,13 @@ public class GATKArgumentCollection { * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, * your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream. */ + @Advanced @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Don't recalibrate bases with quality scores less than this threshold (with -BQSR)", required = false, minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE) public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE; /** * If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score. */ + @Advanced @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "Global Qscore Bayesian prior to use for BQSR", required = false) public double globalQScorePrior = -1.0; @@ -398,16 +404,16 @@ public class GATKArgumentCollection { /** * For expert users only who know what they are doing. We do not support usage of this argument, so we may refuse to help you if you use it and something goes wrong. The one exception to this rule is ALLOW_N_CIGAR_READS, which is necessary for RNAseq analysis. */ + @Advanced @Argument(fullName = "unsafe", shortName = "U", doc = "Enable unsafe operations: nothing will be checked at runtime", required = false) public ValidationExclusion.TYPE unsafe; /** - * UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking + * Not recommended for general use. Disables both auto-generation of index files and index file locking * when reading VCFs and other rods and an index isn't present or is out-of-date. 
The file locking necessary for auto index * generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it * for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general * because it allows reading from index files without first acquiring a lock. */ - @Hidden @Advanced @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", doc = "Disable both auto-generation of index files and index file locking", @@ -451,6 +457,7 @@ public class GATKArgumentCollection { required = false) public boolean simplifyBAM = false; + @Advanced @Argument(fullName = "disable_bam_indexing", doc = "Turn off on-the-fly creation of indices for output BAM files.", required = false) public boolean disableBAMIndexing = false; diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java index 05c02c722..0b7d1a905 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java @@ -33,16 +33,29 @@ import htsjdk.samtools.SAMRecord; import java.util.Iterator; /** - * Filter out reads with wonky cigar strings. + * Filter out reads with wonky CIGAR strings * - * - No reads with a different length and cigar length - * - No reads with Hard/Soft clips in the middle of the cigar - * - No reads starting with deletions (with or without preceding clips) - * - No reads ending in deletions (with or without follow-up clips) - * - No reads that are fully hard or soft clipped - * - No reads that have consecutive indels in the cigar (II, DD, ID or DI) + *

    This read filter will filter out the following cases:

    + *
      + *
    • read length different from the CIGAR length
    • + *
    • Hard/Soft clips in the middle of the cigar
    • + *
    • starting with deletions (with or without preceding clips)
    • + *
    • ending in deletions (with or without follow-up clips)
    • + *
    • fully hard or soft clipped
    • + *
    • consecutive indels in the cigar (II, DD, ID or DI)
    • + *
    * - * ps: apparently an empty cigar is okay... + *

    Usage example

    + * + *

    Enable the bad cigar filter

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf BadCigar
    + * 
    * * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java index c25d8d9ca..562e50ea9 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java @@ -28,13 +28,40 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out reads whose mate maps to a different contig. + * Filter out reads whose mate maps to a different contig + * + *

    This filter is intended to ensure that only reads that are likely to be mapped in the right place, and therefore + * to be informative, will be used in analysis. If mates in a pair are mapping to different contigs, it is likely that + * at least one of them is in the wrong place. One exception is if you are using a draft genome assembly in which the + * chromosomes are fragmented into many contigs; then you may legitimately have reads that are correctly mapped but are + * on different contigs than their mate. This read filter can be disabled from the command line using the -drf argument. + *

    + * + *

    Enable the bad mate filter

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf BadMate
    + * 
    + * + *

    Disable the bad mate filter

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -drf BadMate
    + * 
    * * @author ebanks * @version 0.1 */ -public class BadMateFilter extends ReadFilter { +public class BadMateFilter extends DisableableReadFilter { public boolean filterOut(final SAMRecord rec) { return hasBadMate(rec); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java index aa45b250a..310f1dee3 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java @@ -53,7 +53,32 @@ import htsjdk.samtools.SAMRecord; */ /** - * Filter out duplicate reads. + * Filter out duplicate reads + * + *

    This filter recognizes the SAM flag set by MarkDuplicates. It can be disabled from the command line if needed + * using the -drf argument.

    + * + *

    Usage examples

    + * + *

    Enable the duplicate read filter

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf DuplicateRead
    + * 
    + * + *

    Disable the duplicate read filter

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -drf DuplicateRead
    + * 
    * * @author rpoplin * @since Dec 9, 2009 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java index 2cc5e2a8b..fc5cdcb53 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java @@ -28,7 +28,9 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out reads that fail the vendor quality check. + * Filter out reads that fail the vendor quality check + * + *

    This filter recognizes the SAM flag corresponding to the vendor quality check.

    * * @author rpoplin * @since Jul 19, 2010 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java index 8b0f07624..d6e78a616 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java @@ -33,6 +33,22 @@ import org.broadinstitute.gatk.engine.filters.ReadFilter; /** * Only use reads from the specified library * + *

    This filter is useful for running on only a subset of the data as identified by a read group property. + * In the case of the library filter, the goal is usually to run quality control checks on a particular library.

    + * + *

    Usage example

    + * + *

    Enable the library read filter

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf LibraryRead \
    + *         -library library_name
    + * 
    + * * @author kcibul * @since Aug 15, 2012 * diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java index 6488a857a..05c6f564e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java @@ -34,7 +34,26 @@ import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; import org.broadinstitute.gatk.utils.exceptions.UserException; /** - * Filter out malformed reads. + * Filter out malformed reads + * + *

    This filter is applied automatically by all GATK tools in order to protect them from crashing on reads that are + * grossly malformed. There are a few issues (such as the absence of sequence bases) that will cause the run to fail with an + * error, but these cases can be preempted by setting flags that cause the problem reads to also be filtered.

    + * + *

    Usage example

    + * + *

    Set the malformed read filter to filter out reads that have no sequence bases

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -filterNoBases
    + * 
    + * + *

    Note that the MalformedRead filter itself does not need to be specified in the command line because it is set + * automatically.

    * * @author mhanna * @version 0.1 @@ -46,14 +65,14 @@ public class MalformedReadFilter extends ReadFilter { private SAMFileHeader header; - @Argument(fullName = FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME, shortName = "filterRNC", doc = "filter out reads with CIGAR containing the N operator, instead of stop processing and report an error.", required = false) + @Argument(fullName = FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME, shortName = "filterRNC", doc = "Filter out reads with CIGAR containing the N operator, instead of failing with an error", required = false) boolean filterReadsWithNCigar = false; - @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false) + @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error", required = false) boolean filterMismatchingBaseAndQuals = false; - @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "if a read has no stored bases (i.e. a '*'), filter out the read instead of blowing up.", required = false) + @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "Filter out reads with no stored bases (i.e. 
'*' where the sequence should be), instead of failing with an error", required = false) boolean filterBasesNotStored = false; /** diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java index 67c62b975..58ec76660 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java @@ -29,7 +29,23 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Filter out reads with low mapping qualities. + * Filter out reads with low mapping qualities + * + *

    This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.

    + * + *

    Usage example

    + * + *

    Set the mapping quality filter to filter out reads that have MAPQ < 15

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T HaplotypeCaller \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.vcf \
    + *         -rf MappingQuality \
    + *         -mmq 15
    + * 
    * * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java index 05df7fb0d..ff1542e41 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java @@ -29,7 +29,22 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.QualityUtils; /** - * Filter out mapping quality zero reads. + * Filter out reads with no mapping quality information + * + * + *

    This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf MappingQualityUnavailable
    + * 
    * * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java index f3f703278..b0d40c074 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java @@ -28,7 +28,21 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out mapping quality zero reads. + * Filter out reads with mapping quality zero + * + *

    This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf MappingQualityZero
    + * 
    * * @author hanna * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java index c6a79e1a3..20dda5427 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java @@ -28,7 +28,30 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out reads that are not paired, have their mate unmapped, are duplicates, fail vendor quality check or both mate and read are in the same strand. + * Filter out reads with bad pairing (and related) properties + * + *

    This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis. + * The following cases will be filtered out: + *

    + *
      + *
    • is not paired
    • + *
    • mate is unmapped
    • + *
    • is duplicate
    • + *
    • fails vendor quality check
    • + *
    • both mate and read are in the same strand orientation
    • + *
    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf MateSameStrand
    + * 
    * * @author chartl * @since 5/18/11 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java index cca05ebc7..c7b512f2b 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java @@ -29,13 +29,28 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Filter out reads that exceed a given max insert size + * Filter out reads that exceed a given insert size + * + *

    This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf MaxInsertSize \
    + *         -maxInsert 10000
    + * 
    * * @author chartl * @since 5/2/11 */ public class MaxInsertSizeFilter extends ReadFilter { - @Argument(fullName = "maxInsertSize", shortName = "maxInsert", doc="Discard reads with insert size greater than the specified value, defaults to 1000000", required=false) + @Argument(fullName = "maxInsertSize", shortName = "maxInsert", doc="Insert size cutoff", required=false) private int maxInsertSize = 1000000; public boolean filterOut(SAMRecord record) { diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java index 21b291bb3..0a7a2cdbf 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java @@ -28,7 +28,21 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out reads without read groups. + * Filter out reads without read group information + * + *

    Many GATK tools are dependent on having read group information in order to operate correctly. This filter excludes + * any reads that have not been appropriately identified.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf MissingReadGroup
    + * 
    * * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java index 8297903d8..4e8a1dc2b 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java @@ -27,33 +27,22 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ /** - * Filter out reads that don't have base an original quality quality score tag (usually added by BQSR) + * Filter out reads that do not have an original quality score (OQ) tag + * + *

    The OQ tag can be added during the base recalibration process in order to preserve original information.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf NoOriginalQualityScores
    + * 
    * * @author rpoplin * @since Nov 19, 2009 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java index 4c8f412e2..55a697d3e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java @@ -28,7 +28,22 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out reads that are secondary alignments (not one of the best alignments) + * Filter out reads that are secondary alignments + * + *

    This filter recognizes the SAM flag that identifies secondary alignments (i.e. not the best alignment). + * It is intended to ensure that only reads that are likely to be mapped in the right place, and therefore to be + * informative, will be used in analysis.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf NotPrimaryAlignment
    + * 
    * * @author rpoplin * @since Dec 9, 2009 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java index 79f16a5fc..f1b375835 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java @@ -30,7 +30,21 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; /** - * Filter out 454 reads. + * Filter out reads produced by 454 technology + * + *

    Reads produced by 454 technology should not be processed by the GATK's indel realignment tools. This filter is + * applied by those tools to enforce that rule.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf Platform454
    + * 
    * * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java index 8236cc219..7ca07d35d 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java @@ -31,7 +31,23 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; /** - * Filter out PL matching reads. + * Filter out reads that were generated by a specific sequencing platform + * + *

    This filter is useful for running on only a subset of the data as identified by a read group property. + * In the case of the platform filter, the goal is usually to blacklist certain sequencing technologies at certain processing steps + * if we know there is an incompatibility problem (like 454 and indel realignment, which is special-cased).

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf Platform \
    + *         -PLFilterName platform_name
    + * 
    * * @author ebanks * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java index 4a6781ff5..b0e0bbebb 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java @@ -1,28 +1,28 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMReadGroupRecord; @@ -33,7 +33,11 @@ import java.util.HashSet; import java.util.Set; /** - * Filter out reads that have blacklisted platform unit tags. (See code documentation for how to create the blacklist). + * Filter out reads with blacklisted platform unit tags + * + *

    This filter is useful for running on only a subset of the data as identified by a read group property. + * In the case of the platform unit filter, the goal is usually to blacklist certain runs if we know there was a problem with + * a particular sequencing machine.

    * * @author asivache * @since Sep 21, 2009 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java index 7c6bfb0e3..9f815cf72 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java @@ -36,10 +36,23 @@ import java.util.*; import java.util.Map.Entry; /** - * Removes records matching the read group tag and exact match string. - * For example, this filter value: - * PU:1000G-mpimg-080821-1_1 - * would filter out a read with the read group PU:1000G-mpimg-080821-1_1 + * Filter out reads matching a read group tag value + * + *

    This filter is useful for running on only a subset of the data as identified by a read group property, + * using expression matching against the read group tags.

    + * + *

    Usage example

    + * + *

    Set the read group filter to blacklist read groups that have the PU tag "1000G-mpimg-080821-1_1"

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf ReadGroupBlackList \
    + *         -rgbl PU:1000G-mpimg-080821-1_1
    + * 
    */ public class ReadGroupBlackListFilter extends ReadFilter { private Set>> filterEntries; diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java index 1e44df806..f9a6fab57 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java @@ -29,7 +29,22 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Filters out reads whose length is >= some value or < some value. + * Filter out reads based on length + * + *

    This filter is useful for running on only reads that are longer (or shorter) than the given threshold sizes.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf ReadLength \
    + *         -minRead 50 \
    + *         -maxRead 101
    + * 
    * * @author mhanna * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java index 23a5151de..cdee7e14b 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java @@ -29,13 +29,28 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Filter out all reads except those with this read name + * Only use reads with this read name + * + *

    This filter is useful for isolating a particular read, pair of reads or set of alignments for a given read + * when troubleshooting issues where the error message provided a culprit name.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf ReadName \
    + *         -rn read_name
    + * 
    * * @author chartl * @since 9/19/11 */ public class ReadNameFilter extends ReadFilter { - @Argument(fullName = "readName", shortName = "rn", doc="Filter out all reads except those with this read name", required=true) + @Argument(fullName = "readName", shortName = "rn", doc="Read name to whitelist", required=true) private String readName; public boolean filterOut(final SAMRecord rec) { diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java index fd2876654..292803d1c 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java @@ -29,7 +29,23 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Filters out reads whose strand is negative or positive + * Filter out reads based on strand orientation + * + *

    This filter is useful for isolating reads from only forward or reverse strands. By default, it filters out reads + * from the negative (reverse) strand. This logic can be reversed by using the -filterPositive flag.

    + * + *

    Usage example

    + * + *

    Set the read strand filter to filter out positive (forward) strand reads

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf ReadStrand \
    + *         -filterPositive
    + * 
    * * @author chartl * @version 0.1 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java index 0c8a93a83..89be38db7 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java @@ -29,7 +29,7 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * A read filter (transformer) that sets all reads mapping quality to a given value. + * Set the mapping quality of all reads to a given value. * *

    * If a BAM file contains erroneous or missing mapping qualities (MAPQ), this read transformer will set all your @@ -55,16 +55,18 @@ import org.broadinstitute.gatk.utils.commandline.Argument; * BAM file(s) *

    * - * *

    Output

    *

    - * BAM file(s) with all reads mapping qualities reassigned + * BAM file(s) with the mapping qualities of all reads reassigned to the specified value *

    * - *

    Examples

    + *

    Usage example

    *
      *  java -jar GenomeAnalysisTK.jar \
      *      -T PrintReads \
    + *      -R reference.fasta \
    + *      -I input.bam \
    + *      -o output.file \
      *      -rf ReassignMappingQuality \
      *      -DMQ 35
      *  
    diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java index f07f197c6..2ff1d5a4e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java @@ -29,7 +29,7 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * A read filter (transformer) that changes a given read mapping quality to a different value. + * Set the mapping quality of reads with a given value to another given value. * *

    * This read transformer will change a certain read mapping quality to a different value without affecting reads that @@ -57,12 +57,15 @@ import org.broadinstitute.gatk.utils.commandline.Argument; * BAM file(s) with one read mapping quality selectively reassigned as desired *

    * - *

    Examples

    + *

    Usage example

    *
    - *    java -jar GenomeAnalysisTK.jar
    - *      -T PrintReads
    - *      -rf ReassignOneMappingQuality
    - *      -RMQF 255
    + *    java -jar GenomeAnalysisTK.jar \
    + *      -T PrintReads \
    + *      -R reference.fasta \
    + *      -I input.bam \
    + *      -o output.file \
    + *      -rf ReassignOneMappingQuality \
    + *      -RMQF 255 \
      *      -RMQT 60
      *  
    * diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java index 2ec0112ab..ab63e1e00 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java @@ -32,7 +32,23 @@ import org.broadinstitute.gatk.utils.commandline.Argument; import java.util.Set; /** - * Filter out all reads except those with this sample + * Only use reads belonging to a specific sample + * + *

    This filter is useful for isolating data from one particular sample in a multisample file.

    + * + *

    Usage example

    + * + *

    Use only reads from the sample named NA12878

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf Sample \
    + *         -goodSM NA12878
    + * 
    + * */ public class SampleFilter extends ReadFilter { @Argument(fullName = "sample_to_keep", shortName = "goodSM", doc="The name of the sample(s) to keep, filtering out all others", required=true) diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java index 5a9d21476..58cf9183d 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java @@ -30,7 +30,22 @@ import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; /** - * Only use reads from the specified read group. + * Only use reads from the specified read group + * + *

    This filter is useful for isolating data from one particular read group (usually a single lane).

    + * + *

    Usage example

    + * + *

    Use only reads from the read group with ID "read_group_1"

    + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf SingleReadGroup \
    + *         -goodRG read_group_1
    + * 
    * * @author rpoplin * @since Nov 27, 2009 diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java index e9cc30276..d5f8d30ff 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java @@ -28,7 +28,22 @@ package org.broadinstitute.gatk.engine.filters; import htsjdk.samtools.SAMRecord; /** - * Filter out unmapped reads. + * Filter out unmapped reads + * + * + *

    This filter recognizes the SAM flag corresponding to being unmapped. It is intended to ensure that only + * reads that are likely to be mapped in the right place, and therefore to be informative, will be used in analysis.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTK.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf UnmappedRead
    + * 
    * * @author rpoplin * @since Dec 9, 2009 diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java index 155566aea..391b0202f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java @@ -56,24 +56,23 @@ import java.util.*; /** * - * Concatenates VCF files of non-overlapped genome intervals, all with the same set of samples + * Concatenate VCF files of non-overlapping genome intervals, all with the same set of samples * *

    * The main purpose of this tool is to speed up the gather function when using scatter-gather parallelization. * This tool concatenates the scattered output VCF files. It assumes that: - * - All the input VCFs (or BCFs) contain the same samples in the same order. - * - The variants in each input file are from non-overlapping (scattered) intervals. - * - * When the input files are already sorted based on the intervals start positions, use -assumeSorted. - * - * Note: Currently the tool is more efficient when working with VCFs; we will work to make it as efficient for BCFs. - * + *

      + *
    • All the input VCFs (or BCFs) contain the same samples in the same order.
    • + *
    • The variants in each input file are from non-overlapping (scattered) intervals.
    • + *
    *

    + *

    When the input files are already sorted based on the intervals start positions, use -assumeSorted.

    * *

    Input

    *

    - * One or more variant sets to combine. They should be of non-overlapping genome intervals and with the same samples (in the same order). - * If the files are ordered according to the appearance of intervals in the ref genome, then one can use the -assumeSorted flag. + * Two or more variant sets to combine. They should be of non-overlapping genome intervals and with the same + * samples (sorted in the same order). If the files are ordered according to the appearance of intervals in the ref + * genome, then one can use the -assumeSorted flag. *

    * *

    Output

    @@ -86,16 +85,19 @@ import java.util.*; * invoke it is a little different from other GATK tools (see example below), and it does not accept any of the * classic "CommandLineGATK" arguments.

    * - *

    Example

    + *

    Usage example

    *
      * java -cp GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants \
    - *    -R ref.fasta \
    + *    -R reference.fasta \
      *    -V input1.vcf \
      *    -V input2.vcf \
      *    -out output.vcf \
      *    -assumeSorted
      * 
    * + *

    Caveat

    + *

    Currently the tool is more efficient when working with VCFs than with BCFs.

    + * * @author Ami Levy Moonshine * @since Jan 2012 */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java index 19c0d2697..1c99fa8fc 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java @@ -65,7 +65,7 @@ import java.util.Set; *
      *
    • This annotation will only work properly for biallelic heterozygous calls.
    • *
    • This annotation cannot currently be calculated for indels.
    • - *
    • tThe reasoning underlying this annotation only applies to germline variants in DNA sequencing data. In somatic/cancer analysis, divergent ratios are expected due to tumor heterogeneity. In RNAseq analysis, divergent ratios may indicate differential allele expression.
    • + *
    • The reasoning underlying this annotation only applies to germline variants in DNA sequencing data. In somatic/cancer analysis, divergent ratios are expected due to tumor heterogeneity. In RNAseq analysis, divergent ratios may indicate differential allele expression.
    • *
    • As stated above, this annotation is experimental and should be interpreted with caution as we cannot guarantee that it is appropriate. Basically, use it at your own risk.
    • *
    *

    Related annotations

    diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java index 995279927..1d4b7a002 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java @@ -49,8 +49,7 @@ import java.util.Map; *

    This annotation tells you what fraction of reads have a mapping quality of less than the given threshold of 10 (including 0). Note that certain tools may impose a different minimum mapping quality threshold. For example, HaplotypeCaller excludes reads with MAPQ<20.

    * *

    Calculation

    - *

    $$ LowMQ = \frac{# reads with MAPQ=0 + # reads with MAPQ<10}{total # reads} $$ - *

    + * $$ LowMQ = \frac{# reads with MAPQ=0 + # reads with MAPQ<10}{total # reads} $$ * *

    Related annotations

    *
      diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java index 5e632dc7c..13057381e 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java @@ -49,6 +49,9 @@ import java.util.List; * *

      This annotation gives you the count of all reads that have MAPQ = 0 for each sample. The count of reads with MAPQ0 can be used for quality control; high counts typically indicate regions where it is difficult to make confident calls.

      * + *

      Caveat

      + *

      This annotation is excluded by HaplotypeCaller because HC filters out all reads with MQ0 upfront, so the annotation would always return a value of 0 anyway.

      + * *

      Related annotations

      *
        *
      • MappingQualityZero gives the count of reads with MAPQ=0 across all samples.
      • diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java index 5ec474119..465fa285f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java @@ -48,7 +48,8 @@ import java.util.Map; * *

        N occurs in a sequence when the sequencer does not have enough information to determine which base it should call. The presence of many Ns at the same site lowers our confidence in any calls made there, because it suggests that there was some kind of technical difficulty that interfered with the sequencing process.

        * - *

        Note that in GATK versions 3.2 and earlier, this annotation only counted N bases from reads generated with SOLiD technology. This functionality was generalized for all sequencing platforms in GATK version 3.3.

        + *

        Note

        + *

        In GATK versions 3.2 and earlier, this annotation only counted N bases from reads generated with SOLiD technology. This functionality was generalized for all sequencing platforms in GATK version 3.3.

        * *

        Related annotations

        *
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java index 90cd9ec47..9da3de861 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java @@ -46,9 +46,9 @@ import java.util.regex.Pattern; /** * Top effect from SnpEff functional predictions * - *

          This annotation processes the output of the SnpEff functional prediction tool to select only the predicted effect with the highest biological impact. The SnpEff output must be provided on the command line by specifying "--snpEffFile filename.vcf". See http://snpeff.sourceforge.net/ for more information about the SnpEff tool

          . + *

          This annotation processes the output of the SnpEff functional prediction tool to select only the predicted effect with the highest biological impact. The SnpEff output must be provided on the command line by specifying "--snpEffFile filename.vcf". See http://snpeff.sourceforge.net/ for more information about the SnpEff tool.

          * - *

          Caveats

          + *

          Caveat

          * *
          • This annotation currently only supports output from SnpEff version 2.0.5.
          * diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java index f2d60bb02..c748f75ce 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java @@ -50,11 +50,13 @@ import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; import java.util.*; /** - * Annotates variant calls with context information. + * Annotate variant calls with context information * *

          - * VariantAnnotator is a GATK tool for annotating variant calls based on their context. - * The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself. + * This tool is designed to annotate variant calls based on their context (as opposed to functional annotation). + * Various annotation modules are available; see the + * documentation + * for a complete list. * *

          Input

          *

          @@ -66,15 +68,15 @@ import java.util.*; * An annotated VCF. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
          + *   -R reference.fasta \
            *   -T VariantAnnotator \
            *   -I input.bam \
            *   -o output.vcf \
            *   -A Coverage \
          - *   --variant input.vcf \
          + *   -V input.vcf \
            *   -L input.vcf \
            *   --dbsnp dbsnp.vcf
            * 
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java index 332486b1a..cc12172a1 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java @@ -47,25 +47,25 @@ import java.io.PrintStream; /** - * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome - *

          + * Collect statistics on callable, uncallable, poorly mapped, and other parts of the genome + * *

          - * A very common question about a NGS set of reads is what areas of the genome are considered callable. The system + * A very common question about a NGS set of reads is what areas of the genome are considered callable. This tool * considers the coverage at each locus and emits either a per base state or a summary interval BED file that * partitions the genomic intervals into the following callable states: *

          *
          REF_N
          - *
          the reference base was an N, which is not considered callable the GATK
          + *
          The reference base was an N, which is not considered callable by the GATK
          *
          PASS
          - *
          the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
          + *
          The base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
          *
          NO_COVERAGE
          - *
          absolutely no reads were seen at this locus, regardless of the filtering parameters
          + *
          Absolutely no reads were seen at this locus, regardless of the filtering parameters
          *
          LOW_COVERAGE
          - *
          there were less than min. depth bases at the locus, after applying filters
          + *
          There were fewer than min. depth bases at the locus, after applying filters
          *
          EXCESSIVE_COVERAGE
          - *
          more than -maxDepth read at the locus, indicating some sort of mapping problem
          + *
          More than -maxDepth reads at the locus, indicating some sort of mapping problem
          *
          POOR_MAPPING_QUALITY
          - *
          more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
          + *
          More than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
          *
          *

          *

          @@ -76,22 +76,19 @@ import java.io.PrintStream; *

          *

          Output

          *

          - *

            - *
          • -o: a OutputFormatted (recommended BED) file with the callable status covering each base
          • - *
          • -summary: a table of callable status x count of all examined bases
          • - *
          + * A file with the callable status covering each base and a table of callable status x count of all examined bases *

          - *

          - *

          Examples

          + *

          Usage example

          *
            *  java -jar GenomeAnalysisTK.jar \
            *     -T CallableLoci \
          - *     -I my.bam \
          - *     -summary my.summary \
          - *     -o my.bed
          + *     -R reference.fasta \
          + *     -I myreads.bam \
          + *     -summary table.txt \
          + *     -o callable_status.bed
            * 
          *

          - * would produce a BED file (my.bed) that looks like: + * would produce a BED file that looks like: *

          *

            *     20 10000000 10000864 PASS
          @@ -107,14 +104,13 @@ import java.io.PrintStream;
            *     20 10012552 10012554 PASS
            *     20 10012555 10012557 LOW_COVERAGE
            *     20 10012558 10012558 PASS
          - *     et cetera...
            * 
          * as well as a summary table that looks like: *

          *

            *                        state nBases
            *                        REF_N 0
          - *                     PASS 996046
          + *                         PASS 996046
            *                  NO_COVERAGE 121
            *                 LOW_COVERAGE 928
            *           EXCESSIVE_COVERAGE 0
          @@ -131,7 +127,7 @@ public class CallableLoci extends LocusWalkerThis tool can be used to evaluate how different sequence datasets compare in terms of "callability"
          + * based on the output of the CallableLoci tool. 

          + * + * + *

          Input

          + *

          + * Two files to compare, output by two runs of CallableLoci + *

          + * + *

          Output

          + *

          + * A table showing the callability status of each interval of interest in the two comparison sets and whether they match. + *

          + * + *

          Usage example

          + *
          + * java -jar GenomeAnalysisTK.jar \
          + *   -R reference.fasta \
          + *   -T CompareCallableLoci \
          + *   -comp1 callable_loci_1.bed \
          + *   -comp2 callable_loci_2.bed \
          + *   [-L input.intervals \]
          + *   -o comparison.table
          + * 
          + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class CompareCallableLoci extends RodWalker, long[][]> { @@ -103,7 +129,7 @@ public class CompareCallableLoci extends RodWalker bindings = tracker.getValues(rodBinding); if ( bindings.size() != 1 ) { - throw new UserException.MalformedFile(String.format("%s track isn't a properly formated CallableBases object!", rodBinding.getName())); + throw new UserException.MalformedFile(String.format("%s track isn't a properly formatted CallableBases object!", rodBinding.getName())); } BEDFeature bed = bindings.get(0); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java index 92395a4f9..098e81c94 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java @@ -62,46 +62,37 @@ import java.util.*; * This tool processes a set of bam files to determine coverage at different levels of partitioning and * aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by * sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, - * and/or percentage of bases covered to or beyond a threshold. - * Additionally, reads and bases can be filtered by mapping or base quality score. + * and/or percentage of bases covered to or beyond a threshold. Additionally, reads and bases can be filtered by + * mapping or base quality score. + *

          * *

          Input

          - *

          - * One or more bam files (with proper headers) to be analyzed for coverage statistics - *

          - *

          - *(Optional) A REFSEQ Rod to aggregate coverage to the gene level - *

          - * (for information about creating the REFSEQ Rod, please consult the online documentation) - *

          + *
            + *
          • One or more bam files (with proper headers) to be analyzed for coverage statistics
          • + *
          • (Optional) A REFSEQ file to aggregate coverage to the gene level (for information about creating the REFSEQ Rod, please consult the online documentation)
          • + *
          + *

          Output

          *

          * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: - *

          - * - no suffix: per locus coverage - *

          - * - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases - *

          - * - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases - *

          - * - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval - *

          - * - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples - *

          - * - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene - *

          - * - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples - *

          - * - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases - *

          - * - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases *

          + *
            + *
          • no suffix: per locus coverage
          • + *
          • _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
          • + *
          • _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
          • + *
          • _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
          • + *
          • _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
          • + *
          • _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
          • + *
          • _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
          • + *
          • _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
          • + *
          • _cumulative_coverage_proportions: proportions of loci with >= X coverage, aggregated over all bases
          • + *
          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T DepthOfCoverage \
          + *   -R reference.fasta \
            *   -o file_name_base \
            *   -I input_bams.list \
            *   [-geneList refSeq.sorted.txt] \
          @@ -180,7 +171,7 @@ public class DepthOfCoverage extends LocusWalkerInput
          @@ -54,11 +54,11 @@ import java.util.List;
            *  GC content calculations per interval.
            * 

          * - *

          Example

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T GCContentByInterval \
          - *   -R ref.fasta \
          + *   -R reference.fasta \
            *   -o output.txt \
            *   -L input.intervals
            * 
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/CoveredByNSamplesSites.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/CoveredByNSamplesSites.java index e596cdd70..4e3e91fc8 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/CoveredByNSamplesSites.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/CoveredByNSamplesSites.java @@ -50,30 +50,30 @@ import java.io.*; import java.util.Collection; /** - * Print intervals file with all the variant sites for which most of the samples have good coverage + * Report well-covered intervals * *

          - * CoveredByNSamplesSites is a GATK tool for filtering out sites based on their coverage. - * The sites that pass the filter are printed out to an intervals file. - * - * See argument defaults for what constitutes "most" samples and "good" coverage. These parameters can be modified from the command line. + * This tool evaluates whether sites are well-covered or not according to specific coverage quality parameters, and + * outputs a list of intervals that are considered well-covered, i.e. where most samples have good coverage. This is + * useful for masking out poorly-covered sites where we cannot expect meaningful results in downstream analyses. See + * argument defaults for what constitutes "most" samples and "good" coverage. *

          * *

          Input

          *

          - * A variant file and optionally min coverage and sample percentage values. + * A variant file and, optionally, minimum coverage and sample percentage values. *

          * *

          Output

          *

          - * An intervals file. + * A list of well-covered intervals. *

          * - *

          Example

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T CoveredByNSamplesSites \
          + *   -R reference.fasta \
            *   -V input.vcf \
            *   -out output.intervals \
            *   -minCov 15
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycle.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycle.java
          index b0b18abc6..129fa00ad 100644
          --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycle.java
          +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycle.java
          @@ -83,19 +83,18 @@ import java.io.PrintStream;
            *      
          *

          * - *

          Example

          + *

          Usage example

          *
          - *    java
          - *      -jar GenomeAnalysisTK.jar
          - *      -T ErrorRatePerCycle
          - *      -R human_g1k_v37.fasta
          - *      -I my_sequence_reads.bam
          + *    java -jar GenomeAnalysisTK.jar \
          + *      -T ErrorRatePerCycle \
          + *      -R reference.fasta \
          + *      -I my_sequence_reads.bam \
            *      -o error_rates.gatkreport.txt
            *  
          * *

          Caveat

          * - *

          Note that when it is run on paired-end sequence data, this tool only uses the first read in a pair.

          + *

          When it is run on paired-end sequence data, this tool only uses the first read in a pair.

          * * @author Kiran Garimella, Mark DePristo */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadGroupProperties.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadGroupProperties.java index ba4542768..987d1a0e1 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadGroupProperties.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadGroupProperties.java @@ -45,13 +45,15 @@ import java.util.HashMap; import java.util.Map; /** - * Emits a GATKReport containing read group, sample, library, platform, center, sequencing data, - * paired end status, simple read type name (e.g. 2x76) median insert size and median read length - * for each read group in every provided BAM file + * Collect statistics about read groups and their properties * - * Note that this walker stops when all read groups have been observed at least a few thousand times so that - * the median statistics are well determined. It is safe to run it WG and it'll finish in an appropriate - * timeframe. + *

          This tool emits a GATKReport containing read group, sample, library, platform, center, sequencing data, + * paired end status, simple read type name (e.g. 2x76), median insert size and median read length + * for each read group in every provided BAM file.

          + * + *

          Note that this walker stops when all read groups have been observed at least a few thousand times so that + * the median statistics are well determined. It is safe to run it on whole genome sequence data and expect it to + * finish in an appropriate timeframe.

          * *

          Input

          *

          @@ -86,14 +88,14 @@ import java.util.Map; *

          *

          * - *

          Examples

          + *

          Usage example

          *
          - *    java
          - *      -jar GenomeAnalysisTK.jar
          - *      -T ReadGroupProperties
          - *      -I example1.bam -I example2.bam etc
          - *      -R reference.fasta
          - *      -o example.gatkreport.txt
          + *    java -jar GenomeAnalysisTK.jar \
          + *      -T ReadGroupProperties \
          + *      -R reference.fasta \
          + *      -I example1.bam \
          + *      -I example2.bam \
          + *      -o readgroup_report.grp
            *  
          * * @author Mark DePristo diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java index 438b38e36..cfebdd29a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java @@ -44,10 +44,10 @@ import java.util.Map; import java.util.TreeMap; /** - * Outputs the read lengths of all the reads in a file. + * Collect read length statistics * *

          - * Generates a table with the read lengths categorized per sample. If the file has no sample information + * This tool generates a table with the read lengths categorized per sample. If the file has no sample information * (no read groups) it considers all reads to come from the same sample. *

          * @@ -59,16 +59,15 @@ import java.util.TreeMap; * *

          Output

          *

          - * A human/R readable table of tab separated values with one column per sample and one row per read. + * A human/R-readable table of tab-separated values with one column per sample and one row per read. *

          * - *

          Examples

          + *

          Usage example

          *
          - *    java
          - *      -jar GenomeAnalysisTK.jar
          - *      -T ReadLengthDistribution
          - *      -I example.bam
          - *      -R reference.fasta
          + *    java -jar GenomeAnalysisTK.jar \
          + *      -T ReadLengthDistribution \
          + *      -R reference.fasta \
          + *      -I example.bam \
            *      -o example.tbl
            *  
          * diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java index c6e451cd2..8ec22e5ea 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java @@ -51,41 +51,41 @@ import java.util.Set; /** - * Generates an alternative reference sequence over the specified interval. + * Generate an alternative reference sequence over the specified interval * - *

          - * Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). - * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + *

          Given a variant callset, this tool replaces the reference bases at variation sites with the bases supplied in the + * corresponding callset records. Additionally, it allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.

          * - * The output format can be partially controlled using the provided command-line arguments. + *

          The output format can be partially controlled using the provided command-line arguments. * Specify intervals with the usual -L argument to output only the reference bases within your intervals. * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a - * separate fasta sequence (named numerically in order). - * - * Several important notes: - * 1) if there are multiple variants that start at a site, it chooses one of them randomly. - * 2) when there are overlapping indels (but with different start positions) only the first will be chosen. - * 3) this tool works only for SNPs and for simple indels (but not for things like complex substitutions). - * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). + * separate fasta sequence (named numerically in order).

          * + *

          Caveats

          + *
            + *
          • If there are multiple variants that start at a site, it chooses one of them randomly.
          • + *
          • When there are overlapping indels (but with different start positions) only the first will be chosen.
          • + *
          • This tool works only for SNPs and for simple indels (but not for things like complex substitutions).
          • + *
          + *

          Input

          *

          - * The reference, requested intervals, and any number of variant rod files. + * The reference, requested intervals, and any number of variant ROD files. *

          * *

          Output

          *

          - * A fasta file representing the requested intervals. + * A FASTA file representing the requested intervals. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T FastaAlternateReferenceMaker \
          + *   -R reference.fasta \
            *   -o output.fasta \
            *   -L input.intervals \
          - *   --variant input.vcf \
          + *   -V input.vcf \
            *   [--snpmask mask.vcf]
            * 
          * diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java index 562f00bf4..08ab3019a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java @@ -40,13 +40,13 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import java.io.PrintStream; /** - * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. + * Create a subset of a FASTA reference sequence * - *

          - * The output format can be partially controlled using the provided command-line arguments. - * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + *

          This tool creates a new reference in FASTA format consisting of only those positions or intervals + * provided in the input data set. The output format can be partially controlled using the provided command-line + * arguments. Specify intervals with the usual -L argument to output only the reference bases within your intervals. * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a - * separate fasta sequence (named numerically in order). + * separate fasta sequence (named numerically in order).

          * *

          Input

          *

          @@ -58,11 +58,11 @@ import java.io.PrintStream; * A fasta file representing the requested intervals. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T FastaReferenceMaker \
          + *   -R reference.fasta \
            *   -o output.fasta \
            *   -L input.intervals
            * 
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java index e5178dd74..7215e2b7b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java @@ -52,11 +52,11 @@ import java.io.PrintStream; * Base counts are written to file if an output file name is given (with -o), otherwise output to stdout. *

          * - *

          Example

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T FastaStats \
          - *   -R ref.fasta \
          + *   -R reference.fasta \
            *   [-o output.txt]
            * 
          */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java index 33009b5b6..42276599f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java @@ -49,11 +49,12 @@ import java.util.*; /** - * Filters variant calls using a number of user-selectable, parameterizable criteria. + * Filter variant calls based on INFO and FORMAT annotations * *

          - * VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria. - * Records are hard-filtered by changing the value in the FILTER field to something other than PASS. + * This tool is designed for hard-filtering variant calls based on certain criteria. + * Records are hard-filtered by changing the value in the FILTER field to something other than PASS. Filtered records + * will be preserved in the output unless their removal is requested in the command line.

          * *

          Input

          *

          @@ -65,11 +66,11 @@ import java.util.*; * A filtered VCF. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T VariantFiltration \
          + *   -R reference.fasta \
            *   -o output.vcf \
            *   --variant input.vcf \
            *   --filterExpression "AB < 0.2 || MQ0 > 50" \
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCMappingQualityFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCMappingQualityFilter.java
          index ce6fe0633..f82985fd0 100644
          --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCMappingQualityFilter.java
          +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCMappingQualityFilter.java
          @@ -32,7 +32,27 @@ import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
           import org.broadinstitute.gatk.engine.filters.ReadFilter;
           
           /**
          - * Filter out reads with low mapping qualities.
          + * Filter out reads with low mapping qualities for HaplotypeCaller
          + *
          + * 

          This filter is applied by default for HaplotypeCaller and is designed to ensure that only reads that are likely + * to be informative will be used in the reassembly process. It performs the same basic function as the regular + * MappingQualityFilter, but it is used at specific points in the operation of HC where it is helpful + * to be able to apply a different quality threshold from the general case.

          + * + *

          Usage example

          + * + *

          Set the HC-specific mapping quality filter to filter out reads with MAPQ < 10

          + *
          + *     java -jar GenomeAnalysisTK.jar \
          + *         -T HaplotypeCaller \
          + *         -R reference.fasta \
          + *         -I input.bam \
          + *         -o output.vcf \
          + *         -mmq 10
          + * 
          + * + *

          Note that the HCMappingQuality filter itself does not need to be specified in the command line because it is set + * automatically for HaplotypeCaller.

          * * @author mdepristo */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDocumentationHandler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDocumentationHandler.java index bd31f09d8..b4e586fb3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDocumentationHandler.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDocumentationHandler.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.tools.walkers.help; import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang.StringEscapeUtils; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; @@ -38,10 +39,9 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import java.lang.annotation.Annotation; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public class WalkerDocumentationHandler extends GenericDocumentationHandler { private final static String CMDLINE_GATK_URL = HelpConstants.GATK_DOCS_URL + "org_broadinstitute_gatk_engine_CommandLineGATK.php"; @@ -122,7 +122,15 @@ public class WalkerDocumentationHandler extends GenericDocumentationHandler { for (Method classMethod : myClass.getMethods()) { if (classMethod.toString().contains("getDescriptions") && classMethod.toString().contains("annotator")) { try { - return classMethod.invoke(instance); + String headerLine = (classMethod.invoke(instance)).toString(); + Pattern p = Pattern.compile("(INFO=<.*?>|FORMAT=<.*?>)"); + Matcher m = p.matcher(headerLine); + List annotLines = new 
ArrayList<>(); + while (m.find()) { + annotLines.add(StringEscapeUtils.escapeHtml(m.group())); + System.out.println("found "+m.group()); + } + return annotLines; } catch (IllegalArgumentException e) { } catch (IllegalAccessException e) { } catch (InvocationTargetException e) { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java index fd876991f..b76b9ff9c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java @@ -118,17 +118,17 @@ import java.util.Arrays; *

          * *

          Input

          - *

          A BAM file conatining your aligned sequence data and a pileup file generated by Samtools covering the region you + *

          A BAM file containing your aligned sequence data and a pileup file generated by Samtools covering the region you * want to examine.

          * *

          Output

          *

          A text file listing mismatches between the input pileup and the GATK's internal pileup. If there are no mismatches, the output file is empty.

          * - *

          Example

          + *

          Usage example

          *
            * java -jar GenomeAnalysisTK.jar \
            *   -T CheckPileup \
          - *   -R ref.fasta \
          + *   -R reference.fasta \
            *   -I your_data.bam \
            *   --pileup:SAMPileup pileup_file.txt \
            *   -L chr1:257-275 \
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java
          index 8ba387ca5..023d103e7 100644
          --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java
          +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java
          @@ -36,7 +36,7 @@ import org.broadinstitute.gatk.utils.help.HelpConstants;
           import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
           
           /**
          - * Walks over the input data set, calculating the number of bases seen for diagnostic purposes.
          + * Count the number of bases in a set of reads
            *
            * 

          Input

          *

          @@ -45,13 +45,14 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; * *

          Output

          *

          - * Number of bases seen. + * Number of bases seen. If an output file name is provided, then the result will be written to that file. + * Otherwise it will be sent to standard console output. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
          + *   -R reference.fasta \
            *   -T CountBases \
            *   -I input.bam \
            *   [-L input.intervals]
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java
          index 096ce70dc..443196cff 100644
          --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java
          +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java
          @@ -45,7 +45,7 @@ import java.util.Collections;
           import java.util.List;
           
           /**
          - * Count contiguous regions in an interval list.
          + * Count contiguous regions in an interval list
            *
            * 

          When the GATK reads in intervals from an intervals list, any intervals that overlap each other get merged into * a single interval spanning the original ones. For example, if you have the following intervals: @@ -63,7 +63,7 @@ import java.util.List; * *

          Input

          *

          - * One or more rod files containing intervals to check. + * One or more ROD files containing intervals to check. *

          * *

          Output

          @@ -73,12 +73,12 @@ import java.util.List; * * You can use the -numOverlaps argument to find out how many cases you have of a specific number of overlaps. * - *

          Example

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T CountIntervals \
          - *   -R ref.fasta \
          - *   -0 output.txt \
          + *   -R reference.fasta \
          + *   -o output.txt \
            *   -check intervals.list
            * 
          */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java index 5987199b1..5a0ec3370 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java @@ -39,10 +39,10 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import java.io.PrintStream; /** - * Walks over the input data set, calculating the total number of covered loci for diagnostic purposes. + * Count the total number of covered loci * *

          - * This is the simplest example of a locus walker. + * This tool counts the number of loci (positions in the reference) that are covered by sequence data. *

          * *

          Input

          @@ -56,11 +56,11 @@ import java.io.PrintStream; * Otherwise it will be sent to standard console output. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T CountLoci \
          - *   -R ref.fasta \
          + *   -R reference.fasta \
            *   -I input.bam \
            *   -o output.txt \
            *   [-L input.intervals]
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java
          index 293cfd0ed..55424f67b 100644
          --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java
          +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java
          @@ -41,7 +41,9 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
           import java.io.PrintStream;
           
           /**
          - * Walks over the input data set, calculating the number of reads seen from male samples for diagnostic purposes.
          + * Count the number of reads seen from male samples
          + *
          + * 

          This tool counts the number of sequence reads seen from samples that are male according to the sample metadata.

          * *

          Input

          *

          @@ -50,14 +52,15 @@ import java.io.PrintStream; * *

          Output

          *

          - * Number of reads seen from male samples. + * Number of reads seen from male samples. If an output file name is provided, then the result will be written to that file. + * Otherwise it will be sent to standard console output. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T CountMales \
          - *   -R ref.fasta \
          + *   -R reference.fasta \
            *   -I samples.bam \
            *   -o output.txt
            * 
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java index 3e9e9db39..c81f7b9ac 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java @@ -51,9 +51,9 @@ import java.io.PrintStream; import java.util.*; /** - * Prints out counts of the number of reference ordered data objects encountered. + * Count the number of ROD objects encountered * - *

          CountRods is a RODWalker, and so traverses the data by ROD. For example if the ROD passed to it is a VCF file, + *

          CountRODs is a RODWalker, and so traverses the data by ROD (reference ordered data). For example, if the ROD passed to it is a VCF file, * it will count the variants in the file.

          * *

          Note that this tool is different from CountRodsByRef which is a RefWalker, and so traverses the data by @@ -66,19 +66,19 @@ import java.util.*; * *

          Input

          *

          - * One or more rod files. + * One or more ROD files. *

          * *

          Output

          *

          - * Number of rods seen. + * Number of RODs seen. *

          * - *

          Example

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T CountRODs \
          - *   -R ref.fasta \
          + *   -R reference.fasta \
            *   -o output.txt \
            *   --rod input.vcf
            * 
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java index 8161d4387..c359bf1c3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java @@ -43,7 +43,7 @@ import java.util.Collections; import java.util.List; /** - * Prints out counts of the number of reference ordered data objects encountered along the reference. + * Count the number of ROD objects encountered along the reference * *

          CountRodsByRef is a RefWalker, and so traverses the data by position along the reference. It counts ROD * elements (such as, but not limited to, variants) found at each position or within specific intervals if you use @@ -58,19 +58,19 @@ import java.util.List; * *

          Input

          *

          - * One or more rod files. + * One or more ROD files. *

          * *

          Output

          *

          - * Number of rods seen. + * Number of RODs seen. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T CountRODsByRef \
          - *   -R ref.fasta \
          + *   -R reference.fasta \
            *   -o output.txt \
            *   --rod input.vcf
            * 
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java index ccb714b45..d5f424b6f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java @@ -45,7 +45,9 @@ import java.util.HashMap; import java.util.Map; /** - * Walks over the input data set, counting the number of read events (from the CIGAR operator) + * Count the number of read events + * + *

          This tool counts the number of "events" (I, D, M etc) encountered in the CIGAR strings of the sequence reads.

          * *

          Input

          *

          @@ -55,12 +57,13 @@ import java.util.Map; *

          Output

          *

          * Number of read events for each category, formatted as a GATKReport table. + *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T CountReadEvents \
          - *   -R ref.fasta \
          + *   -R reference.fasta \
            *   -I input.bam \
            *   -o output.grp \
            *   [-L input.intervals]
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java
          index 6503766b6..369a5878e 100644
          --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java
          +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java
          @@ -37,11 +37,12 @@ import org.broadinstitute.gatk.utils.help.HelpConstants;
           import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
           
           /**
          - * Walks over the input data set, calculating the number of reads seen for diagnostic purposes.
          + * Count the number of reads
            *
            * 

          - * Can also count the number of reads matching a given criterion using read filters (see the - * --read-filter command line argument). Simplest example of a read-backed analysis. + * This is especially useful in combination with read filters (see the --read-filter command line argument) which + * allow you to count reads matching specific criteria (e.g. read group tags or quality parameters). + *

          * * *

          Input

          @@ -51,13 +52,13 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; * *

          Output

          *

          - * Number of reads seen. + * Number of reads seen. This is output to the terminal/stdout. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
          + *   -R reference.fasta \
            *   -T CountReads \
            *   -I input.bam \
            *   [-L input.intervals]
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java
          index 10094ac6a..b569a0a6c 100644
          --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java
          +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java
          @@ -44,7 +44,9 @@ import java.io.PrintStream;
           import java.util.List;
           
           /**
          - * Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips
          + * Count the number of reads ending in insertions, deletions or soft-clips
          + *
          + * 

          This tool reports the number of reads where the end bases do not map to the reference sequence.

          * *

          Input

          *

          @@ -56,13 +58,13 @@ import java.util.List; * Number of reads ending in each category. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T CountTerminusEvent \
          - *   -o output.txt \
          + *   -R reference.fasta \
            *   -I input.bam \
          + *   -o output.txt \
            *   [-L input.intervals]
            * 
          */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java index 7bd51249a..57cf4d59a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java @@ -42,9 +42,9 @@ import java.text.DecimalFormat; import java.text.NumberFormat; /** - * A reimplementation of the 'samtools flagstat' subcommand in the GATK + * Collect statistics about sequence reads based on their SAM flags * - *

          This tool walks over all input data, accumulating statistics such as total number of reads, + *

          This tool emulates the behavior of 'samtools flagstat'. It collects statistics such as total number of reads, * reads with QC failure flag set, number of duplicates, percentage mapped, etc.

          * *

          Input

          @@ -57,11 +57,11 @@ import java.text.NumberFormat; * Resulting stats are written to file if an output file name is given (with -o), otherwise output to stdout. *

          * - *

          Example

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T FlagStat \
          - *   -R ref.fasta \
          + *   -R reference.fasta \
            *   -I reads.bam \
            *   [-o output.txt]
            * 
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java index 8b59812bb..db6199951 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java @@ -46,9 +46,10 @@ import java.util.Collections; import java.util.List; /** - * Emulates the samtools pileup command to print aligned reads + * Print read alignments in Pileup-style format * - *

          Prints the alignment in something similar to the Samtools pileup format (see the + *

          This tool emulates the 'samtools pileup' command. It prints the alignment in a format that is very similar to + * the Samtools pileup format (see the * Pileup format documentation for more details about * the original format). There is one line per genomic position, listing the chromosome name, coordinate, reference * base, read bases, and read qualities. In addition to these default fields, additional information can be added to @@ -58,7 +59,6 @@ import java.util.List; *

            *  samtools pileup -f in.ref.fasta -l in.site_list input.bam
            * 
          - * *

          Input

          *

          @@ -70,12 +70,12 @@ import java.util.List; * Alignment of reads formatted in the Pileup style. *

          * - *

          Example

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T Pileup \
          - *   -R exampleFASTA.fasta \
          - *   -I exampleBAM.bam \
          + *   -R reference.fasta \
          + *   -I my_reads.bam \
            *   -L chr1:257-267
            *   -o output.txt
            * 
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java index ca9a76ab8..9c71f0934 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java @@ -40,8 +40,11 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import java.io.PrintStream; /** - * Prints out all of the RODs in the input data set. Data is rendered using the toString() method - * of the given ROD. + * Print out all of the RODs in the input data set + * + *

          This tool reports what RODs (reference ordered data sets) are contained in a given input.

          + * + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class PrintRODs extends RodWalker { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java index 0b325e6a4..0a0b9b6cd 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java @@ -51,14 +51,15 @@ import java.io.PrintStream; * *

          Output

          *

          - * If ok, nothing, else will throw an exception at the site where there's been a problem + * If the reference is fully valid, the run will complete successfully. If not, an error message will be produced + * at the site where the program encountered a problem. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          - *   -T QCRef
          + * java -jar GenomeAnalysisTK.jar \
          + *   -T QCRef \
          + *   -R reference.fasta
            * 
          * */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java index 23ea65b9d..a2c3f796e 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java @@ -48,23 +48,21 @@ import java.io.PrintStream; import java.util.Arrays; /** - * Read clipping statistics for all reads. + * Collect read clipping statistics * - * Walks over the input reads, printing out statistics about the read length, number of clipping events, and length - * of the clipping to the output stream. - * - * Note: Ignores N's in the Cigar string. + *

          This tool collects statistics about the read length, number of clipping events, and length + * of the clipping in all reads in the dataset.

          * *

          Input

          - * One or more BAM files + * One or more BAM files. * *

          Output

          - * A simple tabulated text file with read length and clipping statistics for every read (or every N reads if the "skip" - * option is used) + * A simple tabulated text file with read length and clipping statistics for every read (or every given number of reads + * if the "skip" option is used). + * + *

          Caveat

          + *

          This tool ignores "N" events in the CIGAR string.

          * - * User: depristo - * Date: May 5, 2010 - * Time: 12:16:41 PM */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java index 1bbc3a2d6..cafaa82c5 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java @@ -111,7 +111,7 @@ import java.util.regex.Pattern; *
          *

          * - *

          Example

          + *

          Usage example

          *
            *   java -jar GenomeAnalysisTK.jar \
            *     -T ClipReads \
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java
          index 008a14842..2f609facf 100644
          --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java
          +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java
          @@ -52,17 +52,18 @@ import java.io.File;
           import java.util.*;
           
           /**
          - * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
          + * Write out sequence read data (for filtering, merging, subsetting etc)
            *
            * 

          - * PrintReads can dynamically merge the contents of multiple input BAM files, resulting - * in merged output sorted in coordinate order. Can also optionally filter reads based on the - * --read_filter command line argument. + * PrintReads is a generic utility tool for manipulating sequencing data in SAM/BAM format. It can dynamically + * merge the contents of multiple input BAM files, resulting in merged output sorted in coordinate order. It can + * also optionally filter reads based on various read properties such as read group tags using the `--read_filter/-rf` + * command line argument (see documentation on read filters for more information). *

          * *

          * Note that when PrintReads is used as part of the Base Quality Score Recalibration workflow, - * it takes the --BQSR engine argument, which is listed under Inherited Arguments > CommandLineGATK below. + * it takes the `--BQSR` engine argument, which is listed under Inherited Arguments > CommandLineGATK below. *

          * *

          Input

          @@ -75,30 +76,31 @@ import java.util.*; * A single processed bam file. *

          * - *

          Examples

          + *

          Usage examples

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * // Prints all reads that have a mapping quality above zero
          + * java -jar GenomeAnalysisTK.jar \
            *   -T PrintReads \
          - *   -o output.bam \
          + *   -R reference.fasta \
            *   -I input1.bam \
            *   -I input2.bam \
          + *   -o output.bam \
            *   --read_filter MappingQualityZero
            *
            * // Prints the first 2000 reads in the BAM file
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T PrintReads \
          - *   -o output.bam \
          + *   -R reference.fasta \
            *   -I input.bam \
          + *   -o output.bam \
            *   -n 2000
            *
            * // Downsamples BAM file to 25%
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T PrintReads \
          - *   -o output.bam \
          + *   -R reference.fasta \
            *   -I input.bam \
          + *   -o output.bam \
            *   -dfrac 0.25
            * 
          * @@ -142,11 +144,11 @@ public class PrintReads extends ReadWalker impleme /** * Erase all extra attributes in the read but keep the read group information */ - @Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false) + @Argument(fullName="simplify", shortName="s", doc="Simplify all reads", required=false) public boolean simplifyReads = false; @Hidden - @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) + @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="Don't output a program tag", required = false) public boolean NO_PG_TAG = false; List readTransformers = Collections.emptyList(); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java index 7e05a10c4..0e23fac95 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java @@ -50,47 +50,49 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** - * Utility tool to blindly strip base adaptors. Main application is for FASTQ/unaligned BAM pre-processing where libraries - * have very short inserts, and hence a substantial part of the sequencing data will have adaptor sequence present. - *

          - * By design, tool will only work for Illumina-like library constructs, where the typical library architecture is: - * [Adaptor 1]-[Genomic Insert]-[Adaptor 2 (index/barcode)] - *

          - * It is assumed that when data is paired, one read will span the forward strand and one read will span the reverse strand. - * Hence, when specifying adaptors they should be specified as both forward and reverse-complement to make sure they're removed in all cases. + * Utility tool to blindly strip base adaptors + * + *

          This tool is mainly intended to be applied to FASTQ/unaligned BAM pre-processing where libraries + * have very short inserts, and hence a substantial part of the sequencing data will have adaptor sequence present. By + * design, the tool will only work for Illumina-like library constructs, where the typical library architecture is: + * [Adaptor 1]-[Genomic Insert]-[Adaptor 2 (index/barcode)]

          + * + *

          We assume that when data is paired, one read will span the forward strand and one read will span the reverse strand. + * Hence, adaptors should be specified as both forward and reverse-complement to ensure they are removed in all cases. * By design, as well, "circular" constructions where a read can have an insert, then adaptor, then more genomic insert, are not supported. * When an adaptor is detected, all bases downstream from it (i.e. in the 3' direction) will be removed. * Adaptor detection is carried out by looking for overlaps between forward and reverse reads in a pair. * If a sufficiently high overlap is found, the insert size is computed and if insert size < read lengths adaptor bases are removed from reads. + *

          * - * Advantages over ReadClipper: - * - No previous knowledge of adaptors or library structure is necessary + *

          Advantage over ReadClipper: No previous knowledge of adaptors or library structure is necessary.

          * - * Advantages over 3rd party tools like SeqPrep: - * - Can do BAM streaming instead of having to convert to fastq - * - No need to merge reads - merging reads can have some advantages, but complicates downstream processing and loses information that can be used, - * e.g. in variant calling - *

          + *

          Advantages over 3rd party tools like SeqPrep:

          + *
            + *
          • Can do BAM streaming instead of having to convert to fastq
          • + *
          • No need to merge reads; merging reads can have some advantages, but complicates downstream processing and loses information that can be used, + * e.g. in variant calling
          • + *
          * - *

          Input

          + *

          Input

          *

          * The input read data in BAM format. Read data MUST be in query name ordering as produced, for example with Picard's FastqToBam + *

          * - *

          Output

          + *

          Output

          *

          * A merged BAM file with unaligned reads *

          * - *

          Examples

          + * *
          - * java -Xmx4g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
          + *   -R reference.fasta \
            *   -T ReadAdaptorTrimmer \
            *   -I my_reads.bam \
          - *   -R resources/Homo_sapiens_assembly18.fasta \
            *   -o trimmed_Reads.bam
            * 
          */ - @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.READ) public class ReadAdaptorTrimmer extends ReadWalker, SAMFileWriter> implements NanoSchedulable { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java index 500ce20d2..b015e6dc8 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java @@ -51,8 +51,29 @@ import java.util.List; import java.util.Map; /** - * Divides the input data set into separate BAM files, one for each sample in the input data set. The split - * files are named concatenating the sample name to the end of the provided outputRoot command-line argument. + * Split a BAM file by sample + * + *

          This tool divides the input data set into separate BAM files, one for each sample in the input data set. The split + * files are named by concatenating the sample name to the end of the provided outputRoot command-line argument.

          + * + *

          Input

          + *

          + * A single BAM file. + *

          + * + *

          Output

          + *

          + * A separate BAM file for each sample. + *

          + * + *

          Usage example

          + *
          + * java -jar GenomeAnalysisTK.jar \
          + *   -T SplitSamFile \
          + *   -R reference.fasta \
          + *   -I input.bam \
          + *   --outputRoot myproject_
          + * 
          */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) @WalkerName("SplitSamFile") diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounter.java index d47170514..9812d1001 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounter.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounter.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.tools.walkers.rnaseq; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.filters.DuplicateReadFilter; import org.broadinstitute.gatk.engine.walkers.DisabledReadFilters; import org.broadinstitute.gatk.engine.walkers.Downsample; @@ -36,6 +37,8 @@ import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.downsampling.DownsampleType; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; +import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; @@ -44,33 +47,21 @@ import java.io.PrintStream; import java.util.List; /** - * Calculate allele counts for allele-specific expression analysis + * Calculate read counts per allele for allele-specific expression analysis * *

          - * This tool calculates allele counts at a set of given loci after applying filters that are tuned for enabling + * This tool calculates allele counts at a set of positions after applying filters that are tuned for enabling * allele-specific expression (ASE) analysis. The filters operate on mapping quality, base quality, depth of coverage, * overlapping paired reads and deletions overlapping the position. All thresholds and options are controlled by * command-line arguments. *

          * - *

          Notes

          - *
            - *
          • Like most GATK tools, this tools filters out duplicate reads by default. However, some ASE methods - * recommend including duplicate reads in the analysis, so the DuplicateReads filter can be disabled using the - * `-drf DuplicateReads` flag in the command-line.
          • - *
          - *

          Caveats

          - *
            - *
          • This tool will only process biallelic sites. If your callset contains multiallelic sites, they will be ignored. - * Optionally, you can subset your callset to just biallelic variants using e.g. - * SelectVariants - * with the option `-restrictAllelesTo BIALLELIC`.
          • - *
          *

          Input

          *
            *
          • BAM files (with proper headers) to be analyzed for ASE
          • *
          • A VCF file with specific sites to process.
          • - *

            + *
          + * *

          Output

          *

          * A table of allele counts at the given sites. By default, it is formatted as a tab-delimited text file @@ -78,12 +69,12 @@ import java.util.List; * a downstream tool developed for allele-specific expression analysis. *

          * - *

          Examples

          + *

          Usage example

          *
            * java -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + *   -R reference.fasta \
            *   -T ASEReadCounter \
          - *   -o file_name \
          + *   -o file_name.csv \
            *   -I input.bam \
            *   -sites sites.vcf \
            *   -U ALLOW_N_CIGAR_READS \
          @@ -92,7 +83,23 @@ import java.util.List;
            *   [--minBaseQuality 2] \
            *   [-drf DuplicateRead]
            * 
          + * + *

          Note

          + *
            + *
          • Like most GATK tools, this tool filters out duplicate reads by default. However, some ASE methods + * recommend including duplicate reads in the analysis, so the DuplicateRead filter can be disabled using the + * "-drf DuplicateRead" flag on the command line.
          • + *
          + *

          Caveat

          + *
            + *
          • This tool will only process biallelic sites. If your callset contains multiallelic sites, they will be ignored. + * Optionally, you can subset your callset to just biallelic variants using e.g. + * SelectVariants + * with the option "-restrictAllelesTo BIALLELIC".
          • + *
          + * */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 10000) //@DisabledReadFilters({DuplicateReadFilter.class}) //currently can be disabled using the command line argument -drf DuplicateRead public class ASEReadCounter extends LocusWalker { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java index 38027af47..dea80d112 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java @@ -76,6 +76,7 @@ import java.util.*; * degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in * evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation * and stratification modules. + *

          * *

          Input

          *

          @@ -86,8 +87,9 @@ import java.util.*; *

          * Evaluation tables detailing the results of the eval modules which were applied. * For example: + *

          *
          - * output.eval.gatkreport:
          + * output.eval.grp:
            * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
            * CountVariants  CompRod   CpG      EvalRod  JexlExpression  Novelty  nProcessedLoci  nCalledLoci  nRefLoci  nVariantLoci  variantRate ...
            * CountVariants  dbsnp     CpG      eval     none            all      65900028        135770       0         135770        0.00206024  ...
          @@ -103,12 +105,12 @@ import java.util.*;
            * 
          *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T VariantEval \
          - *   -o output.eval.gatkreport \
          + *   -R reference.fasta \
          + *   -o output.eval.grp \
            *   --eval:set1 set1.vcf \
            *   --eval:set2 set2.vcf \
            *   [--comp comp.vcf]
          @@ -116,9 +118,11 @@ import java.util.*;
            *
            * 

          Caveat

          * - *

          Some stratifications and evaluators are incompatible with each other due to their respective memory requirements, such as AlleleCount and VariantSummary, or Sample and VariantSummary. - * If you specify such a combination, the program will output an error message and ask you to disable one of these options. - * We do not currently provide an exhaustive list of incompatible combinations, so we recommend trying out combinations that you are interested in on a dummy command line, to rapidly ascertain whether it will work or not.

          + *

          Some stratifications and evaluators are incompatible with each other due to their respective memory requirements, + * such as AlleleCount and VariantSummary, or Sample and VariantSummary. If you specify such a combination, the program + * will output an error message and ask you to disable one of these options. We do not currently provide an exhaustive + * list of incompatible combinations, so we recommend trying out combinations that you are interested in on a dummy + * command line, to rapidly ascertain whether they will work or not.

          * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java index 51f0c40bc..b538225ef 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java @@ -89,23 +89,24 @@ import java.util.*; * A combined VCF. *

          * - *

          Examples

          + *

          Usage examples

          *   *

          Merge two separate callsets

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T CombineVariants \
          + *   -R reference.fasta \
            *   --variant input1.vcf \
            *   --variant input2.vcf \
            *   -o output.vcf \
            *   -genotypeMergeOptions UNIQUIFY
            * 
          + * *

          Get the union of calls made on the same samples

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T CombineVariants \
          + *   -R reference.fasta \
            *   --variant:foo input1.vcf \
            *   --variant:bar input2.vcf \
            *   -o output.vcf \
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java
          index 6e8a9e7e9..d04e14ceb 100644
          --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java
          +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java
          @@ -47,13 +47,36 @@ import htsjdk.variant.variantcontext.VariantContext;
           import java.util.*;
           
           /**
          - * Filters a lifted-over VCF file for ref bases that have been changed.
          + * Filters a lifted-over VCF file for reference bases that have been changed
            *
          - * "Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process adjusts the position of the call to match the corresponding position on the target reference.
          - * For example, if you have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on the b37 reference, you need to liftover one of the callsets to the other reference.
          + * 

          "Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process + * adjusts the position of the call to match the corresponding position on the target reference. For example, if you + * have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on + * the b37 reference, you need to liftover one of the callsets to the other reference.

          * - * FilteredLiftedVariants is intended to be the second of two processing steps for the liftover process. The first step is to run LiftoverVariants on your VCF file. - * The second step is to run FilterLiftedVariants on the output of LiftoverVariants. This will produce valid well-behaved VCF files, where you'll see that the contig names in the header have all been correctly replaced. + *

          This tool is intended to be the second of two processing steps for the liftover process. The first step is to + * run LiftoverVariants on your VCF file. The second step is to run FilterLiftedVariants on the output of + * LiftoverVariants. This will produce valid well-behaved VCF files, where you'll see that the contig names in the + * header have all been correctly replaced.

          + * + *

          Input

          + *

          + * A lifted-over variant call set to filter. + *

          + * + *

          Output

          + *

          + * The filtered call set. + *

          + * + *

          Usage example

          + *
          + * java -jar GenomeAnalysisTK.jar \
          + *   -T FilterLiftedVariants \
          + *   -R reference.fasta \
          + *   -V liftedover_input.vcf \
          + *   -o filtered_output.vcf
          + * 
          * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java index a9e578058..5597a4c67 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java @@ -44,10 +44,10 @@ import java.io.PrintStream; import java.util.*; /** - * Genotype concordance (per-sample and aggregate counts and frequencies, NRD/NRS and site allele overlaps) between two callsets + * Genotype concordance between two callsets * *

          - * GenotypeConcordance takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, + * This tool takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, * and for each sample, the genotype-by-genotype counts (e.g. the number of sites at which a sample was * called homozygous-reference in the EVAL callset, but homozygous-variant in the COMP callset). It outputs these * counts as well as convenient proportions (such as the proportion of het calls in the EVAL which were called REF in @@ -192,7 +192,17 @@ import java.util.*; * NA12891 NO_CALL_HOM_VAR 0.000 * (...) *

          - + * + *

          Usage example

          + *
          + * java -jar GenomeAnalysisTK.jar \
          + *   -T GenotypeConcordance \
          + *   -R reference.fasta \
          + *   -eval test_set.vcf \
          + *   -comp truth_set.vcf \
          + *   -o output.grp
          + * 
          + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class GenotypeConcordance extends RodWalker>,ConcordanceMetrics> { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java index b80c1c4d9..099293cc2 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -57,18 +57,18 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; import java.util.*; /** - * Left-aligns indels from a variants file. + * Left-align indels in a variant callset * *

          * LeftAlignAndTrimVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. * Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged. - * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation. + * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation.

          * *

          Input

          *

          - * A variant set to left-align and trim. + * A variant call set to left-align and trim. *

          * *

          Output

          @@ -76,11 +76,11 @@ import java.util.*; * A left-aligned VCF. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T LeftAlignAndTrimVariants \
          + *   -R reference.fasta \
            *   --variant input.vcf \
            *   -o output.vcf
            * 
          diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java index f66daf254..15981d19a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java @@ -58,16 +58,43 @@ import java.io.File; import java.util.*; /** - * Lifts a VCF file over from one build to another. + * Lifts a VCF file over from one build to another * - * "Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process adjusts the position of the call to match the corresponding position on the target reference. - * For example, if you have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on the b37 reference, you need to liftover one of the callsets to the other reference. + *

          "Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process + * adjusts the position of the call to match the corresponding position on the target reference. For example, if you + * have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on + * the b37 reference, you need to liftover one of the callsets to the other reference.

          * - * LiftoverVariants is intended to be the first of two processing steps for the liftover process. - * The second step is to run FilterLiftedVariants on the output of LiftoverVariants. This will produce valid well-behaved VCF files, where you'll see that the contig names in the header have all been correctly replaced. + *

          LiftoverVariants is intended to be the first of two processing steps for the liftover process. + * The second step is to run FilterLiftedVariants on the output of LiftoverVariants. This will produce valid + * well-behaved VCF files, where you'll see that the contig names in the header have all been correctly replaced.

          + * + *

          Caveat

          + *

          To be clear, the VCF resulting from the LiftoverVariants run is not guaranteed to be valid according to the official specification. The file could + * possibly be mis-sorted and the header may not be complete. That is why you need to run FilterLiftedVariants on it.

          + * + *

          Input

          + *

          + * A variant call set to lift over, the sequence dictionary of the new reference build and the appropriate liftover + * chain file. + *

          + * + *

          Output

          + *

          + * The lifted-over call set. + *

          + * + *

          Usage example

          + *
          + * java -jar GenomeAnalysisTK.jar \
          + *   -T LiftoverVariants \
          + *   -R reference_hg19.fasta \
          + *   -V input_hg19.vcf \
          + *   -chain liftover_hg19_to_b37.txt \
          + *   -dict reference_b37.dict \
          + *   -o liftedover_output_b37.vcf
          + * 
          * - * To be clear, the VCF resulting from the LiftoverVariants run is not guaranteed to be valid according to the official specification. The file could - * possibly be mis-sorted and the header may not be complete. That is why you need to run FilterLiftedVariants on it. */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class LiftoverVariants extends RodWalker { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java index b14d2f5b3..7b08bef53 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java @@ -50,7 +50,33 @@ import java.io.File; import java.util.*; /** - * Takes a VCF file, randomly splits variants into two different sets, and outputs 2 new VCFs with the results. + * Randomly split variants into different sets + * + *

          This tool takes a VCF file, randomly splits variants into different sets, and writes the + * results to separate files. By default the tool splits the input into two new sets, but it can be made to output + * more than two separate call sets.

          + * + *

          Input

          + *

          + * A variant call set to split. + *

          + * + *

          Output

          + *

          + * The new callsets. + *

          + * + *

          Usage example

          + *
          + * java -jar GenomeAnalysisTK.jar \
          + *   -T RandomlySplitVariants \
          + *   -R reference.fasta \
          + *   -V input.vcf \
          + *   -o1 output_1.vcf \
          + *   -o2 output_2.vcf
          + * 
          + * + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class RandomlySplitVariants extends RodWalker { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java index 75f297c10..a55c2215e 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java @@ -51,8 +51,8 @@ import java.io.File; import java.util.*; /** - * Selects headers from a VCF source. - *

          + * Selects headers from a VCF source + * *

          * Often, a VCF containing many headers will need to be subset in order to facilitate certain formatting guidelines. * SelectHeaders can be used for this purpose. Given a single VCF file, one or more headers can be extracted from the @@ -65,44 +65,49 @@ import java.util.*; *

          *

          Output

          *

          - * A header selected VCF. + * A VCF with the selected headers. *

          - *

          - *

          Examples

          + * + *

          Usage examples

          + *

          Select only the FILTER, FORMAT, and INFO headers

          *
          - * Select only the FILTER, FORMAT, and INFO headers:
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectHeaders \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -hn FILTER \
            *   -hn FORMAT \
            *   -hn INFO
          + * 
          * - * Select only the FILTER, FORMAT, and INFO headers and add in the reference file names: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Select only the FILTER, FORMAT, and INFO headers and add in the reference file names

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectHeaders \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -hn FILTER \
            *   -hn FORMAT \
            *   -hn INFO \
            *   -irn \
            *   -iln
          + * 
          * - * Select only the FILTER, FORMAT, and INFO headers, plus any headers with SnpEff: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Select only the FILTER, FORMAT, and INFO headers, plus any headers with "SnpEff"

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectHeaders \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -hn FILTER \
            *   -hn FORMAT \
            *   -hn INFO \
            *   -he '.*SnpEff.*'
            * 
          + * */ @SuppressWarnings("unused") @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index 7658f042c..9b9738164 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -86,121 +86,141 @@ import java.util.*; * *

          Output

          *

          - * The name of the VCF file to which to write the selected subset of variants. + * A new VCF file containing the selected subset of variants. *

          * - *

          Examples

          + *

          Usage examples

          + *

          Select two samples out of a VCF with many samples

          *
          - * Select two samples out of a VCF with many samples:
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -sn SAMPLE_A_PARC \
            *   -sn SAMPLE_B_ACTG
          + * 
          * - * Select two samples and any sample that matches a regular expression: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Select two samples and any sample that matches a regular expression

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -sn SAMPLE_1_PARC \
            *   -sn SAMPLE_1_ACTG \
            *   -se 'SAMPLE.+PARC'
          + * 
          * - * Select any sample that matches a regular expression and sites where the QD annotation is more than 10: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Select any sample that matches a regular expression and sites where the QD annotation is more than 10

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -se 'SAMPLE.+PARC' \
            *   -select "QD > 10.0"
          + * 
          * - * Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default): - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default)

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -sn SAMPLE_1_ACTG \
            *   -env \
            *   -ef
          + * 
          * - * Select a sample, subset remaining alleles, but don't trim: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Select a sample, subset remaining alleles, but don't trim

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -sn SAMPLE_1_ACTG \
            *   -env \
            *   -noTrim
          + *
          * - * Select a sample and restrict the output vcf to a set of intervals: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Select a sample and restrict the output VCF to a set of intervals

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -L /path/to/my.interval_list \
            *   -sn SAMPLE_1_ACTG
          + * 
          * - * Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called in my dataset): - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Select all calls missed in my VCF, but present in HapMap (useful to take a look at why these variants weren't called in my dataset)

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant hapmap.vcf \
          + *   -R reference.fasta \
          + *   -V hapmap.vcf \
            *   --discordance myCalls.vcf \
            *   -o output.vcf \
            *   -sn mySample
          + * 
          * - * Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers): - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers)

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant myCalls.vcf \
          - *   --concordance hisCalls.vcf \
          + *   -R reference.fasta \
          + *   -V myCalls.vcf \
          + *   --concordance theirCalls.vcf \
            *   -o output.vcf \
            *   -sn mySample
          + * 
          * - * Generating a VCF of all the variants that are mendelian violations. The optional argument `-mvq` restricts the selection to sites that have a QUAL score of 50 or more: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Generate a VCF of all the variants that are Mendelian violations. The optional argument `-mvq` restricts the selection to sites that have a QUAL score of 50 or more

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -ped family.ped \
          - *   -mv \
          - *   -mvq 50 \
          + *   -mv -mvq 50 \
            *   -o violations.vcf
          + * 
          * - * Creating a set with 50% of the total number of variants in the variant VCF: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Create a set with 50% of the total number of variants in the variant VCF

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -fraction 0.5
          + * 
          * - * Select only indels from a VCF: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Select only indels from a VCF

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -selectType INDEL
          + * 
          * - * Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column): - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

          Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column)

          + *
          + * java -jar GenomeAnalysisTK.jar \
            *   -T SelectVariants \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf \
            *   -selectType SNP -selectType MNP \
            *   -restrictAllelesTo MULTIALLELIC
          - *
            * 
          * */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java index 6142bc08d..82a201091 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java @@ -49,10 +49,10 @@ import java.util.*; /** - * Validates a VCF file with an extra strict set of criteria. + * Validate a VCF file with an extra strict set of criteria * *

          - * ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it. + * This tool is designed to validate much of the information inside a VCF file. * In addition to standard adherence to the VCF specification, this tool performs extra strict validations to ensure * the information contained within the file is correct. These include: *

          @@ -80,37 +80,33 @@ import java.util.*; * A variant set to validate using -V or --variant as shown below. *

          * - *

          Examples

          - * - *

          To perform VCF format and all strict validations:

          + *

          Usage examples

          * + *

          To perform VCF format tests and all strict validations

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T ValidateVariants \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   --dbsnp dbsnp.vcf
            * 
          * - *

          To perform only VCF format tests:

          - * + *

          To perform only VCF format tests

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T ValidateVariants \
          - *   --validationTypeToExclude ALL \
          - *   --variant input.vcf
          + *   -R reference.fasta \
          + *   -V input.vcf \
          + *   --validationTypeToExclude ALL
            * 
          * - *

          To perform all validations except the strict ALLELE validation:

          - * + *

          To perform all validations except the strict ALLELE validation

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T ValidateVariants \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   --validationTypeToExclude ALLELES
          - *   --variant input.vcf \
          - *   --dbsnp dbsnp.vcf
            * 
          * */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java index c52c408a2..307b78289 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java @@ -49,13 +49,13 @@ import htsjdk.variant.variantcontext.VariantContextBuilder; import java.util.*; /** - * Annotates a validation (from Sequenom for example) VCF with QC metrics (HW-equilibrium, % failed probes) + * Annotate a validation VCF with QC metrics * *

          - * The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes). + * This tool is intended for vetting/assessing validation data (containing genotypes). * The tool produces a VCF that is annotated with information pertaining to plate quality control and by * default is soft-filtered by high no-call rate or low Hardy-Weinberg probability. - * If you have .ped files, please first convert them to VCF format. + * If you have .ped files, please first convert them to VCF format.

          * *

          Input

          *

          @@ -65,6 +65,7 @@ import java.util.*; *

          Output

          *

          * An annotated VCF. Additionally, a table like the following will be output: + *

          *
            *     Total number of samples assayed:                  185
            *     Total number of records processed:                152
          @@ -74,14 +75,13 @@ import java.util.*;
            *     Number of records passing all filters:            106 (69%)
            *     Number of passing records that are polymorphic:   98 (92%)
            * 
          - *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T VariantValidationAssessor \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf
            * 
          * diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java index b9954221f..0873d5b94 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java @@ -48,35 +48,37 @@ import htsjdk.variant.vcf.VCFHeaderLine; import java.util.*; /** - * Takes alleles from a variants file and breaks them up (if possible) into more basic/primitive alleles. + * Simplify multi-nucleotide variants (MNPs) into more basic/primitive alleles. * - *

          - * For now this tool modifies only multi-nucleotide polymorphisms (MNPs) and leaves SNPs, indels, and complex substitutions as is, - * although one day it may be extended to handle the complex substitution case. + *

          This tool will take an MNP (e.g. ACCCA -> TCCCG) and break it up into separate records for each component + * part (A->T and A->G).

          * - * This tool will take an MNP (e.g. ACCCA -> TCCCG) and break it up into separate records for each component part (A-T and A->G). - * - * Note that this tool modifies only bi-allelic variants. - * - *

          Input

          + *

          Input

          *

          * A variant set with any type of alleles. *

          * - *

          Output

          + *

          Output

          *

          * A VCF with alleles broken into primitive types. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T VariantsToAllelicPrimitives \
          - *   --variant input.vcf \
          + *   -R reference.fasta \
          + *   -V input.vcf \
            *   -o output.vcf
            * 
          * + *

          Caveats

          + *
            + *
          • For now this tool modifies only multi-nucleotide polymorphisms (MNPs) and leaves SNPs, indels, and + * complex substitutions as is, although one day it may be extended to handle the complex substitution case.
          • + *
          • This tool modifies only bi-allelic variants.
          • + *
          + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class VariantsToAllelicPrimitives extends RodWalker { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java index d7090235d..b57f187b9 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java @@ -50,7 +50,32 @@ import java.io.*; import java.util.*; /** - * Converts a VCF file to a binary plink Ped file (.bed/.bim/.fam) + * Convert VCF to binary pedigree file + * + *

          This tool takes a VCF and produces a binary pedigree as used by + * PLINK, consisting of three associated files (.bed/.bim/.fam).

          + * + *

          Inputs

          + *

          + * A VCF file and a metadata file + *

          + * + *

          Outputs

          + *

          + * A binary pedigree in PLINK format, composed of three files (.bed/.bim/.fam) + *

          + * + *

          Example

          + *
          + * java -jar GenomeAnalysisTK.jar \
          + *   -T VariantsToBinaryPed \
          + *   -R reference.fasta \
          + *   -V variants.vcf \
          + *   -m metadata.fam \
          + *   -bed output.bed \
          + *   -bim output.bim \
          + *   -fam output.fam
          + * 
          */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=0,stop=100)) @@ -62,37 +87,35 @@ public class VariantsToBinaryPed extends RodWalker { protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); /** - * The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This - * is what Plink describes as a fam file. An example fam file is (note that there is no header): - *

          - * CEUTrio NA12878 NA12891 NA12892 2 -9

          - * CEUTrio NA12891 UNKN1 UNKN2 2 -9

          - * CEUTrio NA12892 UNKN3 UNKN4 1 -9

          - *

          - * where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex) + *

          The metaData file can take two formats, the first of which is the first 6 columns of the standard pedigree file. This + * is what PLINK describes as a .fam file. An example .fam file is as follows (note that there is no header):

          + *
          +     * CEUTrio NA12878 NA12891 NA12892 2 -9
          +     * CEUTrio NA12891 UNKN1 UNKN2 2 -9
          +     * CEUTrio NA12892 UNKN3 UNKN4 1 -9
          +     * 
          + *

          where the entries are: FamilyID IndividualID DadID MomID Sex Phenotype.

          + *

          An alternate format is a two-column key-value file:

          + *
          +     * NA12878        fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9
          +     * NA12891        fid=CEUTrio;sex=2;phenotype=-9
          +     * NA12892        fid=CEUTrio;sex=1;phenotype=-9
          +     * 
          + *

          where unknown parents do not need to be specified. The columns are the individual ID and a list of key-value pairs.

          *

          - * An alternate format is a two-column key-value file - *

          - * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9

          - * NA12891 fid=CEUTrio;sex=2;phenotype=-9

          - * NA12892 fid=CEUTrio;sex=1;phenotype=-9

          - *

          - * wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs. - *

          - * Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the - * command line has "-md [name].fam", the fam file will be subset and reordered to match the sample content and ordering - * of the VCF. However, if a metadata file of the alternate format is passed by "-md [name].txt", the walker will + * Regardless of which file is specified, the tool will output a .fam file alongside the pedigree file. If the + * command line has "-m [name].fam", the fam file will be subset and reordered to match the sample content and ordering + * of the VCF. However, if a metadata file of the alternate format is passed by "-m [name].txt", the tool will * construct a formatted .fam file from the data. *

          */ - @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " + - "(in which case it will be copied to the file you provide as fam output).") + @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file") File metaDataFile; @Input(shortName="mode",fullName="outputMode",required=false,doc="The output file mode (SNP major or individual major)") OutputMode mode = OutputMode.INDIVIDUAL_MAJOR; - @Output(shortName="bed",fullName = "bed",required=true,doc="output ped file") + @Output(shortName="bed",fullName = "bed",required=true,doc="output bed file") PrintStream outBed; @Output(shortName="bim",fullName="bim",required=true,doc="output map file") @@ -208,8 +231,8 @@ public class VariantsToBinaryPed extends RodWalker { try { validateVariantSite(vc,ref,context); } catch (TribbleException e) { - throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+ - "Please run ValidateVariants for more detailed information. This error is: "+e.getMessage()); + throw new UserException("Input VCF file is invalid. "+ + "Please run ValidateVariants for more detailed information. The error is: "+e.getMessage()); } String refOut; @@ -461,7 +484,7 @@ public class VariantsToBinaryPed extends RodWalker { for ( String line : new XReadLines(metaDataFile) ) { String[] famSplit = line.split("\\s+"); if ( famSplit.length != 6 ) { - throw new UserException("Line of the fam file is malformatted. Expected 6 entries. Line is "+line); + throw new UserException("Line of the fam file is malformed. Expected 6 entries. 
Line is "+line); } String sid = famSplit[1]; String fid = famSplit[0]; @@ -501,7 +524,7 @@ public class VariantsToBinaryPed extends RodWalker { private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) { final Allele reportedRefAllele = vc.getReference(); final int refLength = reportedRefAllele.length(); - if ( refLength > 100 ) { + if ( refLength > 100 ) { //TODO: get rid of this hardcoded limit? logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart())); return; } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java index 081403f35..bd228f323 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java @@ -48,11 +48,13 @@ import java.lang.reflect.Array; import java.util.*; /** - * Emits specific fields from a VCF file to a tab-deliminated table + * Extract specific fields from a VCF file to a tab-delimited table * *

          - * This walker accepts a single VCF file and writes out user-selected fields from the - * VCF as a header-containing, tab-deliminated file. The user specifies one or more + * This tool is designed to extract fields from the VCF to a table format that is more convenient to work with in + * downstream analyses.

          + * + *

          The user specifies one or more * fields to print with the -F NAME, each of which appears as a single column in * the output file, with a header named NAME, and the value of this field in the VCF * one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding @@ -62,9 +64,7 @@ import java.util.*; * genotypes), NO-CALL (count of no-call genotypes), TYPE (the type of event), VAR (count of * non-reference genotypes), NSAMPLES (number of samples), NCALLED (number of called samples), * GQ (from the genotype field; works only for a file with a single sample), and MULTI-ALLELIC - * (is the record from a multi-allelic site). Note that if a VCF record is missing a value, then the tool by - * default throws an error, but the special value NA can be emitted instead with - * appropriate tool arguments. + * (is the record from a multi-allelic site).

          * *

          * @@ -81,7 +81,7 @@ import java.util.*; * A tab-delimited file containing the values of the requested fields in the VCF file *

          * - *

          Examples

          + *

          Usage example

          *
            *     java -jar GenomeAnalysisTK.jar \
            *     -R reference.fasta \
          @@ -89,15 +89,19 @@ import java.util.*;
            *     -V file.vcf \
            *     -F CHROM -F POS -F ID -F QUAL -F AC \
            *     -o results.table
          - *
          - *     would produce a file that looks like:
          - *
          + * 
          + *

          would produce a file that looks like:

          + *
            *     CHROM    POS ID      QUAL    AC
            *     1        10  .       50      1
            *     1        20  rs10    99      10
            *     et cetera...
            * 
          * + *

          Caveat

          + *

          If a VCF record is missing a value, then the tool by default throws an error, but the special value NA can + * be emitted instead if requested at the command line using --allowMissingData.

          + * * @author Mark DePristo * @since 2010 */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java index 2e5b9a7b7..f2386d088 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java @@ -58,14 +58,15 @@ import java.io.File; import java.util.*; /** - * Converts variants from other file formats to VCF format. + * Convert variants from other file formats to VCF format * *

          - * Note that there must be a Tribble feature/codec for the file format as well as an adaptor. + * Note that there must be a Tribble feature/codec available for the file format as well as an adaptor. + *

          * *

          Input

          *

          - * A variant file to filter. + * A variant file to convert. *

          * *

          Output

          @@ -73,14 +74,13 @@ import java.util.*; * A VCF file. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx2g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T VariantsToVCF \
          + *   -R reference.fasta \
            *   -o output.vcf \
          - *   --variant:RawHapMap input.hapmap \
          - *   --dbsnp dbsnp.vcf
          + *   --variant:RawHapMap input.hapmap
            * 
          * */ diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjects.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjects.java index 108eb102f..dc71d58ee 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjects.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjects.java @@ -47,7 +47,7 @@ import java.util.List; * A generic engine for comparing tree-structured objects * *

          - * Compares two record-oriented files, itemizing specific difference between equivalent + * This tool compares two record-oriented files, itemizing specific differences between equivalent * records in the two files. Reports both itemized and summarized differences. *

          * @@ -56,8 +56,8 @@ import java.util.List; *

          * The GATK contains a summarizing difference engine that compares hierarchical data structures to emit: *

            - *
          • A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differences from the value in field A in record 1 in file G. - *
          • A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed. + *
          • A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differs from the value in field A in record 1 in file G.
          • + *
          • A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed.
          • *
          *

          * diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/HelpConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/HelpConstants.java index b72811c00..e84108973 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/HelpConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/HelpConstants.java @@ -28,7 +28,7 @@ package org.broadinstitute.gatk.utils.help; public class HelpConstants { public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; - public final static String GATK_DOCS_URL = BASE_GATK_URL + "/tooldocs/"; + public final static String GATK_DOCS_URL = BASE_GATK_URL + "/guide/tooldocs/"; public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; diff --git a/settings/helpTemplates/generic.index.template.html b/settings/helpTemplates/generic.index.template.html index bd5742f36..794e50dc6 100644 --- a/settings/helpTemplates/generic.index.template.html +++ b/settings/helpTemplates/generic.index.template.html @@ -24,7 +24,7 @@ diff --git a/settings/helpTemplates/generic.template.html b/settings/helpTemplates/generic.template.html index d163eff5f..0141c8673 100644 --- a/settings/helpTemplates/generic.template.html +++ b/settings/helpTemplates/generic.template.html @@ -24,7 +24,7 @@ @@ -172,7 +172,7 @@ <#if annotdescript?has_content > -

          Header info
          +

          Header definition line
          <#list annotdescript as line>
        • ${line}
        • @@ -255,10 +255,11 @@

          <#if extradocs?size != 0> -

          Inherited arguments

          -

          The arguments described in the entries below can be supplied to this tool to modify - its behavior. For example, the -L argument directs the GATK engine restricts processing - to specific genomic intervals (this is an Engine capability and is therefore available to all GATK walkers).

          +

          Engine arguments

          +

          All tools inherit arguments from the GATK Engine's "CommandLineGATK" argument collection, which can be + used to modify various aspects of the tool's function. For example, the -L argument directs the GATK + engine to restrict processing to specific genomic intervals; or the -rf argument allows you to apply + certain read filters to exclude some of the data from the analysis.