diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java index 729e8138f..949b61ec1 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java @@ -75,9 +75,10 @@ import java.util.Collections; import java.util.List; /** - * Implements the per sample allele count and frequency expectation typically with keys MLPSAC and MLPSAF. + * Allele count and frequency expectation per sample + * + * Needs documentation * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ @SuppressWarnings("unused") public final class AlleleCountBySample extends GenotypeAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java index c968bf88f..e5d0d92d6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java @@ -62,12 +62,16 @@ import java.util.*; /** - * U-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities + * Rank Sum Test of REF vs. ALT base quality scores * - *
This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities(ref bases vs. bases of the alternate allele).
+ *This variant-level annotation compares the base qualities of the data supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.
+ * + *The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
* *The base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
+ * */ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation { @Override diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java index dce1b0a70..e1ffbb0f3 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java @@ -70,15 +70,21 @@ import java.util.*; /** - * Allele counts and frequency for each ALT allele and total number of alleles in called genotypes + * Counts and frequency of alleles in called genotypes * - *This annotation tool outputs the following: + *
This annotation outputs the following:
* *AC=1;AF=0.500;AN=2+ *
This set of annotations, relating to a heterozygous call (0/1), means there is 1 alternate allele in the genotype. The corresponding allele frequency is 0.5 because there is 1 alternate allele and 1 reference allele in the genotype. + * The total number of alleles in the genotype should be equivalent to the ploidy of the sample.
+ * + */ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java index e80058f3d..660d78a79 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java @@ -60,15 +60,16 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.util.*; /** - * U-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases + * Rank Sum Test for hard-clipped bases on REF vs. ALT reads * - *This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele).
+ *This variant-level annotation tests whether the data supporting the reference allele shows more or less base clipping (hard clips) than those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have more hard-clipped bases than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have fewer hard-clipped bases than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.
+ * + *The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
* *The clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
* - * @author rpoplin - * @since 6/28/12 */ public class ClippingRankSumTest extends RankSumTest { @Override diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java index 071658f18..fad666f80 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java @@ -70,13 +70,22 @@ import java.util.List; import java.util.Map; /** - * Total (unfiltered) depth over all samples. + * Total depth of coverage per sample (in FORMAT) and over all samples (in INFO). * - *While the sample-level (FORMAT) DP field describes the total depth of reads that passed the caller's - * internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth - * over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for - * N samples with -dcov D is N * D - *
+ *This annotation is used to provide counts of read depth at two different levels, with some important differences. At the sample level (FORMAT), the DP value is the count of reads that passed the caller's internal quality control metrics (such as MAPQ > 17, for example). At the site level (INFO), the DP value is the unfiltered depth over all samples.
+ * + *See the method documentation on using coverage information for important interpretation details.
+ * + *The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this - * sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the - * caller's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of - * REF and ALT fields) is the unfiltered count of all reads that carried with them the - * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the - * power I have to determine the genotype of the sample at this site, while the AD tells me how many times - * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering - * the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like - * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would - * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that - * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted. - * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are - * many non-informative reads.
+ *Also known as the allele depth, this annotation gives the unfiltered count of reads that support a given allele for an individual sample. The values in the field are ordered to match the order of alleles specified in the REF and ALT fields: REF, ALT1, ALT2 and so on if there are multiple ALT alleles.
* - *Because the AD includes reads and bases that were filtered by the caller and in case of indels is based on a statistical computation, - * one should not base assumptions about the underlying genotype based on it; - * instead, the genotype likelihoods (PLs) are what determine the genotype calls.
+ *See the method documentation on using coverage information for important interpretation details.
* + *This annotation is similar to the sample-level DP annotation, which counts read depth after general filtering, but with an extra layer of stringency. Its purpose is to provide the count of reads that are actually considered informative by HaplotypeCaller (HC), using per-read likelihoods that are produced internally by HC.
+ *In this context, an informative read is defined as one that allows the allele it carries to be easily distinguished. In contrast, a read might be considered uninformative if, for example, it only partially overlaps a short tandem repeat and it is not clear whether the read contains the reference allele or an extra repeat.
+ * + *See the method documentation on using coverage information for important interpretation details.
+ * + *Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation - * being seen on only the forward or only the reverse strand) in the reads. More bias is - * indicative of false positive calls. - *
+ *Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The FisherStrand annotation is one of several methods that aim to evaluate whether there is strand bias in the data. It uses Fisher's Exact Test to determine if there is strand bias between forward and reverse strands for the reference or alternate allele.
+ *The output is a Phred-scaled p-value. The higher the output value, the more likely there is to be bias. More bias is indicative of false positive calls.
+ * + *See the method document on statistical tests for a more detailed explanation of this application of Fisher's Exact Test.
+ * + *The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.
*/ public class FisherStrand extends StrandBiasTest implements StandardAnnotation, ActiveRegionBasedAnnotation { private final static boolean ENABLE_DEBUGGING = false; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java index f726c00bb..54535e32c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java @@ -75,12 +75,10 @@ import java.util.Map; /** * GC content of the reference around the given site * - *The GC content is the number of GC bases relative to the total number of bases (# GC bases / # all bases) around this site on the reference.
+ *The GC content is the number of GC bases relative to the total number of bases (# GC bases / # all bases) around this site on the reference. Some sequencing technologies have trouble with high GC content because of the stronger bonds of G-C nucleotide pairs, so high GC values tend to be associated with low coverage depth and lower confidence calls.
* *The window size used to calculate the GC content around the site is set by the tool used for annotation - * (currently UnifiedGenotyper, HaplotypeCaller or VariantAnnotator). See the Technical Document for each tool - * to find out what window size they use.
+ *The window size used to calculate the GC content around the site is determined by the tool used for annotation (UnifiedGenotyper, HaplotypeCaller or VariantAnnotator). See the Tool Documentation for each of these tools to find out what window size they use.
*/ public class GCContent extends InfoFieldAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java index 4686f27fd..2460e45be 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java @@ -67,13 +67,11 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; import java.util.*; -/********************************* - * Created by rpoplin on 4/5/14. +/** + * Genotype summary statistics * - * Genotype summary statistics. - * - * These summaries can all be recomputed from the genotypes on the fly but it is a lot faster to add them here as INFO field annotations. - ********************************/ + *These summaries can all be recomputed from the genotypes on the fly but it is a lot faster to add them here as INFO field annotations.
+ */ public class GenotypeSummaries extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java index 2d748b144..9b5778c1d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java @@ -78,9 +78,13 @@ import java.io.Serializable; import java.util.*; /** - * Consistency of the site with two (and only two) segregating haplotypes. Higher scores - * are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls. - * Note that the Haplotype Score is only calculated for sites with read coverage. + * Consistency of the site with strictly two segregating haplotypes + * + *For diploid organisms, barring chromosomal abnormalities, we expect that any given sample has no more than 2 segregating haplotypes at a given site. If there is evidence for more + * than 2 segregating haplotypes, the read data should be considered suspect and the evidence artifactual. Higher scores are indicative of regions with bad alignments, typically leading to artifactual SNP and indel calls.
+ * + *HaplotypeCaller does not output this annotation because it already evaluates haplotype segregation internally. This annotation is only informative (and available) for variants called by Unified Genotyper.
*/ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private final static boolean DEBUG = false; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java index 6f5902300..b511a1b90 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java @@ -74,14 +74,19 @@ import java.util.Map; /** - * Hardy-Weinberg test for disequilibrium + * Hardy-Weinberg test for transmission disequilibrium * - *This annotation calculates the Phred-scaled P value of genotype-based (using GT field) test for Hardy-Weinberg test for disequilibrium.
+ *This annotation estimates whether the frequencies of alleles and genotypes in a population stay the same from generation to generation.
+ * + *The output is a Phred-scaled P value. See the method document on statistical tests for a more detailed explanation of the Hardy-Weinberg test.
* *Right now we just ignore genotypes that are not confident, but this throws off our HW ratios. - * More analysis is needed to determine the right thing to do when the genotyper cannot decide whether a given sample is het or hom var.
+ *Calculates the length of the largest contiguous homopolymer run of the variant allele in either direction on the reference.
+ *Repetitive sequences such as homopolymers are difficult to map to the reference because they are associated with multiple alignment possibilities. The proximity of a long homopolymer to your variant site increases the chance that reads were mapped incorrectly in the surrounding region and lowers confidence in the call. If there is a homopolymer on either side of a site, this annotation outputs the length of its largest run.
* *This can only be computed for bi-allelic sites.
- *This needs to be computed in a more accurate manner. We currently look only at direct runs of the alternate allele adjacent to this position.
+ *This annotation estimates whether there is evidence of inbreeding in a population. The higher the score, the higher the chance that there is inbreeding.
+ * + *The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is a Phred-scaled p-value derived from running the HW test for disequilibrium with PL values. See the method document on statistical tests for a more detailed explanation of this statistical test.
+ * + *This variant-level annotation compares the likelihoods of reads to their best haplotype match, between reads that support the reference allele and those that support the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower likelihoods to their best haplotype match than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher likelihoods to their best haplotype match than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.
+ * + *The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for per-read likelihoods to the best haplotype match (likelihoods of reads supporting REF vs. likelihoods of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ *The likelihood rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
+ * */ public class LikelihoodRankSumTest extends RankSumTest { @Override diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java index e876f2bd6..a30924187 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java @@ -71,22 +71,24 @@ import java.util.*; /** * Likelihood of being a Mendelian Violation * - *Given a variant context, this tool uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation - * versus the likelihood of the site transmitting according to mendelian rules.
+ *This annotation uses the likelihoods of the genotype calls to assess whether a site is transmitted from parents to offspring according to Mendelian rules. The output is the likelihood of the site being a Mendelian violation, which can be tentatively interpreted either as an indication of error (in the genotype calls) or as a possible de novo mutation.
This annotation considers all possible combinations of all possible genotypes (homozygous-reference, heterozygous, and homozygous-variant) for each member of a trio, which amounts to 27 possible combinations. Using the Phred-scaled genotype likelihoods (PL values) from each individual, the likelihood of each combination is calculated, and the result contributes to the likelihood of the corresponding case (Mendelian violation or non-violation) depending on which set it belongs to. See the method document on statistical tests for a more detailed explanation of this statistical test.
* *This tool assumes that the organism is diploid.
+ *Note that this annotation requires a valid ped file.
- * - *When multiple trios are present, the annotation is simply the maximum - * of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain - * sites and many trios.
- * - *This annotation can only be used from the Variant Annotator. - * If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. - * If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.
*/ public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiringAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java index f9acefbde..9da84183e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java @@ -61,12 +61,22 @@ import java.util.*; /** - * U-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities + * Rank Sum Test for mapping qualities of REF vs. ALT reads * - *This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele).
+ *This variant-level annotation compares the mapping qualities of the reads supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower mapping quality scores than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher mapping quality scores than those supporting the reference allele.
+ *This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants. + * + *
The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
* *The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
+ *This annotation gives you the count of all reads that have MAPQ = 0 across all samples. The count of reads with MAPQ0 can be used for quality control; high counts typically indicate regions where it is difficult to make confident calls.
+ * + *Given a variant context, this tool uses the called genotypes (ideally after having been refined using PhaseByTransmission and CalculateGenotypePosteriors) - * to identify possible de novo mutations and the sample in which they occur.
+ *This annotation uses the genotype information from individuals in family trios to identify possible de novo mutations and the sample(s) in which they occur. This works best if the genotypes have been processed according to the Genotype Refinement workflow.
* *This tool assumes that the organism is diploid.
+ *Note that this annotation requires a valid ped file.
- * - *Only reports possible de novos for children where genotype is not filtered (which is most appropriate if parent likelihoods - * have already been factored in using PhaseByTransmission).
- * - *When multiple trios are present, the annotation is simply the maximum - * of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain - * sites and many trios.
- * - *This annotation can only be used from the Variant Annotator. - * If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. - * If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.
*/ public class PossibleDeNovo extends InfoFieldAnnotation implements RodRequiringAnnotation, ExperimentalAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java index 1a9f9a175..004e5d18f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java @@ -71,10 +71,21 @@ import htsjdk.variant.variantcontext.VariantContext; import java.util.*; /** - * Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length. + * Variant confidence normalized by unfiltered depth of variant samples * - * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing - * reads associated with the samples with polymorphic genotypes. + *This annotation puts the variant confidence QUAL score in perspective by normalizing for the amount of coverage available. Because each read contributes a little to the QUAL score, variants in regions with deep coverage can have artificially inflated QUAL scores, giving the impression that the call is supported by more evidence than it really is. To compensate for this, we normalize the variant confidence by depth, which gives us a more objective picture of how well supported the call is.
+ * + *The calculation only takes into account coverage from samples genotyped as having the variant allele(s). This removes the influence of any homozygous-reference samples that might be present in the same cohort, which would otherwise penalize the call unfairly.
+ * + *This annotation can only be calculated for sites for which at least one sample was genotyped as carrying a variant allele.
+ * + *This annotation provides an estimation of the overall mapping quality of reads supporting a variant call, averaged over all samples in a cohort.
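+ *The root mean square is the square root of the mean of the squared mapping qualities. Its square is equal to the squared mean of the mapping qualities plus their variance, so the value reflects both the average mapping quality and its spread.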
+ * + *This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele. If the alternate allele is only seen near the ends of reads, this is indicative of error.
+ *This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and alternate alleles. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported.
+ * + *The ideal result is a value close to zero, which indicates there is little to no difference in where the alleles are found relative to the ends of reads. A negative value indicates that the alternate allele is found at the ends of reads more often than the reference allele. Conversely, a positive value indicates that the reference allele is found at the ends of reads more often than the alternate allele.
+ * + *This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in relative position either way suggests that the sequencing process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.
+ * + *The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
* *The read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
+ * */ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java index 8a472f0ac..b0f298048 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java @@ -69,7 +69,9 @@ import java.util.List; import java.util.Map; /** - * List all of the polymorphic samples. + * List of samples that are polymorphic at a given site + * + *The output is a list of the samples that are genotyped as having one or more variant alleles. This allows you to easily determine which samples are polymorphic and compare them to samples that are homozygous-reference.
*/ public class SampleList extends InfoFieldAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java index b1d4241c8..6b7b21b30 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java @@ -70,9 +70,16 @@ import java.util.Map; /** - * Fraction of reads containing spanning deletions at this site + * Fraction of reads containing spanning deletions + * + *The presence of many reads with deletions spanning a given site is often an indication that a variant call made at that site is in fact a false positive. This annotation counts the number of reads that contain deletions spanning the site divided by the total number of reads that cover the site.
+ * + *Note that this annotation is currently not compatible with HaplotypeCaller.
*/ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java index ea4ace84b..143b16edd 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java @@ -66,11 +66,30 @@ import htsjdk.variant.vcf.VCFHeaderLineType; import java.util.*; /** - * Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias - * User: rpoplin - * Date: 8/28/13 + * Number of forward and reverse reads that support REF and ALT alleles + * + *Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandBiasBySample annotation produces read counts per allele and per strand that are used by other annotation modules (FisherStrand and StrandOddsRatio) to estimate strand bias using statistical approaches. + * + *
This annotation produces 4 values, corresponding to the number of reads that support the following (in that order):
+ *GT:AD:GQ:PL:SB 0/1:53,51:99:1758,0,1835:23,30,33,18+ *
In this example, the reference allele is supported by 23 forward reads and 30 reverse reads, and the alternate allele is supported by 33 forward reads and 18 reverse reads.
+ * + *Odds Ratios in the 2x2 contingency table below are R = (X[0][0] * X[1][1]) / (X[0][1] * X[1][0]) and its inverse - * + strand - strand - * Ref X[0][0] X[0][1] - * Alt X[1][0] X[1][1] - * The sum R + 1/R is used to detect a difference in strand bias for ref and for alt (the sum makes it symmetric): - * A high value is indicative of large difference where one entry is very small compared to the others. + *
Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandOddsRatio annotation is one of several methods that aim to evaluate whether there is strand bias in the data. It is an updated form of the Fisher Strand Test that is better at taking into account large amounts of data in high coverage situations. It is used to determine if there is strand bias between forward and reverse strands for the reference or alternate allele.
* - * A scale factor of refRatio/altRatio where refRatio = (max(X[0][0], X[0][1])/min(X[0][0], X[0][1])) and - * altRatio = (max(X[1][0], X[1][1])/min(X[1][0], X[1][1])) ensures that the annotation value is large only + *Odds Ratios in the 2x2 contingency table below are + * + * $$ R = \frac{X[0][0] * X[1][1]}{X[0][1] * X[1][0]} $$ + * + * and its inverse: + * + *
| | + strand | - strand |
| REF | X[0][0] | X[0][1] |
| ALT | X[1][0] | X[1][1] |
See the method document on statistical tests for a more detailed explanation of this statistical test.
+ * + *This tool outputs the number of times the tandem repeat unit is repeated, for each allele (including reference).
+ *This annotation tags variants that fall within tandem repeat sets. It also provides the composition of the tandem repeat units and the number of times they are repeated for each allele (including the REF allele).
+ * + *A tandem repeat unit is composed of one or more nucleotides that are repeated multiple times in series. Repetitive sequences are difficult to map to the reference because they are associated with multiple alignment possibilities. Knowing the number of repeat units in a set of tandem repeats tells you the number of different positions the tandem repeat can be placed in. The observation of many tandem repeat units multiplies the number of possible representations that can be made of the region. + * + *
This annotation is currently not compatible with HaplotypeCaller.
*/ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements StandardAnnotation { private static final String STR_PRESENT = "STR"; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java index beaf954ad..430c71597 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java @@ -72,16 +72,19 @@ import java.util.*; /** * Wittkowski transmission disequilibrium test * - *Test statistic from Wittkowski transmission disequilibrium test. - * The calculation is based on the following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT
+ *This annotation indicates the presence of a genetic linkage between a genetic marker and a genetic trait.
+ * + *The calculation is based on the derivation described in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT.
* *Note that this annotation requires a valid ped file.
* *This annotation can only be used with VariantAnnotator (not with UnifiedGenotyper or HaplotypeCaller).
+ *This tool assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.). - * It also specifies whether the variant is multiallelic (>2 alleles).
+ *This annotation assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.). It also specifies whether the variant is multiallelic (>2 alleles).
*/ public class VariantType extends InfoFieldAnnotation { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java index e04a0c3e1..9127b5ee2 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java @@ -50,11 +50,27 @@ import java.util.Map; /** * Allele balance across all samples * - *The allele balance is the fraction of ref bases over ref + alt bases.
- * + *This is an experimental annotation that attempts to estimate whether the data supporting a variant call fits allelic ratio expectations, or whether there might be some bias in the data. Each sample will contribute its allelic read depth (from the AD annotation) to either ABHom or ABHet depending on its genotype call: ABHom if the call is homozygous (REF/REF or ALT/ALT), and ABHet if the call is heterozygous (REF/ALT). Additionally, reads that support something other than the genotyped alleles (called "non-alleles") will be counted in the OND tag, which represents the overall fraction of data that diverges from the diploid hypothesis.
+ * $$ ABHom = \frac{# ALT alleles}{total # alleles} $$
+ * $$ ABHet = \frac{# REF alleles}{total # alleles} $$
+ * $$ OND = \frac{# genotyped alleles}{# alleles + # non-alleles} $$
+ *
For ABHom, the value should be close to 1.00 because ideally, all the reads should support a single allele. For ABHet, the value should be close to 0.5, so half of the alleles support the ref allele and half of the alleles support the alt allele. Divergence from these expected ratios may indicate that there is some bias in favor of one allele. Note the caveats below regarding cancer and RNAseq analysis.
*Note that this annotation will only work properly for biallelic samples that are called as heterozygous.
+ *The allele balance is the fraction of ref bases over ref + alt bases.
- * + *This is an experimental annotation that attempts to estimate whether the data supporting a heterozygous genotype call fits allelic ratio expectations, or whether there might be some bias in the data.
+ *$$ AB = \frac{# ALT alleles}{total # alleles} $$
+ *Ideally, the value of AB should be close to 0.5, so half of the alleles support the ref allele and half of the alleles support the alt allele. Divergence from the expected ratio may indicate that there is some bias in favor of one allele. Note the caveats below regarding cancer and RNAseq analysis.
*Note that this annotation will only work properly for biallelic samples that are called as heterozygous.
- *This annotation returns the counts of A, C, G, and T bases across all samples, in that order.
+ *BaseCounts=3,0,3,0+ * + *
+ * This means the number of A bases seen is 3, the number of C bases seen is 0, the number of G bases seen is 3, and the number of T bases seen is 0. + *
+ * + *This annotation tells you what fraction of reads have a mapping quality of less than the given threshold of 10 (including 0). Note that certain tools may impose a different minimum mapping quality threshold. For example, HaplotypeCaller excludes reads with MAPQ<20.
+ * + *$$ LowMQ = \frac{# reads with MAPQ=0 + # reads with MAPQ<10}{total # reads} $$ + *
+ * + *This annotation gives you the count of all reads that have MAPQ = 0 for each sample. The count of reads with MAPQ0 can be used for quality control; high counts typically indicate regions where it is difficult to make confident calls.
+ * + *N occurs in a sequence when the sequencer does not have enough information to determine which base it should call. The presence of many Ns at the same site lowers our confidence in any calls made there, because it suggests that there was some kind of technical difficulty that interfered with the sequencing process.
+ * + *Note that in GATK versions 3.2 and earlier, this annotation only counted N bases from reads generated with SOLiD technology. This functionality was generalized for all sequencing platforms in GATK version 3.3.
+ * + *See http://snpeff.sourceforge.net/ for more information on the SnpEff tool
. + *This annotation processes the output of the SnpEff functional prediction tool to select only the predicted effect with the highest biological impact. The SnpEff output must be provided on the command line by specifying "--snpEffFile filename.vcf". See http://snpeff.sourceforge.net/ for more information about the SnpEff tool
. * - *For each variant, this tool chooses one of the effects of highest biological impact from the SnpEff - * output file (which must be provided on the command line via --snpEffFile filename.vcf), - * and adds annotations on that effect.
+ *