Merge pull request #755 from broadinstitute/sc_Annotation_Docs_73647570
Improvements to documentation of variant annotations
This commit is contained in:
commit
0f89d1a362
|
|
@ -75,9 +75,10 @@ import java.util.Collections;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Implements the per sample allele count and frequency expectation typically with keys MLPSAC and MLPSAF.
|
||||
* Allele count and frequency expectation per sample
|
||||
*
|
||||
* Needs documentation
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
public final class AlleleCountBySample extends GenotypeAnnotation {
|
||||
|
|
|
|||
|
|
@ -62,12 +62,16 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* U-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities
|
||||
* Rank Sum Test of REF vs. ALT base quality scores
|
||||
*
|
||||
* <p>This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele).</p>
|
||||
* <p>This variant-level annotation compares the base qualities of the data supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The base quality rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
|
||||
*
|
||||
*/
|
||||
public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -70,15 +70,21 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* Allele counts and frequency for each ALT allele and total number of alleles in called genotypes
|
||||
* Counts and frequency of alleles in called genotypes
|
||||
*
|
||||
* <p>This annotation tool outputs the following:
|
||||
* <p>This annotation outputs the following:</p>
|
||||
*
|
||||
* <ul>
|
||||
* <li>Allele count in genotypes, for each ALT allele, in the same order as listed</li>
|
||||
* <li>Allele Frequency, for each ALT allele, in the same order as listed</li>
|
||||
* <li>Total number of alleles in called genotypes</li>
|
||||
* </ul></p>
|
||||
* <li>Number of times each ALT allele is represented, in the same order as listed (AC)</li>
|
||||
* <li>Frequency of each ALT allele, in the same order as listed (AF)</li>
|
||||
* <li>Total number of alleles in called genotypes (AN)</li>
|
||||
* </ul>
|
||||
* <h3>Example</h3>
|
||||
* <pre>AC=1;AF=0.500;AN=2</pre>
|
||||
* <p>This set of annotations, relating to a heterozygous call (0/1), means there is 1 alternate allele in the genotype. The corresponding allele frequency is 0.5 because there is 1 alternate allele and 1 reference allele in the genotype.
|
||||
* The total number of alleles in the genotype should be equivalent to the ploidy of the sample.</p>
|
||||
*
|
||||
|
||||
*/
|
||||
public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -60,15 +60,16 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* U-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases
|
||||
* Rank Sum Test for hard-clipped bases on REF vs. ALT reads
|
||||
*
|
||||
* <p>This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele).</p>
|
||||
* <p>This variant-level annotation tests whether the data supporting the reference allele shows more or less base clipping (hard clips) than those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have more hard-clipped bases than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have fewer hard-clipped bases than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The clipping rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since 6/28/12
|
||||
*/
|
||||
public class ClippingRankSumTest extends RankSumTest {
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -70,13 +70,22 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Total (unfiltered) depth over all samples.
|
||||
* Total depth of coverage per sample (in FORMAT) and over all samples (in INFO).
|
||||
*
|
||||
* <p>While the sample-level (FORMAT) DP field describes the total depth of reads that passed the caller's
|
||||
* internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth
|
||||
* over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for
|
||||
* N samples with -dcov D is N * D
|
||||
* </p>
|
||||
* <p>This annotation is used to provide counts of read depth at two different levels, with some important differences. At the sample level (FORMAT), the DP value is the count of reads that passed the caller's internal quality control metrics (such as MAPQ > 17, for example). At the site level (INFO), the DP value is the unfiltered depth over all samples.</p>
|
||||
*
|
||||
* <p>See the method documentation on <a href="http://www.broadinstitute.org/gatk/guide/article?id=4721">using coverage information</a> for important interpretation details.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <ul>
|
||||
* <li>If downsampling is enabled (as is done by default for some analyses to remove excessive coverage), the depth of coverage effectively seen by the caller may be lower than the actual depth of coverage in the original file. If using <code>-dcov D</code>, the maximum depth that can be seen for N samples will be N * D.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerAlleleBySample.php">DepthPerAlleleBySample</a></b> calculates depth of coverage for each allele per sample (AD).</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerSampleHC.php">DepthPerSampleHC</a></b> calculates depth of coverage after filtering by HaplotypeCaller.</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class Coverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -74,26 +74,25 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* The depth of coverage of each allele per sample
|
||||
* Depth of coverage of each allele per sample
|
||||
*
|
||||
* <p>The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this
|
||||
* sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the
|
||||
* caller's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of
|
||||
* REF and ALT fields) is the unfiltered count of all reads that carried with them the
|
||||
* REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
|
||||
* power I have to determine the genotype of the sample at this site, while the AD tells me how many times
|
||||
* I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
|
||||
* the reads. If, for example, I believe there really is an A/T polymorphism at a site, then I would like
|
||||
* to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
|
||||
* normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
|
||||
* the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted.
|
||||
* Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are
|
||||
* many non-informative reads.</p>
|
||||
* <p>Also known as the allele depth, this annotation gives the unfiltered count of reads that support a given allele for an individual sample. The values in the field are ordered to match the order of alleles specified in the REF and ALT fields: REF, ALT1, ALT2 and so on if there are multiple ALT alleles.</p>
|
||||
*
|
||||
* <p>Because the AD includes reads and bases that were filtered by the caller and in case of indels is based on a statistical computation,
|
||||
* <b>one should not make assumptions about the underlying genotype based on it</b>;
|
||||
* instead, the genotype likelihoods (PLs) are what determine the genotype calls.</p>
|
||||
* <p>See the method documentation on <a href="http://www.broadinstitute.org/gatk/guide/article?id=4721">using coverage information</a> for important interpretation details.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <ul>
|
||||
* <li>The AD calculation as performed by HaplotypeCaller may not yield exact results because only reads that statistically favor one allele over the other are counted. Due to this fact, the sum of AD may be different than the individual sample depth, especially when there are many non-informative reads.</li>
|
||||
* <li>For the AD calculation as performed by the UnifiedGenotyper, the same caveat as above applies to indels (but not to SNPs).</li>
|
||||
* <li>Because the AD includes reads and bases that were filtered by the caller (and in case of indels, is based on a statistical computation), it should not be used to make assumptions about the genotype that it is associated with. Ultimately, the phred-scaled genotype likelihoods (PLs) are what determine the genotype calls.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_Coverage.php">Coverage</a></b> gives the filtered depth of coverage for each sample and the unfiltered depth across all samples.</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AlleleBalance.php">AlleleBalance</a></b> is a generalization of this annotation over all samples.</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AlleleBalanceBySample.php">AlleleBalanceBySample</a></b> calculates allele balance for each individual sample.</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -71,15 +71,24 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* The depth of coverage for informative reads for each sample.
|
||||
* Depth of informative coverage for each sample.
|
||||
*
|
||||
* <p>This annotation is similar to the sample-level DP annotation, which counts read depth after general filtering, but with an extra layer of stringency. Its purpose is to provide the count of reads that are actually considered informative by HaplotypeCaller (HC), using per-read likelihoods that are produced internally by HC.</p>
|
||||
* <p>In this context, an informative read is defined as one that allows the allele it carries to be easily distinguished. In contrast, a read might be considered uninformative if, for example, it only partially overlaps a short tandem repeat and it is not clear whether the read contains the reference allele or an extra repeat.</p>
|
||||
*
|
||||
* <p>See the method documentation on <a href="http://www.broadinstitute.org/gatk/guide/article?id=4721">using coverage information</a> for important interpretation details.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <ul>
|
||||
* <li>This annotation can only be generated by HaplotypeCaller (it will not work when called from VariantAnnotator).</li>
|
||||
* </ul>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerAlleleBySample.php">DepthPerAlleleBySample</a></b> calculates depth of coverage for each allele per sample (AD).</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_Coverage.php">Coverage</a></b> gives the filtered depth of coverage for each sample and the unfiltered depth across all samples.</li>
|
||||
* </ul>
|
||||
*
|
||||
* An informative read is defined as one from which the allele it carries can be easily distinguished. An example of a
|
||||
* case where a read might be uninformative is where it only partially overlaps a short tandem repeat and it is not clear
|
||||
* whether the read contains the reference allele or e.g. an extra repeat.
|
||||
* The depth here is the sum of the informative reads at this site as determined by the Haplotype Caller; as such it can
|
||||
* only be calculated and generated through the Haplotype Caller (it will not work when run through the Variant Annotator).
|
||||
* This calculation is not perfect but it is a pretty good proxy for depth and it does match the values in the AD field
|
||||
* (i.e., sum(AD) = DP).
|
||||
*/
|
||||
public class DepthPerSampleHC extends GenotypeAnnotation {
|
||||
public void annotate(final RefMetaDataTracker tracker,
|
||||
|
|
|
|||
|
|
@ -75,15 +75,25 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* Phred-scaled p-value using Fisher's Exact Test to detect strand bias
|
||||
* Strand bias estimated using Fisher's Exact Test
|
||||
*
|
||||
* <p>Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation
|
||||
* being seen on only the forward or only the reverse strand) in the reads. More bias is
|
||||
* indicative of false positive calls.
|
||||
* </p>
|
||||
* <p>Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The FisherStrand annotation is one of several methods that aim to evaluate whether there is strand bias in the data. It uses Fisher's Exact Test to determine if there is strand bias between forward and reverse strands for the reference or alternate allele.</p>
|
||||
* <p>The output is a Phred-scaled p-value. The higher the output value, the more likely there is to be bias. More bias is indicative of false positive calls.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p>See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of this application of Fisher's Exact Test.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <ul>
|
||||
* <li>The FisherStrand test may not be calculated for certain complex indel cases or for multi-allelic sites.</li>
|
||||
* <li>FisherStrand is best suited for low coverage situations. For testing strand bias in higher coverage situations, see the StrandOddsRatio annotation.</li>
|
||||
* </ul>
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_StrandBiasBySample.php">StrandBiasBySample</a></b> outputs counts of read depth per allele for each strand orientation.</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_StrandOddsRatio.php">StrandOddsRatio</a></b> is an updated form of FisherStrand that uses a symmetric odds ratio calculation.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.</p>
|
||||
*/
|
||||
public class FisherStrand extends StrandBiasTest implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
private final static boolean ENABLE_DEBUGGING = false;
|
||||
|
|
|
|||
|
|
@ -75,12 +75,10 @@ import java.util.Map;
|
|||
/**
|
||||
* GC content of the reference around the given site
|
||||
*
|
||||
* <p>The GC content is the number of GC bases relative to the total number of bases (# GC bases / # all bases) around this site on the reference.</p>
|
||||
* <p>The GC content is the number of GC bases relative to the total number of bases (# GC bases / # all bases) around this site on the reference. Some sequencing technologies have trouble with high GC content because of the stronger bonds of G-C nucleotide pairs, so high GC values tend to be associated with low coverage depth and lower confidence calls.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The window size used to calculate the GC content around the site is set by the tool used for annotation
|
||||
* (currently UnifiedGenotyper, HaplotypeCaller or VariantAnnotator). See the Technical Document for each tool
|
||||
* to find out what window size they use.</p>
|
||||
* <p>The window size used to calculate the GC content around the site is determined by the tool used for annotation (UnifiedGenotyper, HaplotypeCaller or VariantAnnotator). See the <a href="https://www.broadinstitute.org/gatk/guide/tooldocs/">Tool Documentation</a> for each of these tools to find out what window size they use.</p>
|
||||
*/
|
||||
public class GCContent extends InfoFieldAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -67,13 +67,11 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine;
|
|||
|
||||
import java.util.*;
|
||||
|
||||
/*********************************
|
||||
* Created by rpoplin on 4/5/14.
|
||||
/**
|
||||
* Genotype summary statistics
|
||||
*
|
||||
* Genotype summary statistics.
|
||||
*
|
||||
* These summaries can all be recomputed from the genotypes on the fly but it is a lot faster to add them here as INFO field annotations.
|
||||
********************************/
|
||||
* <p>These summaries can all be recomputed from the genotypes on the fly but it is a lot faster to add them here as INFO field annotations.</p>
|
||||
*/
|
||||
|
||||
public class GenotypeSummaries extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -78,9 +78,13 @@ import java.io.Serializable;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Consistency of the site with two (and only two) segregating haplotypes. Higher scores
|
||||
* are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls.
|
||||
* Note that the Haplotype Score is only calculated for sites with read coverage.
|
||||
* Consistency of the site with strictly two segregating haplotypes
|
||||
*
|
||||
* <p>For diploid organisms, barring chromosomal abnormalities, we expect that any given sample has no more than 2 segregating haplotypes at a given site. If there is evidence for more
|
||||
* than 2 segregating haplotypes, the read data should be considered suspect and the evidence artifactual. Higher scores are indicative of regions with bad alignments, typically leading to artifactual SNP and indel calls.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <p>HaplotypeCaller does not output this annotation because it already evaluates haplotype segregation internally. This annotation is only informative (and available) for variants called by Unified Genotyper.</p>
|
||||
*/
|
||||
public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
private final static boolean DEBUG = false;
|
||||
|
|
|
|||
|
|
@ -74,14 +74,19 @@ import java.util.Map;
|
|||
|
||||
|
||||
/**
|
||||
* Hardy-Weinberg test for disequilibrium
|
||||
* Hardy-Weinberg test for transmission disequilibrium
|
||||
*
|
||||
* <p>This annotation calculates the Phred-scaled p-value of the genotype-based (using the GT field) Hardy-Weinberg test for disequilibrium.</p>
|
||||
* <p>This annotation estimates whether the frequencies of alleles and genotypes in a population stay the same from generation to generation.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p>The output is a Phred-scaled P value. See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the Hardy-Weinberg test.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <h4>This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.</h4>
|
||||
* <p>Right now we just ignore genotypes that are not confident, but this throws off our HW ratios.
|
||||
* More analysis is needed to determine the right thing to do when the genotyper cannot decide whether a given sample is het or hom var.</p>
|
||||
* <ul>
|
||||
* <li>This annotation requires multiple samples and a valid pedigree file.</li>
|
||||
* <li>This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.</li>
|
||||
* <li>Low confidence genotypes are ignored, which may adversely affect HW ratios. More analysis is needed to determine the right thing to do when the genotyper cannot decide whether a given sample is heterozygous or homozygous variant.</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class HardyWeinberg extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -71,12 +71,14 @@ import java.util.Map;
|
|||
/**
|
||||
* Largest contiguous homopolymer run of the variant allele
|
||||
*
|
||||
* <p>Calculates the length of the largest contiguous homopolymer run of the variant allele in either direction on the reference.</p>
|
||||
* <p>Repetitive sequences such as homopolymers are difficult to map to the reference because they are associated with multiple alignment possibilities. The proximity of a long homopolymer to your variant site increases the chance that reads were mapped incorrectly in the surrounding region and lowers confidence in the call. If there is a homopolymer on either side of a site, this annotation outputs the length of its largest run.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <p>This can only be computed for bi-allelic sites.</p>
|
||||
* <h4>This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.</h4>
|
||||
* <p>This needs to be computed in a more accurate manner. We currently look only at direct runs of the alternate allele adjacent to this position.</p>
|
||||
* <ul>
|
||||
* <li>This can only be computed for bi-allelic sites.</li>
|
||||
* <li>The calculation only looks at direct runs of the alternate allele adjacent to this position, which is not a very accurate method.</li>
|
||||
* <li>This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class HomopolymerRun extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -72,12 +72,16 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* Likelihood-based (using PL field) test for the inbreeding among samples.
|
||||
* Likelihood-based test for inbreeding among samples
|
||||
*
|
||||
* <p>This annotation estimates whether there is evidence of inbreeding in a population. The higher the score, the higher the chance that there is inbreeding.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p>The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is a Phred-scaled p-value derived from running the HW test for disequilibrium with PL values. See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of this statistical test.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <h4>Note that the Inbreeding Coefficient can only be calculated for cohorts containing at least 10 founder samples.</h4>
|
||||
*
|
||||
* A continuous generalization of the Hardy-Weinberg test for disequilibrium that works
|
||||
* well with limited coverage per sample. See the 1000 Genomes Phase I release for
|
||||
* more information. Note that the Inbreeding Coefficient can only be calculated for
|
||||
* cohorts containing at least 10 founder samples.
|
||||
*/
|
||||
public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -60,9 +60,16 @@ import java.util.Arrays;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* U-based z-approximation from the Mann-Whitney Rank Sum Test contrasting the likelihoods of reads to their
|
||||
* most likely haplotypes. This is effectively testing for a differentiate quality in the modeling of the alt
|
||||
* allele than the reference allele.
|
||||
* Rank Sum Test of per-read likelihoods of REF vs. ALT reads
|
||||
*
|
||||
* <p>This variant-level annotation compares the likelihoods of reads to their best haplotype match, between reads that support the reference allele and those that support the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower likelihoods to their best haplotype match than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher likelihoods to their best haplotype match than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for per-read likelihoods to the best haplotype match (likelihoods of reads supporting REF vs. likelihoods of reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The likelihood rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
|
||||
*
|
||||
*/
|
||||
public class LikelihoodRankSumTest extends RankSumTest {
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -71,22 +71,24 @@ import java.util.*;
|
|||
/**
|
||||
* Likelihood of being a Mendelian Violation
|
||||
*
|
||||
* <p>Given a variant context, this tool uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation
|
||||
* versus the likelihood of the site transmitting according to mendelian rules. </p>
|
||||
* <p>This annotation uses the likelihoods of the genotype calls to assess whether a site is transmitted from parents to offspring according to Mendelian rules. The output is the likelihood of the site being a Mendelian violation, which can be tentatively interpreted either as an indication of error (in the genotype calls) or as a possible <em>de novo</em> mutation. The higher the output value, the more likely there is to be a Mendelian violation. Note that only positive values indicating likely MVs will be annotated; if the value for a given site is negative (indicating that there is no violation) the annotation is not written to the file.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p>This annotation considers all possible combinations of all possible genotypes (homozygous-reference, heterozygous, and homozygous-variant) for each member of a trio, which amounts to 27 possible combinations. Using the Phred-scaled genotype likelihoods (PL values) from each individual, the likelihood of each combination is calculated, and the result contributes to the likelihood of the corresponding case (mendelian violation or non-violation) depending on which set it belongs to. See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of this statistical test.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <ul>
|
||||
* <li>The calculation assumes that the organism is diploid.</li>
|
||||
* <li>This annotation requires a valid pedigree file.</li>
|
||||
* <li>When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios.</li>
|
||||
* <li>This annotation can only be used from the Variant Annotator. If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>This tool assumes that the organism is diploid.</p>
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_PossibleDeNovo.php">PossibleDeNovo</a></b> annotates the existence of a de novo mutation in at least one of a set of families/trios.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>Note that this annotation requires a valid ped file.</p>
|
||||
*
|
||||
* <p>When multiple trios are present, the annotation is simply the maximum
|
||||
* of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain
|
||||
* sites and many trios.</p>
|
||||
*
|
||||
* <p>This annotation can only be used from the Variant Annotator.
|
||||
* If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect.
|
||||
* If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.</p>
|
||||
*/
|
||||
|
||||
public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiringAnnotation {
|
||||
|
|
|
|||
|
|
@ -61,12 +61,22 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* U-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities
|
||||
* Rank Sum Test for mapping qualities of REF vs. ALT reads
|
||||
*
|
||||
* <p>This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele).</p>
|
||||
* <p>This variant-level annotation compares the mapping qualities of the reads supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower mapping quality scores than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher mapping quality scores than those supporting the reference allele.</p>
|
||||
* <p>This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_RMSMappingQuality.php">RMSMappingQuality</a></b> gives an estimation of the overall read mapping quality supporting a variant call.</li>
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -74,7 +74,16 @@ import java.util.Map;
|
|||
|
||||
|
||||
/**
|
||||
* Total count across all samples of mapping quality zero reads
|
||||
* Count of all reads with MAPQ = 0 across all samples
|
||||
*
|
||||
* <p>This annotation gives you the count of all reads that have MAPQ = 0 across all samples. The count of reads with MAPQ = 0 can be used for quality control; high counts typically indicate regions where it is difficult to make confident calls.</p>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_MappingQualityZeroBySample.php">MappingQualityZeroBySample</a></b> gives the count of reads with MAPQ=0 for each individual sample.</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_LowMQ.php">LowMQ</a></b> gives the proportion of reads with low mapping quality (MAPQ below 10, including 0).</li>
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -71,27 +71,25 @@ import htsjdk.variant.variantcontext.VariantContext;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Tags variants with called genotypes that support the existence of a de novo mutation in at least one of the given families
|
||||
* Existence of a de novo mutation in at least one of the given families
|
||||
*
|
||||
* <p>Given a variant context, this tool uses the called genotypes (ideally after having been refined using PhaseByTransmission and CalculateGenotypePosteriors)
|
||||
* to identify possible de novo mutations and the sample in which they occur. </p>
|
||||
* <p>This annotation uses the genotype information from individuals in family trios to identify possible de novo mutations and the sample(s) in which they occur. This works best if the genotypes have been processed according to the <a href="https://www.broadinstitute.org/gatk/guide/article?id=4723">Genotype Refinement workflow</a>.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <ul>
|
||||
* <li>The calculation assumes that the organism is diploid.</li>
|
||||
* <li>This annotation requires a valid pedigree file.</li>
|
||||
* <li>Only reports possible de novos for children whose genotypes have not been tagged as filtered (which is most appropriate if parent likelihoods
|
||||
* have already been factored in using PhaseByTransmission).</li>
|
||||
* <li>When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios.</li>
|
||||
* <li>This annotation can only be used from the Variant Annotator. If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>This tool assumes that the organism is diploid.</p>
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_MVLikelihoodRatio.php">MVLikelihoodRatio</a></b> evaluates whether a site is transmitted from parents to offspring according to Mendelian rules or not.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>Note that this annotation requires a valid ped file.</p>
|
||||
*
|
||||
* <p>Only reports possible de novos for children where genotype is not filtered (which is most appropriate if parent likelihoods
|
||||
* have already been factored in using PhaseByTransmission).</p>
|
||||
*
|
||||
* <p>When multiple trios are present, the annotation is simply the maximum
|
||||
* of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain
|
||||
* sites and many trios.</p>
|
||||
*
|
||||
* <p>This annotation can only be used from the Variant Annotator.
|
||||
* If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect.
|
||||
* If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.</p>
|
||||
*/
|
||||
|
||||
public class PossibleDeNovo extends InfoFieldAnnotation implements RodRequiringAnnotation, ExperimentalAnnotation {
|
||||
|
|
|
|||
|
|
@ -71,10 +71,21 @@ import htsjdk.variant.variantcontext.VariantContext;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length.
|
||||
* Variant confidence normalized by unfiltered depth of variant samples
|
||||
*
|
||||
* Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing
|
||||
* reads associated with the samples with polymorphic genotypes.
|
||||
* <p>This annotation puts the variant confidence QUAL score in perspective by normalizing for the amount of coverage available. Because each read contributes a little to the QUAL score, variants in regions with deep coverage can have artificially inflated QUAL scores, giving the impression that the call is supported by more evidence than it really is. To compensate for this, we normalize the variant confidence by depth, which gives us a more objective picture of how well supported the call is.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p>The calculation only takes into account coverage from samples genotyped as having the variant allele(s). This removes the influence of any homozygous-reference samples that might be present in the same cohort, which would otherwise penalize the call unfairly.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <p>This annotation can only be calculated for sites for which at least one sample was genotyped as carrying a variant allele.</p>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_Coverage.php">Coverage</a></b> gives the filtered depth of coverage for each sample and the unfiltered depth across all samples.</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerAlleleBySample.php">DepthPerAlleleBySample</a></b> calculates depth of coverage for each allele per sample (AD).</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
// private final static Logger logger = Logger.getLogger(QualByDepth.class);
|
||||
|
|
|
|||
|
|
@ -72,7 +72,18 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* Root Mean Square of the mapping quality of the reads across all samples.
|
||||
* Root Mean Square of the mapping quality of reads across all samples.
|
||||
*
|
||||
* <p>This annotation provides an estimation of the overall mapping quality of reads supporting a variant call, averaged over all samples in a cohort.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
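* <p>The root mean square is the square root of the average of the squared mapping qualities. Its square is equal to the square of the mean plus the variance of the mapping qualities, so the value reflects both the average mapping quality and its spread.</p>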
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_MappingQualityRankSumTest.php">MappingQualityRankSumTest</a></b> compares the mapping quality of reads supporting the REF and ALT alleles.</li>
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -67,12 +67,20 @@ import org.broadinstitute.gatk.utils.sam.ReadUtils;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* U-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele
|
||||
* Rank Sum Test for relative positioning of REF vs. ALT alleles within reads
|
||||
*
|
||||
* <p>This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele. If the alternate allele is only seen near the ends of reads, this is indicative of error.</p>
|
||||
* <p>This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and alternate alleles. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported.</p>
|
||||
*
|
||||
* <p>The ideal result is a value close to zero, which indicates there is little to no difference in where the alleles are found relative to the ends of reads. A negative value indicates that the alternate allele is found at the ends of reads more often than the reference allele. Conversely, a positive value indicates that the reference allele is found at the ends of reads more often than the alternate allele. </p>
|
||||
*
|
||||
* <p>This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in relative position either way suggests that the sequencing process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p>The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of the ranksum test.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
|
||||
*
|
||||
*/
|
||||
public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -69,7 +69,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* List all of the polymorphic samples.
|
||||
* List of samples that are polymorphic at a given site
|
||||
*
|
||||
* <p>The output is a list of the samples that are genotyped as having one or more variant alleles. This allows you to easily determine which samples are polymorphic and compare them to samples that are homozygous-reference.</p>
|
||||
*/
|
||||
public class SampleList extends InfoFieldAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -70,9 +70,16 @@ import java.util.Map;
|
|||
|
||||
|
||||
/**
|
||||
* Fraction of reads containing spanning deletions at this site
|
||||
* Fraction of reads containing spanning deletions
|
||||
*
|
||||
* <p>The presence of many reads with deletions spanning a given site is often an indication that a variant call made at that site is in fact a false positive. This annotation counts the number of reads that contain deletions spanning the site divided by the total number of reads that cover the site.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <ul>
|
||||
* <li>This annotation is not compatible with HaplotypeCaller; its purpose is to compensate for the UnifiedGenotyper's inability to integrate SNPs and indels in the same model (unlike HaplotypeCaller).</li>
|
||||
* <li>By default, the UnifiedGenotyper will not call variants where the fraction of spanning deletions is above a certain threshold. This threshold can be adjusted using the `--max_deletion_fraction` argument.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>Note that this annotation is currently not compatible with HaplotypeCaller.</p>
|
||||
*/
|
||||
public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -66,11 +66,30 @@ import htsjdk.variant.vcf.VCFHeaderLineType;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias
|
||||
* User: rpoplin
|
||||
* Date: 8/28/13
|
||||
* Number of forward and reverse reads that support REF and ALT alleles
|
||||
*
|
||||
* <p>Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandBiasBySample annotation produces read counts per allele and per strand that are used by other annotation modules (FisherStrand and StrandOddsRatio) to estimate strand bias using statistical approaches.</p>
|
||||
*
|
||||
* <p>This annotation produces 4 values, corresponding to the number of reads that support the following (in that order):</p>
|
||||
* <ul>
|
||||
* <li>the reference allele on the forward strand</li>
|
||||
* <li>the reference allele on the reverse strand</li>
|
||||
* <li>the alternate allele on the forward strand</li>
|
||||
* <li>the alternate allele on the reverse strand</li>
|
||||
* </ul>
|
||||
*
|
||||
* <h3>Example</h3>
|
||||
* <pre>GT:AD:GQ:PL:SB 0/1:53,51:99:1758,0,1835:23,30,33,18</pre>
|
||||
* <p>In this example, the reference allele is supported by 23 forward reads and 30 reverse reads, the alternate allele is supported by 33 forward reads and 18 reverse reads.</p>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_FisherStrand.php">FisherStrand</a></b> uses Fisher's Exact Test to evaluate strand bias.</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_StrandOddsRatio.php">StrandOddsRatio</a></b> is an updated form of FisherStrand that uses a symmetric odds ratio calculation.</li>
|
||||
* </ul>
|
||||
*/
|
||||
|
||||
|
||||
public class StrandBiasBySample extends GenotypeAnnotation {
|
||||
|
||||
public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB";
|
||||
|
|
|
|||
|
|
@ -67,18 +67,37 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Symmetric Odds Ratio to detect strand bias
|
||||
* Strand bias estimated by the Symmetric Odds Ratio test
|
||||
*
|
||||
* <p> Odds Ratios in the 2x2 contingency table below are R = (X[0][0] * X[1][1]) / (X[0][1] * X[1][0]) and its inverse
|
||||
* + strand - strand
|
||||
* Ref X[0][0] X[0][1]
|
||||
* Alt X[1][0] X[1][1]
|
||||
* The sum R + 1/R is used to detect a difference in strand bias for ref and for alt (the sum makes it symmetric):
|
||||
* A high value is indicative of large difference where one entry is very small compared to the others.
|
||||
* <p>Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandOddsRatio annotation is one of several methods that aims to evaluate whether there is strand bias in the data. It is an updated form of the Fisher Strand Test that is better at taking into account large amounts of data in high coverage situations. It is used to determine if there is strand bias between forward and reverse strands for the reference or alternate allele.</p>
|
||||
*
|
||||
* A scale factor of refRatio/altRatio where refRatio = (max(X[0][0], X[0][1])/min(X[0][0], X[0][1])) and
|
||||
* altRatio = (max(X[1][0], X[1][1])/min(X[1][0], X[1][1])) ensures that the annotation value is large only
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p> Odds Ratios in the 2x2 contingency table below are
|
||||
*
|
||||
* $$ R = \frac{X[0][0] * X[1][1]}{X[0][1] * X[1][0]} $$
|
||||
*
|
||||
* and its inverse:
|
||||
*
|
||||
* <table>
|
||||
* <tr><td> </td><td>+ strand </td><td>- strand</td></tr>
|
||||
* <tr><td>REF</td><td>X[0][0]</td><td>X[0][1]</td></tr>
|
||||
* <tr><td>ALT</td><td>X[1][0]</td><td>X[1][1]</td></tr>
|
||||
* </table>
|
||||
*
|
||||
* The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT (the sum makes it symmetric). A high value is indicative of large difference where one entry is very small compared to the others. A scale factor of refRatio/altRatio where
|
||||
* $$ refRatio = \frac{max(X[0][0], X[0][1])}{min(X[0][0], X[0][1])} $$
|
||||
* and
|
||||
* $$ altRatio = \frac{max(X[1][0], X[1][1])}{min(X[1][0], X[1][1])} $$
|
||||
* ensures that the annotation value is large only.
|
||||
* </p>
|
||||
* <p>See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a> for a more detailed explanation of this statistical test.</p>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_StrandBiasBySample.php">StrandBiasBySample</a></b> outputs counts of read depth per allele for each strand orientation.</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_FisherStrand.php">FisherStrand</a></b> uses Fisher's Exact Test to evaluate strand bias.</li>
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
private final static double AUGMENTATION_CONSTANT = 1.0;
|
||||
|
|
|
|||
|
|
@ -71,12 +71,17 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Annotates variants that are composed of tandem repeats
|
||||
* Tandem repeat unit composition and counts per allele
|
||||
*
|
||||
* <p>This tool outputs the number of times the tandem repeat unit is repeated, for each allele (including reference).</p>
|
||||
* <p>This annotation tags variants that fall within tandem repeat sets. It also provides the composition of the tandem repeat units and the number of times they are repeated for each allele (including the REF allele).</p>
|
||||
*
|
||||
* <p>A tandem repeat unit is composed of one or more nucleotides that are repeated multiple times in series. Repetitive sequences are difficult to map to the reference because they are associated with multiple alignment possibilities. Knowing the number of repeat units in a set of tandem repeats tells you the number of different positions the tandem repeat can be placed in. The observation of many tandem repeat units multiplies the number of possible representations that can be made of the region.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <ul>
|
||||
* <li>This annotation is currently not compatible with HaplotypeCaller.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <h2>Caveat</h2>
|
||||
* <p>This annotation is currently not compatible with HaplotypeCaller.</p>
|
||||
*/
|
||||
public class TandemRepeatAnnotator extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
private static final String STR_PRESENT = "STR";
|
||||
|
|
|
|||
|
|
@ -72,16 +72,19 @@ import java.util.*;
|
|||
/**
|
||||
* Wittkowski transmission disequilibrium test
|
||||
*
|
||||
* <p>Test statistic from Wittkowski transmission disequilibrium test.
|
||||
* The calculation is based on the following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT</p>
|
||||
* <p>This annotation indicates the presence of a genetic linkage between a genetic marker and a genetic trait.</p>
|
||||
*
|
||||
* <h3>Statistical notes</h3>
|
||||
* <p>The calculation is based on the derivation described in <a href="http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT">http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT</a>.</p>
|
||||
*
|
||||
* <p>Note that this annotation requires a valid ped file.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>This annotation can only be used with VariantAnnotator (not with UnifiedGenotyper or HaplotypeCaller).</p>
|
||||
* <ul>
|
||||
* <li>This annotation requires a valid pedigree file.</li>
|
||||
* <li>This annotation can only be used with VariantAnnotator (not with UnifiedGenotyper or HaplotypeCaller).</li>
|
||||
* </ul>
|
||||
*
|
||||
* @author rpoplin, lfran, ebanks
|
||||
* @since 11/14/11
|
||||
*/
|
||||
|
||||
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements RodRequiringAnnotation {
|
||||
|
|
|
|||
|
|
@ -65,10 +65,9 @@ import htsjdk.variant.variantcontext.VariantContext;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.)
|
||||
* General category of variant
|
||||
*
|
||||
* <p>This tool assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.).
|
||||
* It also specifies whether the variant is multiallelic (>2 alleles).</p>
|
||||
* <p>This annotation assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.). It also specifies whether the variant is multiallelic (>2 alleles).</p>
|
||||
*/
|
||||
public class VariantType extends InfoFieldAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -50,11 +50,27 @@ import java.util.Map;
|
|||
/**
|
||||
* Allele balance across all samples
|
||||
*
|
||||
* <p>The allele balance is the fraction of ref bases over ref + alt bases.</p>
|
||||
*
|
||||
* <p> This is an experimental annotation that attempts to estimate whether the data supporting a variant call fits allelic ratio expectations, or whether there might be some bias in the data. Each sample will contribute its allelic read depth (from the AD annotation) to either ABHom or ABHet depending on its genotype call: ABHom if the call is homozygous (REF/REF or ALT/ALT), and ABHet if the call is heterozygous (REF/ALT). Additionally, reads that support something other than the genotyped alleles (called "non-alleles") will be counted in the OND tag, which represents the overall fraction of data that diverges from the diploid hypothesis.</p>
|
||||
* <h3>Calculations</h3>
|
||||
* <p> $$ ABHom = \frac{# ALT alleles}{total # alleles} $$ <br />
|
||||
* $$ ABHet = \frac{# REF alleles}{total # alleles} $$ <br />
|
||||
* $$ OND = \frac{# genotyped alleles}{# alleles + # non-alleles} $$
|
||||
* </p>
|
||||
* <p> For ABHom, the value should be close to 1.00 because ideally, all the reads should support a single allele. For ABHet, the value should be close to 0.5, so half of the alleles support the ref allele and half of the alleles support the alt allele. Divergence from these expected ratios may indicate that there is some bias in favor of one allele. Note the caveats below regarding cancer and RNAseq analysis. </p>
|
||||
* <h3>Caveats</h3>
|
||||
* <p>Note that this annotation will only work properly for biallelic samples that are called as heterozygous.</p>
|
||||
* <ul>
|
||||
* <li>This annotation will only work properly for biallelic variants where all samples are called heterozygous or homozygous.</li>
|
||||
* <li>This annotation cannot currently be calculated for indels.</li>
|
||||
* <li>The reasoning underlying this annotation only applies to germline variants in DNA sequencing data. In somatic/cancer analysis, divergent ratios are expected due to tumor heterogeneity. In RNAseq analysis, divergent ratios may indicate differential allele expression.</li>
|
||||
* <li>As stated above, this annotation is experimental and should be interpreted with caution as we cannot guarantee that it is appropriate. Basically, use it at your own risk.</li>
|
||||
* </ul>
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AlleleBalanceBySample.php">AlleleBalanceBySample</a></b> calculates allele balance for each individual sample.</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerAlleleBySample.php">DepthPerAlleleBySample</a></b> calculates depth of coverage for each allele per sample.</li>
|
||||
* </ul>
|
||||
*/
|
||||
|
||||
public class AlleleBalance extends InfoFieldAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
|
|
@ -196,7 +212,7 @@ public class AlleleBalance extends InfoFieldAnnotation {
|
|||
|
||||
public List<String> getKeyNames() { return Arrays.asList("ABHet","ABHom","OND"); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ABHet", 1, VCFHeaderLineType.Float, "Allele Balance for hets (ref/(ref+alt))"),
|
||||
new VCFInfoHeaderLine("ABHom", 1, VCFHeaderLineType.Float, "Allele Balance for homs (A/(A+O))"),
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ABHet", 1, VCFHeaderLineType.Float, "Allele Balance for heterozygous calls (ref/(ref+alt))"),
|
||||
new VCFInfoHeaderLine("ABHom", 1, VCFHeaderLineType.Float, "Allele Balance for homozygous calls (A/(A+O)) where A is the allele (ref or alt) and O is anything other"),
|
||||
new VCFInfoHeaderLine("OND", 1, VCFHeaderLineType.Float, "Overall non-diploid ratio (alleles/(alleles+non-alleles))")); }
|
||||
}
|
||||
|
|
@ -55,11 +55,22 @@ import java.util.Set;
|
|||
/**
|
||||
* Allele balance per sample
|
||||
*
|
||||
* <p>The allele balance is the fraction of ref bases over ref + alt bases.</p>
|
||||
*
|
||||
* <p> This is an experimental annotation that attempts to estimate whether the data supporting a heterozygous genotype call fits allelic ratio expectations, or whether there might be some bias in the data.</p>
|
||||
* <h3>Calculation</h3>
|
||||
* <p> $$ AB = \frac{# ALT alleles}{total # alleles} $$ </p>
|
||||
* <p> Ideally, the value of AB should be close to 0.5, so half of the alleles support the ref allele and half of the alleles support the alt allele. Divergence from the expected ratio may indicate that there is some bias in favor of one allele. Note the caveats below regarding cancer and RNAseq analysis. </p>
|
||||
* <h3>Caveats</h3>
|
||||
* <p>Note that this annotation will only work properly for biallelic samples that are called as heterozygous.</p>
|
||||
* <h4>This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.</h4>
|
||||
* <ul>
|
||||
* <li>This annotation will only work properly for biallelic heterozygous calls.</li>
|
||||
* <li>This annotation cannot currently be calculated for indels.</li>
|
||||
* <li>The reasoning underlying this annotation only applies to germline variants in DNA sequencing data. In somatic/cancer analysis, divergent ratios are expected due to tumor heterogeneity. In RNAseq analysis, divergent ratios may indicate differential allele expression.</li>
|
||||
* <li>As stated above, this annotation is experimental and should be interpreted with caution as we cannot guarantee that it is appropriate. Basically, use it at your own risk.</li>
|
||||
* </ul>
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AlleleBalance.php">AlleleBalance</a></b> is a generalization of this annotation over all samples.</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerAlleleBySample.php">DepthPerAlleleBySample</a></b> calculates depth of coverage for each allele per sample.</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -44,8 +44,23 @@ import java.util.Map;
|
|||
|
||||
/**
|
||||
* Count of A, C, G, T bases across all samples
|
||||
*
|
||||
* <p> This annotation returns the counts of A, C, G, and T bases across all samples, in that order.</p>
|
||||
* <h3>Example:</h3>
|
||||
*
|
||||
* <pre>BaseCounts=3,0,3,0</pre>
|
||||
*
|
||||
* <p>
|
||||
* This means the number of A bases seen is 3, the number of C bases seen is 0, the number of G bases seen is 3, and the number of T bases seen is 0.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_NBaseCount.php">NBaseCount</a></b> counts the percentage of N bases.</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class BaseCounts extends InfoFieldAnnotation {
|
||||
|
||||
public class BaseCounts extends InfoFieldAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
|
|
|
|||
|
|
@ -43,7 +43,19 @@ import java.util.Map;
|
|||
|
||||
|
||||
/**
|
||||
* Triplet annotation: fraction of MAQP == 0, MAPQ < 10, and count of all mapped reads
|
||||
* Proportion of low quality reads
|
||||
*
|
||||
* <p>This annotation tells you what fraction of reads have a mapping quality of less than the given threshold of 10 (including 0). Note that certain tools may impose a different minimum mapping quality threshold. For example, HaplotypeCaller excludes reads with MAPQ<20.</p>
|
||||
*
|
||||
* <h3>Calculation</h3>
|
||||
* <p> $$ LowMQ = \frac{# reads with MAPQ=0 + # reads with MAPQ<10}{total # reads} $$
|
||||
* </p>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_MappingQualityZero.php">MappingQualityZero</a></b> gives the count of reads with MAPQ=0 across all samples.</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_MappingQualityZeroBySample.php">MappingQualityZeroBySample</a></b> gives the count of reads with MAPQ=0 for each individual sample.</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class LowMQ extends InfoFieldAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -44,7 +44,15 @@ import java.util.Arrays;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Count for each sample of mapping quality zero reads
|
||||
* Count of reads with mapping quality zero for each sample
|
||||
*
|
||||
* <p>This annotation gives you the count of all reads that have MAPQ = 0 for each sample. The count of reads with MAPQ0 can be used for quality control; high counts typically indicate regions where it is difficult to make confident calls.</p>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_MappingQualityZero.php">MappingQualityZero</a></b> gives the count of reads with MAPQ=0 across all samples.</li>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_LowMQ.php">LowMQ</a></b> gives the proportion of reads with low mapping quality (MAPQ below 10, including 0).</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class MappingQualityZeroBySample extends GenotypeAnnotation {
|
||||
public void annotate(final RefMetaDataTracker tracker,
|
||||
|
|
|
|||
|
|
@ -43,8 +43,18 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The number of N bases, counting only SOLiD data
|
||||
*/
|
||||
* Percentage of N bases
|
||||
*
|
||||
* <p>N occurs in a sequence when the sequencer does not have enough information to determine which base it should call. The presence of many Ns at the same site lowers our confidence in any calls made there, because it suggests that there was some kind of technical difficulty that interfered with the sequencing process.</p>
|
||||
*
|
||||
* <p><b>Note that in GATK versions 3.2 and earlier, this annotation only counted N bases from reads generated with SOLiD technology. This functionality was generalized for all sequencing platforms in GATK version 3.3.</b></p>
|
||||
*
|
||||
* <h3>Related annotations</h3>
|
||||
* <ul>
|
||||
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_BaseCounts.php">BaseCounts</a></b> counts the number of A, C, G, T bases across all samples.</li>
|
||||
* </ul>
|
||||
*
|
||||
* */
|
||||
public class NBaseCount extends InfoFieldAnnotation {
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
|
|
@ -55,27 +65,25 @@ public class NBaseCount extends InfoFieldAnnotation {
|
|||
if( stratifiedContexts.size() == 0 )
|
||||
return null;
|
||||
|
||||
int countNBaseSolid = 0;
|
||||
int countRegularBaseSolid = 0;
|
||||
int countNBase = 0;
|
||||
int countRegularBase = 0;
|
||||
|
||||
for( final AlignmentContext context : stratifiedContexts.values() ) {
|
||||
for( final PileupElement p : context.getBasePileup()) {
|
||||
final String platform = p.getRead().getReadGroup().getPlatform();
|
||||
if( platform != null && platform.toUpperCase().contains("SOLID") ) {
|
||||
if( BaseUtils.isNBase( p.getBase() ) ) {
|
||||
countNBaseSolid++;
|
||||
} else if( BaseUtils.isRegularBase( p.getBase() ) ) {
|
||||
countRegularBaseSolid++;
|
||||
}
|
||||
if( BaseUtils.isNBase( p.getBase() ) ) {
|
||||
countNBase++;
|
||||
} else if( BaseUtils.isRegularBase( p.getBase() ) ) {
|
||||
countRegularBase++;
|
||||
}
|
||||
}
|
||||
}
|
||||
final Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%.4f", (double)countNBaseSolid / (double)(countNBaseSolid + countRegularBaseSolid + 1)));
|
||||
map.put(getKeyNames().get(0), String.format("%.4f", (double)countNBase / (double)(countNBase + countRegularBase + 1)));
|
||||
return map;
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList("PercentNBaseSolid"); }
|
||||
public List<String> getKeyNames() { return Arrays.asList("PercentNBase"); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("PercentNBaseSolid", 1, VCFHeaderLineType.Float, "Percentage of N bases in the pileup (counting only SOLiD reads)")); }
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("PercentNBase", 1, VCFHeaderLineType.Float, "Percentage of N bases in the pileup")); }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,15 +45,14 @@ import java.util.*;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* A set of genomic annotations based on the output of the SnpEff variant effect predictor tool
|
||||
* Top effect from SnpEff functional predictions
|
||||
*
|
||||
* <p>See <a href='http://snpeff.sourceforge.net/'>http://snpeff.sourceforge.net/</a> for more information on the SnpEff tool</p>.
|
||||
* <p>This annotation processes the output of the SnpEff functional prediction tool to select only the predicted effect with the highest biological impact. The SnpEff output must be provided on the command line by specifying "--snpEffFile filename.vcf". See <a href="http://snpeff.sourceforge.net/">http://snpeff.sourceforge.net/</a> for more information about the SnpEff tool.</p>
|
||||
*
|
||||
* <p>For each variant, this tool chooses one of the effects of highest biological impact from the SnpEff
|
||||
* output file (which must be provided on the command line via --snpEffFile filename.vcf),
|
||||
* and adds annotations on that effect.</p>
|
||||
* <h3>Caveats</h3>
|
||||
*
|
||||
* <ul><li>This annotation currently only supports output from SnpEff version 2.0.5.</li></ul>
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotation {
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue