From 06d78ba0680bf3ba306d01680cef5e525ca26304 Mon Sep 17 00:00:00 2001
From: Laura Gauthier
Date: Fri, 4 Apr 2014 10:27:09 -0400
Subject: [PATCH] Expanded documentation to include description of which
callsets are being compared in what order and more definitions
---
.../variantutils/GenotypeConcordance.java | 104 ++++++++++++------
1 file changed, 69 insertions(+), 35 deletions(-)
diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java
index 08c938583..1bef3134a 100755
--- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java
+++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java
@@ -68,49 +68,62 @@ import java.util.*;
* Output
*
* Genotype Concordance writes a GATK report to the specified file (via -o), consisting of multiple tables of counts
- * and proportions. These tables are constructed on a per-sample basis, and include counts of EVAL vs COMP genotype states, and the
- * number of times the alternate alleles between the EVAL and COMP sample did not match up.
+ * and proportions. These tables are constructed on a per-sample basis, and include counts of EVAL vs COMP genotype
+ * states.
+ *
+ * Tables
+ *
+ * Headers for the (non-moltenized -- see below) GenotypeConcordance counts and proportions tables give the genotype of
+ * the COMP callset followed by the genotype of the EVAL callset. For example, the value corresponding to HOM_REF_HET
+ * reflects variants called HOM_REF in the COMP callset and HET in the EVAL callset. Variants for which the alternate
+ * alleles between the EVAL and COMP sample did not match are excluded from genotype comparisons and given in the
+ * "Mismatching_Alleles" field.
+ *
+ *
+ * It may be informative to reshape rows of the GenotypeConcordance counts and proportions tables into separate row-major tables
+ * where the columns indicate the COMP genotype and the rows indicate the EVAL genotype for easy comparison between the
+ * two callsets. This can be done with a command similar to d <- matrix(sampleRow,nrow=6,byrow=T) in R where sampleRow is the 36-value row corresponding to the sample of interest, excluding "Mismatching_Alleles".
+ * In Excel this can be accomplished using the OFFSET function.
+ *
+ *
+ * - GenotypeConcordance_CompProportions: Gives the proportion of genotypes in each category normalized to the total number of called genotypes in the COMP callset
+ * - GenotypeConcordance_Counts: Gives the counts for number of genotypes in each category
+ * - GenotypeConcordance_EvalProportions: Gives the proportion of genotypes in each category normalized to the total number of called genotypes in the EVAL callset
+ * - GenotypeConcordance_Summary: Summary statistics for the sum of all samples and each sample individually. See below for definitions.
+ * - SiteConcordance_Summary: Gives comparison counts of called genotypes and their alleles between the two callsets. See below for definitions.
+ *
*
*
* Term and metrics definitions
*
*
- * - HET: heterozygous
- * - HOM_REF: homozygous reference
- * - HOM_VAR: homozygous variant
- * - MIXED: something like ./1
- * - ALLELES_MATCH: counts of calls at the same site where the alleles match
- * - ALLELES_DO_NOT_MATCH: counts of calls at the same location with different alleles, such as the eval set calling a 'G' alternate allele, and the comp set calling a 'T' alternate allele
- * - EVAL_ONLY: counts of sites present only in the EVAL set, not in the COMP set
- * - TRUTH_ONLY: counts of sites present only in the COMP set, not in the EVAL set
- * - Non-Reference_Discrepancy (NRD): genotype concordance excluding concordant reference sites
- * - Non-Reference_Sensitivity (NRS): sensitivity of the EVAL calls to polymorphic calls in the COMP set, calculated by (# true positive)/(# true polymorphic)
- * - Overall_Genotype_Concordance: overall concordance calculated by (# concordant genotypes)/(# genotypes)
+ * - GenotypeConcordance_CompProportions, GenotypeConcordance_Counts, and GenotypeConcordance_EvalProportions
+ *
+ * - NO_CALL: reported genotype is ./., indicating not enough data to call
+ * - HET: heterozygous
+ * - HOM_REF: homozygous reference
+ * - HOM_VAR: homozygous variant
+ * - UNAVAILABLE: variant is not called in this callset
+ * - MIXED: something like ./1
+ *
+ * - GenotypeConcordance_Summary
+ *
+ * - Non-Reference_Sensitivity (NRS): sensitivity of the EVAL calls to polymorphic calls in the COMP set, calculated by (# true positive)/(# true polymorphic)
+ * - Non-Reference_Discrepancy (NRD): genotype discordance excluding concordant reference sites, calculated by (# discordant sites)/(total excluding # HOM_REF_HOM_REF) = 1.0-(# HOM_VAR_HOM_VAR + # HET_HET)/(total excluding # HOM_REF_HOM_REF)
+ * - Overall_Genotype_Concordance: overall concordance calculated by (# concordant genotypes)/(# genotypes)
+ *
+ * - SiteConcordance_Summary
+ *
+ * - ALLELES_MATCH: counts of calls at the same site where the alleles match
+ * - ALLELES_DO_NOT_MATCH: counts of calls at the same location with different alleles, such as the EVAL set calling a 'G' alternate allele, and the COMP set calling a 'T' alternate allele
+ * - EVAL_SUBSET_TRUTH: (multi-allelic sites only) ALT alleles for EVAL are a subset of ALT alleles for COMP. See also below.
+ * - EVAL_SUPERSET_TRUTH: (multi-allelic sites only) ALT alleles for COMP are a subset of ALT alleles for EVAL. See also below.
+ * - EVAL_ONLY: counts of sites present only in the EVAL set, not in the COMP set
+ * - TRUTH_ONLY: counts of sites present only in the COMP set, not in the EVAL set
+ *
*
*
*
- * Moltenized tables
- *
- * These tables may be optionally moltenized via the -moltenize argument. That is, the standard table
- *
- *
- * Sample NO_CALL_HOM_REF NO_CALL_HET NO_CALL_HOM_VAR (...)
- * NA12878 0.003 0.001 0.000 (...)
- * NA12891 0.005 0.000 0.000 (...)
- *
- *
- * would instead be displayed
- *
- *
- * NA12878 NO_CALL_HOM_REF 0.003
- * NA12878 NO_CALL_HET 0.001
- * NA12878 NO_CALL_HOM_VAR 0.000
- * NA12891 NO_CALL_HOM_REF 0.005
- * NA12891 NO_CALL_HET 0.000
- * NA12891 NO_CALL_HOM_VAR 0.000
- * (...)
- *
- *
* Site-level allelic concordance
*
*
@@ -158,6 +171,27 @@ import java.util.*;
* in which case all records are used. There is currently no way to assess concordance metrics on filtered sites
* exclusively. SelectVariants can be used to extract filtered sites, and VariantFiltration used to un-filter them.
*
+ * Moltenized tables
+ *
+ * These tables may be optionally moltenized via the -moltenize argument. That is, the standard table
+ *
+ *
+ * Sample NO_CALL_HOM_REF NO_CALL_HET NO_CALL_HOM_VAR (...)
+ * NA12878 0.003 0.001 0.000 (...)
+ * NA12891 0.005 0.000 0.000 (...)
+ *
+ *
+ * would instead be displayed
+ *
+ *
+ * NA12878 NO_CALL_HOM_REF 0.003
+ * NA12878 NO_CALL_HET 0.001
+ * NA12878 NO_CALL_HOM_VAR 0.000
+ * NA12891 NO_CALL_HOM_REF 0.005
+ * NA12891 NO_CALL_HET 0.000
+ * NA12891 NO_CALL_HOM_VAR 0.000
+ * (...)
+ *
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )