From 4450d66c647f949da89e9590d2c1e336e82da742 Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Mon, 20 Aug 2012 15:10:24 -0400
Subject: [PATCH] Fixing the docs for DP and AD

---
 .../gatk/walkers/annotator/DepthOfCoverage.java | 17 ++++-------------
 .../annotator/DepthPerAlleleBySample.java       | 12 ++++++------
 2 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java
index 28ca77f18..39b5e84dc 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java
@@ -22,19 +22,10 @@ import java.util.Map;
 /**
  * Total (unfiltered) depth over all samples.
  *
- * This and AD are complementary fields that are two important ways of thinking about the depth of the data for this sample
- * at this site.  The DP field describe the total depth of reads that passed the Unified Genotypers internal
- * quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
- * The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the
- * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
- * power I have to determine the genotype of the sample at this site, while the AD tells me how many times
- * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
- * the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like
- * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
- * normally be excluded from the statistical calculations going into GQ and QUAL.
- *
- * Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with
- * -dcov D is N * D
+ * While the sample-level (FORMAT) DP field describes the total depth of reads that passed the Unified Genotyper's
+ * internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth
+ * over all samples.  Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for
+ * N samples with -dcov D is N * D.
  */
 public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java
index a9edab752..5d83ddd51 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java
@@ -24,10 +24,10 @@ import java.util.List;
 /**
  * The depth of coverage of each VCF allele in this sample.
  *
- * This and DP are complementary fields that are two important ways of thinking about the depth of the data for this sample
- * at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal
- * quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
- * The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the
+ * The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this
+ * sample at this site.  While the sample-level (FORMAT) DP field describes the total depth of reads that passed the
+ * Unified Genotyper's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of
+ * REF and ALT fields) are the unfiltered counts of all reads that carried with them the
  * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
  * power I have to determine the genotype of the sample at this site, while the AD tells me how many times
  * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
@@ -35,10 +35,10 @@ import java.util.List;
  * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
  * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
  * the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that
- * are actually present and correctly left-aligned in the alignments themselves). Because of this fact and
+ * are unambiguously informative about the alternate allele). Because of this fact and
  * because the AD includes reads and bases that were filtered by the Unified Genotyper, <b>one should not base
  * assumptions about the underlying genotype based on it</b>; instead, the genotype likelihoods (PLs) are what
- * determine the genotype calls (see below).
+ * determine the genotype calls.
  */
 public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {