From 2e35592295267f9832647a43658e6972a290289c Mon Sep 17 00:00:00 2001
From: Mark DePristo <depristo@broadinstitute.org>
Date: Wed, 17 Aug 2011 16:32:01 -0400
Subject: [PATCH 01/13] GATKDocs for CallableLoci

---
 .../walkers/coverage/CallableLociWalker.java  | 185 +++++++++++++++---
 1 file changed, 163 insertions(+), 22 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
index 90e6fcd77..3046ef925 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
@@ -42,50 +42,190 @@ import java.io.PrintStream;
 /**
  * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome
  *
- * @Author depristo
- * @Date May 7, 2010
+ * <p>
+ * A very common question about a NGS set of reads is what areas of the genome are considered callable. The system
+ * considers the coverage at each locus and emits either a per base state or a summary interval BED file that
+ * partitions the genomic intervals into the following callable states:
+ * <dl>
+ *     <dt>REF_N</dt>
+ *        <dd>the reference base was an N, which is not considered callable the GATK</dd>
+ *     <dt>CALLABLE</dt>
+ *        <dd>the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE</dd>
+ *     <dt>NO_COVERAGE</dt>
+ *        <dd>absolutely no reads were seen at this locus, regardless of the filtering parameters</dd>
+ *     <dt>LOW_COVERAGE</dt>
+ *        <dd>there were less than min. depth bases at the locus, after applying filters</dd>
+ *     <dt>EXCESSIVE_COVERAGE</dt>
+ *        <dd>more than -maxDepth read at the locus, indicating some sort of mapping problem</dd>
+ *     <dt>POOR_MAPPING_QUALITY</dt>
+ *        <dd>more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads</dd>
+ * </dl>
+ * </p>
+ *
+ * <h2>Input</h2>
+ * <p>
+ *     A BAM file containing <b>exactly one sample</b>.
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * <ul>
+ *     <li>-o: a OutputFormatted (recommended BED) file with the callable status covering each base</li>
+ *     <li>-summary: a table of callable status x count of all examined bases</li>
+ * </ul>
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ *     -T CallableLociWalker \
+ *     -I my.bam \
+ *     -summary my.summary \
+ *     -o my.bed
+ * </pre>
+ *
+ * would produce a BED file (my.bed) that looks like:
+ *
+ * <pre>
+ *     20 10000000 10000864 CALLABLE
+ *     20 10000865 10000985 POOR_MAPPING_QUALITY
+ *     20 10000986 10001138 CALLABLE
+ *     20 10001139 10001254 POOR_MAPPING_QUALITY
+ *     20 10001255 10012255 CALLABLE
+ *     20 10012256 10012259 POOR_MAPPING_QUALITY
+ *     20 10012260 10012263 CALLABLE
+ *     20 10012264 10012328 POOR_MAPPING_QUALITY
+ *     20 10012329 10012550 CALLABLE
+ *     20 10012551 10012551 LOW_COVERAGE
+ *     20 10012552 10012554 CALLABLE
+ *     20 10012555 10012557 LOW_COVERAGE
+ *     20 10012558 10012558 CALLABLE
+ *     et cetera...
+ * </pre>
+ * as well as a summary table that looks like:
+ *
+ * <pre>
+ *                        state nBases
+ *                        REF_N 0
+ *                     CALLABLE 996046
+ *                  NO_COVERAGE 121
+ *                 LOW_COVERAGE 928
+ *           EXCESSIVE_COVERAGE 0
+ *         POOR_MAPPING_QUALITY 2906
+ * </pre>
+ *
+ * @author Mark DePristo
+ * @since May 7, 2010
  */
 @By(DataSource.REFERENCE)
 public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableBaseState, CallableLociWalker.Integrator> {
     @Output
     PrintStream out;
 
-    @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read.  The gap between this value and mmq are reads that are not sufficiently well mapped for calling but aren't indicative of mapping problems.", required = false)
+    /**
+     * Callable loci summary counts (see outputs) will be written to this file.
+     */
+    @Output(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true)
+    File summaryFile;
+
+    /**
+     * The gap between this value and mmq are reads that are not sufficiently well mapped for calling but
+     * aren't indicative of mapping problems.  For example, if maxLowMAPQ = 1 and mmq = 20, then reads with
+     * MAPQ == 0 are poorly mapped, MAPQ >= 20 are considered as contributing to calling, where
+     * reads with MAPQ >= 1 and < 20 are not bad in and of themselves but aren't sufficiently good to contribute to
+     * calling.  In effect this reads are invisible, driving the base to the NO_ or LOW_COVERAGE states
+     */
+    @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read.", required = false)
     byte maxLowMAPQ = 1;
 
-    @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false)
+    /**
+     * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE
+     * state.
+     */
+    @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false)
     byte minMappingQuality = 10;
 
-    @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
+    /**
+     * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state
+     */
+    @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false)
     byte minBaseQuality = 20;
 
+    /**
+     * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
+     * value and is less than maxDepth the site is considered CALLABLE.
+     */
     @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
     int minDepth = 4;
 
+    /**
+     * If the QC+ depth exceeds this value the site is considered to have EXCESSIVE_DEPTH
+     */
     @Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
     int maxDepth = -1;
 
+    /**
+     * We don't want to consider a site as POOR_MAPPING_QUALITY just because it has two reads, and one is MAPQ.  We
+     * won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads
+     * covering the site.
+     */
     @Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false)
     int minDepthLowMAPQ = 10;
 
-    @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
+    /**
+     * If the number of reads at this site is greater than minDepthForLowMAPQ and the fraction of reads with low mapping quality
+     * exceeds this fraction then the site has POOR_MAPPING_QUALITY.
+     */
+    @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "If the fraction of reads at a base with low mapping quality exceeds this value, the site may be poorly mapped", required = false)
     double maxLowMAPQFraction = 0.1;
 
-    @Argument(fullName = "format", shortName = "format", doc = "Output format for the system: either BED or STATE_PER_BASE", required = false)
+    /**
+     * The output of this walker will be written in this format.  The recommended option is BED.
+     */
+    @Argument(fullName = "format", shortName = "format", doc = "Output format", required = false)
     OutputFormat outputFormat;
 
-    @Argument(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true)
-    File summaryFile;
+    public enum OutputFormat {
+        /**
+         * The output will be written as a BED file.  There's a BED element for each
+         * continuous run of callable states (i.e., CALLABLE, REF_N, etc).  This is the recommended
+         * format
+         */
+        BED,
 
-    public enum OutputFormat { BED, STATE_PER_BASE }
+        /**
+         * Emit chr start stop state quads for each base.  Produces a potentially disasterously
+         * large amount of output.
+         */
+        STATE_PER_BASE
+    }
+
+    public enum CalledState {
+        /** the reference base was an N, which is not considered callable the GATK */
+        REF_N,
+        /** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */
+        CALLABLE,
+        /** absolutely no reads were seen at this locus, regardless of the filtering parameters */
+        NO_COVERAGE,
+        /** there were less than min. depth bases at the locus, after applying filters */
+        LOW_COVERAGE,
+        /** more than -maxDepth read at the locus, indicating some sort of mapping problem */
+        EXCESSIVE_COVERAGE,
+        /** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */
+        POOR_MAPPING_QUALITY
+    }
 
     ////////////////////////////////////////////////////////////////////////////////////
     // STANDARD WALKER METHODS
     ////////////////////////////////////////////////////////////////////////////////////
 
+    @Override
     public boolean includeReadsWithDeletionAtLoci() { return true; }
 
+    @Override
     public void initialize() {
+        if ( getToolkit().getSamples().size() > 1 )
+            throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getToolkit().getSamples());
+
         try {
             PrintStream summaryOut = new PrintStream(summaryFile);
             summaryOut.close();
@@ -94,15 +234,15 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
         }
     }
 
-    public static class Integrator {
-        long counts[] = new long[CalledState.values().length];
+    protected static class Integrator {
+        final long counts[] = new long[CalledState.values().length];
         CallableBaseState state = null;
     }
 
-    public static class CallableBaseState implements HasGenomeLocation {
-        public GenomeLocParser genomeLocParser;
+    protected static class CallableBaseState implements HasGenomeLocation {
+        final public GenomeLocParser genomeLocParser;
         public GenomeLoc loc;
-        public CalledState state;
+        final public CalledState state;
 
         public CallableBaseState(GenomeLocParser genomeLocParser,GenomeLoc loc, CalledState state) {
             this.genomeLocParser = genomeLocParser;
@@ -133,16 +273,10 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
 
         public String toString() {
             return String.format("%s %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), state);
-            //return String.format("%s %d %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), loc.getStop() - loc.getStart() + 1, state);
         }
     }
 
-    public enum CalledState { REF_N, CALLABLE, NO_COVERAGE, LOW_COVERAGE, EXCESSIVE_COVERAGE, POOR_MAPPING_QUALITY }
-
-    public Integrator reduceInit() {
-        return new Integrator();
-    }
-
+    @Override
     public CallableBaseState map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
         CalledState state;
 
@@ -179,6 +313,12 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
         return new CallableBaseState(getToolkit().getGenomeLocParser(),context.getLocation(), state);
     }
 
+    @Override
+    public Integrator reduceInit() {
+        return new Integrator();
+    }
+
+    @Override
     public Integrator reduce(CallableBaseState state, Integrator integrator) {
         // update counts
         integrator.counts[state.getState().ordinal()]++;
@@ -206,6 +346,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
     // INTERVAL ON TRAVERSAL DONE
     ////////////////////////////////////////////////////////////////////////////////////
 
+    @Override
     public void onTraversalDone(Integrator result) {
         // print out the last state
         if ( result != null ) {

From 81a792afeb6f6bcdf81437a58417217924aeb61b Mon Sep 17 00:00:00 2001
From: Matt Hanna <hanna@broadinstitute.org>
Date: Wed, 17 Aug 2011 16:58:24 -0400
Subject: [PATCH 02/13] Reverting optimization disable in unstable.

---
 .../gatk/datasources/reads/LowMemoryIntervalSharder.java      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
index 198f7d7d3..ba6321121 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
@@ -59,8 +59,8 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
      */
     public FilePointer next() {
         FilePointer current = wrappedIterator.next();
-        //while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0)
-        //    current = current.combine(parser,wrappedIterator.next());
+        while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0)
+            current = current.combine(parser,wrappedIterator.next());
         return current;
     }
 

From 8e83b6646ba9934c8c9475538e984d684b2d37eb Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Wed, 17 Aug 2011 21:49:14 -0400
Subject: [PATCH 03/13] Bug fix for Chris: don't validate ref base for complex
 events.

---
 .../sting/utils/variantcontext/VariantContext.java            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
index ca3399c78..dacc2d94a 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@@ -1080,8 +1080,8 @@ public class VariantContext implements Feature { // to enable tribble intergrati
     }
 
     public void validateReferenceBases(Allele reference, Byte paddedRefBase) {
-        // don't validate if we're an insertion
-        if ( !reference.isNull() && !reference.basesMatch(getReference()) ) {
+        // don't validate if we're an insertion or complex event
+        if ( !reference.isNull() && getReference().length() == 1 && !reference.basesMatch(getReference()) ) {
             throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
         }
 

From a9df3653642da21106d8c9d86cc88300a8323d1f Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro <carneiro@broadinstitute.org>
Date: Wed, 17 Aug 2011 17:48:04 -0400
Subject: [PATCH 04/13] GenotypeAndValidate walker updated

Updated the walker to comply with the new RodBinding system and the new GATKDocs. Will move it to public after writing integration tests.
---
 settings/helpTemplates/style.css | 48 ++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/settings/helpTemplates/style.css b/settings/helpTemplates/style.css
index 1d7bcc576..2abcf0397 100644
--- a/settings/helpTemplates/style.css
+++ b/settings/helpTemplates/style.css
@@ -95,6 +95,12 @@ dd {
     padding: 0 0 0.5em 0;
 }
 
+pre {
+    border: thin solid lightgray;
+    margin-left: 1em;
+    margin-right: 4em;
+    background-color: #e0fdff;
+}
 /*
  * clean table layouts
 */
@@ -128,6 +134,48 @@ dd {
 }
 
 th#row-divider 
+{
+    font-weight: bolder;
+	font-size: larger;
+}
+
+
+/*
+ *  Table design for input/ouptut description
+ */
+
+#description-table
+{
+	font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif;
+	font-size: 12px;
+	background: #fff;
+	margin: 5px;
+	border-collapse: collapse;
+	text-align: left;
+}
+#description-table th
+{
+	font-size: 16px;
+	font-weight: bold;
+    background-color: lightgray;
+	color: #039;
+    text-align: center;
+	padding: 10px 8px;
+	border-bottom: 2px solid #6678b1;
+}
+#description-table td
+{
+	border-bottom: 1px solid #ccc;
+	color: #669;
+	padding: 6px 8px;
+    text-align: right;
+}
+#description-table tbody tr:hover td
+{
+	color: #009;
+}
+
+th#row-divider
 {
     font-weight: bolder;
 	font-size: larger;

From cc3df8f11a454cdabdd139b4e52f70447ce5756d Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro <carneiro@broadinstitute.org>
Date: Wed, 17 Aug 2011 21:54:40 -0400
Subject: [PATCH 05/13] Moving GAV walker to public

Walker is updated to the new RodBinding system and has the new GATKDocs layout.
---
 .../validation/GenotypeAndValidateWalker.java | 438 ++++++++++++++++++
 1 file changed, 438 insertions(+)
 create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java
new file mode 100755
index 000000000..fc23200af
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.validation;
+
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
+import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
+import org.broadinstitute.sting.utils.SampleUtils;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.variantcontext.MutableVariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
+
+import java.util.Map;
+import java.util.Set;
+
+import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel;
+
+/**
+ * Genotypes a dataset and validates the calls of another dataset using the Unified Genotyper.
+ *
+ *  <p>
+ *     Genotype and Validate is a tool to evaluate the quality of a dataset for calling SNPs
+ *     and Indels given a secondary (validation) data source. The data sources are BAM or VCF
+ *     files. You can use them interchangeably (i.e. a BAM to validate calls in a VCF or a VCF
+ *     to validate calls on a BAM).
+ *  </p>
+ *
+ *  <p>
+ *     The simplest scenario is when you have a VCF of hand annotated SNPs and Indels, and you
+ *     want to know how well a particular technology performs calling these snps. With a
+ *     dataset (BAM file) generated by the technology in test, and the hand annotated VCF, you
+ *     can run GenotypeAndValidate to asses the accuracy of the calls with the new technology's
+ *     dataset.
+ *  </p>
+ *
+ *  <p>
+ *     Another option is to validate the calls on a VCF file, using a deep coverage BAM file
+ *     that you trust the calls on. The GenotypeAndValidate walker will make calls using the
+ *     reads in the BAM file and take them as truth, then compare to the calls in the VCF file
+ *     and produce a truth table.
+ *  </p>
+ *
+ *
+ * <h2>Input</h2>
+ *  <p>
+ *      A BAM file to make calls on and a VCF file to use as truth validation dataset.
+ *
+ *      You also have the option to invert the roles of the files using the command line options listed below.
+ *  </p>
+ *
+ * <h2>Output</h2>
+ *  <p>
+ *      GenotypeAndValidate has two outputs. The truth table and the optional VCF file. The truth table is a
+ *      2x2 table correlating what was called in the dataset with the truth of the call (whether it's a true
+ *      positive or a false positive). The table should look like this:
+ *  </p>
+ * <center>
+ * <table id="description-table">
+ *     <tr>
+ *         <th></th>
+ *         <th>ALT</th>
+ *         <th>REF</th>
+ *         <th>Predictive Value</th>
+ *     </tr>
+ *     <tr>
+ *         <td><b>called alt</b></td>
+ *         <td>True Positive (TP)</td>
+ *         <td>False Positive (FP)</td>
+ *         <td>Positive PV</td>
+ *     </tr>
+ *     <tr>
+ *         <td><b>called ref</b></td>
+ *         <td>False Negative (FN)</td>
+ *         <td>True Negative (TN)</td>
+ *         <td>Negative PV</td>
+ *     </tr>
+ * </table>
+ * </center>
+ *
+ *  <p>
+ *      The <b>positive predictive value (PPV)</b> is the proportion of subjects with positive test results
+ *      who are correctly diagnosed.
+ *  </p>
+ *  <p>
+ *      The <b>negative predictive value (NPV)</b> is the proportion of subjects with a negative test result
+ *      who are correctly diagnosed.
+ *  </p>
+ *  <p>
+ *      The VCF file will contain only the variants that were called or not called, excluding the ones that
+ *      were uncovered or didn't pass the filters. This file is useful if you are trying to compare
+ *      the PPV and NPV of two different technologies on the exact same sites (so you can compare apples to
+ *      apples).
+ *  </p>
+ *
+ *  <p>
+ *      Here is an example of an annotated VCF file (info field clipped for clarity)
+ *
+ * <pre>
+ * #CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  NA12878
+ * 1   20568807    .   C   T   0    HapMapHet        AC=1;AF=0.50;AN=2;DP=0;GV=T  GT  0/1
+ * 1   22359922    .   T   C   282  WG-CG-HiSeq      AC=2;AF=0.50;GV=T;AN=4;DP=42 GT:AD:DP:GL:GQ  1/0 ./. 0/1:20,22:39:-72.79,-11.75,-67.94:99    ./.
+ * 13  102391461   .   G   A   341  Indel;SnpCluster AC=1;GV=F;AF=0.50;AN=2;DP=45 GT:AD:DP:GL:GQ  ./. ./. 0/1:32,13:45:-50.99,-13.56,-112.17:99   ./.
+ * 1   175516757   .   C   G   655  SnpCluster,WG    AC=1;AF=0.50;AN=2;GV=F;DP=74 GT:AD:DP:GL:GQ  ./. ./. 0/1:52,22:67:-89.02,-20.20,-191.27:99   ./.
+ * </pre>
+ *
+ *  </p>
+ *
+ *  <h3>Additional Details</h3>
+ *  <ul>
+ *      <li>
+ *          You should always use -BTI on your VCF track, so that the GATK only looks at the sites on the VCF file.
+ *          This speeds up the process a lot.
+ *      </li>
+ *      <li>
+ *          The total number of visited bases may be greater than the number of variants in the original
+ *          VCF file because of extended indels, as they trigger one call per new insertion or deletion.
+ *          (i.e. ACTG/- will count as 4 genotyper calls, but it's only one line in the VCF).
+ *      </li>
+ *  </ul>
+ *
+ * <h2>Examples</h2>
+ * <ol>
+ *     <li>
+ *         Genotypes BAM file from new technology using the VCF as a truth dataset:
+ *     </li>
+ *
+ * <pre>
+ *  java
+ *      -jar /GenomeAnalysisTK.jar
+ *      -T  GenotypeAndValidate
+ *      -R human_g1k_v37.fasta
+ *      -I myNewTechReads.bam
+ *      -alleles handAnnotatedVCF.vcf
+ *      -BTI alleles
+ * </pre>
+ *
+ *      <li>
+ *          Using a BAM file as the truth dataset:
+ *      </li>
+ *
+ * <pre>
+ *  java
+ *      -jar /GenomeAnalysisTK.jar
+ *      -T  GenotypeAndValidate
+ *      -R human_g1k_v37.fasta
+ *      -I myTruthDataset.bam
+ *      -alleles callsToValidate.vcf
+ *      -BTI alleles
+ *      -bt
+ *      -o gav.vcf
+ * </pre>
+ *
+ *
+ * @author Mauricio Carneiro
+ * @since ${DATE}
+ */
+
+@Requires(value={DataSource.READS, DataSource.REFERENCE})
+@Allows(value={DataSource.READS, DataSource.REFERENCE})
+
+@By(DataSource.REFERENCE)
+@Reference(window=@Window(start=-200,stop=200))
+
+
+public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalker.CountedData, GenotypeAndValidateWalker.CountedData> implements TreeReducible<GenotypeAndValidateWalker.CountedData> {
+
+    @Output(doc="Generate a VCF file with the variants considered by the walker, with a new annotation \"callStatus\" which will carry the value called in the validation VCF or BAM file", required=false)
+    protected VCFWriter vcfWriter = null;
+
+    @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype", required=true)
+    public RodBinding<VariantContext> alleles;
+
+    @Argument(fullName ="set_bam_truth", shortName ="bt", doc="Use the calls on the reads (bam file) as the truth dataset and validate the calls on the VCF", required=false)
+    private boolean bamIsTruth = false;
+
+    @Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false)
+    private int mbq = -1;
+
+    @Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false)
+    private double deletions = -1;
+
+    @Argument(fullName="standard_min_confidence_threshold_for_calling", shortName="stand_call_conf", doc="the minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls", required=false)
+    private double callConf = -1;
+
+    @Argument(fullName="standard_min_confidence_threshold_for_emitting", shortName="stand_emit_conf", doc="the minimum phred-scaled Qscore threshold to emit low confidence calls", required=false)
+    private double emitConf = -1;
+
+    @Argument(fullName="condition_on_depth", shortName="depth", doc="Condition validation on a minimum depth of coverage by the reads", required=false)
+    private int minDepth = -1;
+
+    @Argument(fullName ="sample", shortName ="sn", doc="Name of the sample to validate (in case your VCF/BAM has more than one sample)", required=false)
+    private String sample = "";
+
+    private UnifiedGenotyperEngine snpEngine;
+    private UnifiedGenotyperEngine indelEngine;
+
+    public static class CountedData {
+        private long nAltCalledAlt = 0L;
+        private long nAltCalledRef = 0L;
+        private long nRefCalledAlt = 0L;
+        private long nRefCalledRef = 0L;
+        private long nNotConfidentCalls = 0L;
+        private long nUncovered = 0L;
+
+        /**
+         * Adds the values of other to this, returning this
+         * @param other the other object
+         */
+        public void add(CountedData other) {
+            nAltCalledAlt += other.nAltCalledAlt;
+            nAltCalledRef += other.nAltCalledRef;
+            nRefCalledAlt += other.nRefCalledAlt;
+            nRefCalledRef += other.nRefCalledRef;
+            nUncovered += other.nUncovered;
+            nNotConfidentCalls += other.nNotConfidentCalls;
+        }
+    }
+
+
+
+    //---------------------------------------------------------------------------------------------------------------
+    //
+    // initialize
+    //
+    //---------------------------------------------------------------------------------------------------------------
+
+    public void initialize() {
+
+        // Initialize VCF header
+        if (vcfWriter != null) {
+            Map<String, VCFHeader> header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName());
+            Set<String> samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
+            Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(header.values(), logger);
+            headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate"));
+            vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
+        }
+
+        // Filling in SNP calling arguments for UG
+        UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
+        uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
+        uac.alleles = alleles;
+        if (!bamIsTruth) uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
+        if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq;
+        if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions;
+        if (emitConf >= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf;
+        if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf;
+
+        uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
+        snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac);
+
+        // Adding the INDEL calling arguments for UG
+        uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL;
+        indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac);
+
+        // make sure we have callConf set to the threshold set by the UAC so we can use it later.
+        callConf = uac.STANDARD_CONFIDENCE_FOR_CALLING;
+    }
+
+    //---------------------------------------------------------------------------------------------------------------
+    //
+    // map
+    //
+    //---------------------------------------------------------------------------------------------------------------
+
+    public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
+
+        final CountedData counter = new CountedData();
+
+        // For some reason RodWalkers get map calls with null trackers
+        if( tracker == null )
+            return counter;
+
+        VariantContext vcComp = tracker.getFirstValue(alleles);
+        if( vcComp == null )
+            return counter;
+
+        //todo - not sure I want this, may be misleading to filter extended indel events.
+        if (isInsideExtendedIndel(vcComp,  ref))
+            return counter;
+
+        // Do not operate on variants that are not covered to the optional minimum depth
+        if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) {
+            counter.nUncovered = 1L;
+            return counter;
+        }
+
+        VariantCallContext call;
+        if ( vcComp.isSNP() )
+            call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
+        else if ( vcComp.isIndel() ) {
+            call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
+        }
+        else {
+            logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles());
+            return counter;
+        }
+
+
+        boolean writeVariant = true;
+
+        if (bamIsTruth) {
+            if (call.confidentlyCalled) {
+                // If truth is a confident REF call
+                if (call.isVariant()) {
+                    if (vcComp.isVariant())
+                        counter.nAltCalledAlt = 1L;  // todo -- may wanna check if the alts called are the same?
+                    else
+                        counter.nAltCalledRef = 1L;
+                }
+                // If truth is a confident ALT call
+                else {
+                    if (vcComp.isVariant())
+                        counter.nRefCalledAlt = 1L;
+                    else
+                        counter.nRefCalledRef = 1L;
+                }
+            }
+            else {
+                counter.nNotConfidentCalls = 1L;
+                writeVariant = false;
+            }
+        }
+        else {
+            if (!vcComp.hasAttribute("GV"))
+                throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart());
+
+
+
+            if (call.isCalledAlt(callConf)) {
+                if (vcComp.getAttribute("GV").equals("T"))
+                    counter.nAltCalledAlt = 1L;
+                else
+                    counter.nRefCalledAlt = 1L;
+            }
+            else if (call.isCalledRef(callConf)) {
+                if (vcComp.getAttribute("GV").equals("T"))
+                    counter.nAltCalledRef = 1L;
+                else
+                    counter.nRefCalledRef = 1L;
+            }
+            else {
+                counter.nNotConfidentCalls = 1L;
+                writeVariant = false;
+            }
+        }
+
+        if (vcfWriter != null && writeVariant) {
+            if (!vcComp.hasAttribute("callStatus")) {
+                MutableVariantContext mvc = new MutableVariantContext(vcComp);
+                mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF" );
+                vcfWriter.add(mvc);
+            }
+            else
+                vcfWriter.add(vcComp);
+        }
+        return counter;
+    }
+
+    //---------------------------------------------------------------------------------------------------------------
+    //
+    // reduce
+    //
+    //---------------------------------------------------------------------------------------------------------------
+
+    public CountedData reduceInit() {
+        return new CountedData();
+    }
+
+    public CountedData treeReduce( final CountedData sum1, final CountedData sum2) {
+        sum2.add(sum1);
+        return sum2;
+    }
+
+    public CountedData reduce( final CountedData mapValue, final CountedData reduceSum ) {
+        reduceSum.add(mapValue);
+        return reduceSum;
+    }
+
+    public void onTraversalDone( CountedData reduceSum ) {
+        double ppv = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nRefCalledAlt));
+        double npv = 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nAltCalledRef));
+        double sensitivity = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nAltCalledRef));
+        double specificity = (reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt > 0) ? 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt)) : 100;
+        logger.info(String.format("Resulting Truth Table Output\n\n" +
+                                  "---------------------------------------------------\n" +
+                                  "\t\t|\tALT\t|\tREF\t\n"  +
+                                  "---------------------------------------------------\n" +
+                                  "called alt\t|\t%d\t|\t%d\n" +
+                                  "called ref\t|\t%d\t|\t%d\n" +
+                                  "---------------------------------------------------\n" +
+                                  "positive predictive value: %f%%\n" +
+                                  "negative predictive value: %f%%\n" +
+                                  "---------------------------------------------------\n" +
+                                  "sensitivity: %f%%\n" +
+                                  "specificity: %f%%\n" +
+                                  "---------------------------------------------------\n" +
+                                  "not confident: %d\n" +
+                                  "not covered: %d\n" +
+                                  "---------------------------------------------------\n", reduceSum.nAltCalledAlt, reduceSum.nRefCalledAlt, reduceSum.nAltCalledRef, reduceSum.nRefCalledRef, ppv, npv, sensitivity, specificity, reduceSum.nNotConfidentCalls, reduceSum.nUncovered));
+    }
+}

From a7b70e6bb46c10c3a00bcedfcf9cf916321e9d3b Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Wed, 17 Aug 2011 22:28:22 -0400
Subject: [PATCH 06/13] Adding feature for Khalid: ability to exclude
 particular samples.

---
 .../walkers/variantutils/SelectVariants.java  | 39 ++++++++++++++-----
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
index b52863c8f..f6b6a8d65 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@@ -191,16 +191,28 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
     protected VCFWriter vcfWriter = null;
 
     @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false)
-    public Set<String> sampleNames;
+    public Set<String> sampleNames = new HashSet<String>(0);
 
     @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false)
-    public Set<String> sampleExpressions;
+    public Set<String> sampleExpressions ;
 
     @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
     public Set<File> sampleFiles;
 
     /**
-     * Note that thse expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated.
+     * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded.
+     */
+    @Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample. Can be specified multiple times", required=false)
+    public Set<String> XLsampleNames = new HashSet<String>(0);
+
+    /**
+     * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded.
+     */
+    @Argument(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false)
+    public Set<File> XLsampleFiles;
+
+    /**
+     * Note that these expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated.
      */
     @Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false)
     public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<String>();
@@ -304,8 +316,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
     private ArrayList<Double> afBoosts = null;
     double bkDelta = 0.0;
 
-
-        private PrintStream outMVFileStream = null;
+    private PrintStream outMVFileStream = null;
 
 
     /**
@@ -321,19 +332,27 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
         Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
         Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);
 
+        // first, add any requested samples
         samples.addAll(samplesFromFile);
         samples.addAll(samplesFromExpressions);
-        if (sampleNames != null)
-            samples.addAll(sampleNames);
+        samples.addAll(sampleNames);
 
-        if(samples.isEmpty()) {
+        // if none were requested, we want all of them
+        if ( samples.isEmpty() ) {
             samples.addAll(vcfSamples);
             NO_SAMPLES_SPECIFIED = true;
         }
 
-        for (String sample : samples) {
+        // now, exclude any requested samples
+        Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
+        samples.removeAll(XLsamplesFromFile);
+        samples.removeAll(XLsampleNames);
+
+        if ( samples.size() == 0 )
+            throw new UserException("All samples requested to be included were also requested to be excluded.");
+
+        for ( String sample : samples )
             logger.info("Including sample '" + sample + "'");
-        }
 
         // Initialize VCF header
         Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);

From b75a1807e3242fd3dda158aba33362f826fb9041 Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Wed, 17 Aug 2011 22:40:09 -0400
Subject: [PATCH 07/13] Adding integration test to cover sample exclusion

---
 .../SelectVariantsIntegrationTest.java        | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
index bec0d5dd4..cec377f5f 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
@@ -7,7 +7,7 @@ import java.util.Arrays;
 
 public class SelectVariantsIntegrationTest extends WalkerTest {
     public static String baseTestString(String args) {
-        return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s" + args;
+        return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s -NO_HEADER" + args;
     }
 
     @Test
@@ -16,7 +16,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
         String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
 
         WalkerTestSpec spec = new WalkerTestSpec(
-            baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant:VCF3 " + testfile + " -NO_HEADER"),
+            baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant:VCF3 " + testfile),
             1,
             Arrays.asList("d18516c1963802e92cb9e425c0b75fd6")
         );
@@ -24,12 +24,26 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
         executeTest("testComplexSelection--" + testfile, spec);
     }
 
+    @Test
+    public void testSampleExclusion() {
+        String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
+        String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
+
+        WalkerTestSpec spec = new WalkerTestSpec(
+            "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s -NO_HEADER -xl_sn A -xl_sf " + samplesFile + " --variant:VCF3 " + testfile,
+            1,
+            Arrays.asList("730f021fd6ecf1d195dabbee2e233bfd")
+        );
+
+        executeTest("testSampleExclusion--" + testfile, spec);
+    }
+
     @Test
     public void testRepeatedLineSelection() {
         String testfile = validationDataLocation + "test.dup.vcf";
 
         WalkerTestSpec spec = new WalkerTestSpec(
-                baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile + " -NO_HEADER"),
+                baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile),
                 1,
                 Arrays.asList("b74038779fe6485dbb8734ae48178356")
         );

From c30e1db744052c789de6726c259678509f6a91e8 Mon Sep 17 00:00:00 2001
From: Mark DePristo <depristo@broadinstitute.org>
Date: Thu, 18 Aug 2011 09:38:51 -0400
Subject: [PATCH 09/13] Better location for help utils

---
 .../sting/gatk/CommandLineGATK.java           |  7 +--
 .../help/DocumentedGATKFeatureHandler.java    |  2 +-
 .../sting/utils/help/GATKDocUtils.java        | 48 -------------------
 .../sting/utils/help/HelpUtils.java           | 17 +++++++
 4 files changed, 20 insertions(+), 54 deletions(-)
 delete mode 100644 public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java

diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
index 7e96b609e..7c567f511 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
@@ -34,10 +34,7 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter;
 import org.broadinstitute.sting.gatk.walkers.Attribution;
 import org.broadinstitute.sting.gatk.walkers.Walker;
 import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.help.ApplicationDetails;
-import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
-import org.broadinstitute.sting.utils.help.GATKDocUtils;
-import org.broadinstitute.sting.utils.help.GATKDoclet;
+import org.broadinstitute.sting.utils.help.*;
 import org.broadinstitute.sting.utils.text.TextFormattingUtils;
 
 import java.util.*;
@@ -178,7 +175,7 @@ public class CommandLineGATK extends CommandLineExecutable {
         Formatter formatter = new Formatter(additionalHelp);
 
         formatter.format("For a full description of this walker, see its GATKdocs at:%n");
-        formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType));
+        formatter.format("%s%n", HelpUtils.helpLinksToGATKDocs(walkerType));
 
         return additionalHelp.toString();
     }
diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java
index ce03c8093..214a1716a 100644
--- a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java
+++ b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java
@@ -71,7 +71,7 @@ public abstract class DocumentedGATKFeatureHandler {
      * @return
      */
     public String getDestinationFilename(ClassDoc doc, Class clazz) {
-        return GATKDocUtils.htmlFilenameForClass(clazz);
+        return HelpUtils.htmlFilenameForClass(clazz);
     }
 
     /**
diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java
deleted file mode 100644
index 8efeecd7b..000000000
--- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.utils.help;
-
-/**
- * @author depristo
- * @since 8/8/11
- */
-public class GATKDocUtils {
-    private final static String URL_ROOT_FOR_RELEASE_GATKDOCS = "http://www.broadinstitute.org/gsa/gatkdocs/release/";
-    private final static String URL_ROOT_FOR_STABLE_GATKDOCS = "http://iwww.broadinstitute.org/gsa/gatkdocs/stable/";
-    private final static String URL_ROOT_FOR_UNSTABLE_GATKDOCS = "http://iwww.broadinstitute.org/gsa/gatkdocs/unstable/";
-
-    public static String htmlFilenameForClass(Class c) {
-        return c.getName().replace(".", "_") + ".html";
-    }
-
-    public static String helpLinksToGATKDocs(Class c) {
-        String classPath = htmlFilenameForClass(c);
-        StringBuilder b = new StringBuilder();
-        b.append("release  version: ").append(URL_ROOT_FOR_RELEASE_GATKDOCS).append(classPath).append("\n");
-        b.append("stable   version: ").append(URL_ROOT_FOR_STABLE_GATKDOCS).append(classPath).append("\n");
-        b.append("unstable version: ").append(URL_ROOT_FOR_UNSTABLE_GATKDOCS).append(classPath).append("\n");
-        return b.toString();
-    }
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java
index 4527c6afe..da4e7bdaf 100644
--- a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java
@@ -32,6 +32,10 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils;
 import java.lang.reflect.Field;
 
 public class HelpUtils {
+    public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = "http://www.broadinstitute.org/gsa/gatkdocs/release/";
+    public final static String URL_ROOT_FOR_STABLE_GATKDOCS = "http://iwww.broadinstitute.org/gsa/gatkdocs/stable/";
+    public final static String URL_ROOT_FOR_UNSTABLE_GATKDOCS = "http://iwww.broadinstitute.org/gsa/gatkdocs/unstable/";
+
     protected static boolean implementsInterface(ProgramElementDoc classDoc, Class... interfaceClasses) {
         for (Class interfaceClass : interfaceClasses)
             if (assignableToClass(classDoc, interfaceClass, false))
@@ -74,4 +78,17 @@ public class HelpUtils {
                 String.format("%s.%s", containingPackage.name(), doc.name()) :
                 String.format("%s", doc.name());
     }
+
+    public static String htmlFilenameForClass(Class c) {
+        return c.getName().replace(".", "_") + ".html";
+    }
+
+    public static String helpLinksToGATKDocs(Class c) {
+        String classPath = htmlFilenameForClass(c);
+        StringBuilder b = new StringBuilder();
+        b.append("release  version: ").append(URL_ROOT_FOR_RELEASE_GATKDOCS).append(classPath).append("\n");
+        b.append("stable   version: ").append(URL_ROOT_FOR_STABLE_GATKDOCS).append(classPath).append("\n");
+        b.append("unstable version: ").append(URL_ROOT_FOR_UNSTABLE_GATKDOCS).append(classPath).append("\n");
+        return b.toString();
+    }
 }
\ No newline at end of file

From 9c17d54cb602bd4af70921fb9ca67163193ad62a Mon Sep 17 00:00:00 2001
From: Mark DePristo <depristo@broadinstitute.org>
Date: Thu, 18 Aug 2011 09:39:20 -0400
Subject: [PATCH 10/13] getFeatureClass() now returns Class<T> not Class to
 avoid yesterday's runtime error

---
 .../sting/gatk/refdata/features/refseq/RefSeqCodec.java         | 2 +-
 .../sting/gatk/refdata/features/table/TableCodec.java           | 2 +-
 .../sting/utils/codecs/completegenomics/CGVarCodec.java         | 2 +-
 .../sting/utils/codecs/hapmap/RawHapMapCodec.java               | 2 +-
 .../broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java   | 2 +-
 .../broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java | 2 +-
 .../broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java
index cec40b5bd..43063da17 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java
@@ -104,7 +104,7 @@ public class RefSeqCodec implements ReferenceDependentFeatureCodec<RefSeqFeature
     }
 
     @Override
-    public Class getFeatureType() {
+    public Class<RefSeqFeature> getFeatureType() {
         return RefSeqFeature.class;
     }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java
index ab1ac59d8..4e734294f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java
@@ -51,7 +51,7 @@ public class TableCodec implements ReferenceDependentFeatureCodec {
     }
 
     @Override
-    public Class getFeatureType() {
+    public Class<TableFeature> getFeatureType() {
         return TableFeature.class;
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java
index fef6c4ea0..c30da828e 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java
@@ -117,7 +117,7 @@ public class CGVarCodec implements FeatureCodec {
         return new VariantContext("CGI", array[3], start, end, alleles, VariantContext.NO_NEG_LOG_10PERROR, null, attrs);
     }
 
-    public Class getFeatureType() {
+    public Class<VariantContext> getFeatureType() {
         return VariantContext.class;
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java
index 90878dee7..535f607a1 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java
@@ -82,7 +82,7 @@ public class RawHapMapCodec implements FeatureCodec {
                 headerLine);
     }
 
-    public Class getFeatureType() {
+    public Class<RawHapMapFeature> getFeatureType() {
         return RawHapMapFeature.class;
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java
index eada8521f..b5efb49a7 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java
@@ -222,7 +222,7 @@ public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec {
         return null;
     }
 
-    public Class getFeatureType() {
+    public Class<SnpEffFeature> getFeatureType() {
         return SnpEffFeature.class;
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java
index e169dbdfc..284c43e90 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java
@@ -178,7 +178,7 @@ public class SoapSNPCodec implements FeatureCodec, NameAwareCodec {
     /**
      * @return VariantContext
      */
-    public Class getFeatureType() {
+    public Class<VariantContext> getFeatureType() {
         return VariantContext.class;
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
index cb505c717..19f58ddaa 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
@@ -263,7 +263,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
      *
      * @return the type of record
      */
-    public Class getFeatureType() {
+    public Class<VariantContext> getFeatureType() {
         return VariantContext.class;
     }
 

From c2287c93d7fd2bb067065f93c67245b5605bb7db Mon Sep 17 00:00:00 2001
From: Mark DePristo <depristo@broadinstitute.org>
Date: Thu, 18 Aug 2011 10:02:46 -0400
Subject: [PATCH 11/13] Cleanup of codec locations.  No more dbSNPHelper

-- refdata/features now in utils/codecs with the other codecs
-- Deleted dbsnpHelper.  rsID function now in VCFutils.  Remaining code either deleted or put into VariantContextAdaptors
-- Many associated import updates due to code move
---
 .../gatk/refdata/VariantContextAdaptors.java  |  72 +++++++-
 .../gatk/refdata/features/DbSNPHelper.java    | 169 ------------------
 .../sting/gatk/walkers/PileupWalker.java      |   4 -
 .../annotator/VariantAnnotatorEngine.java     |   8 +-
 .../beagle/BeagleOutputToVCFWalker.java       |   2 +-
 .../coverage/DepthOfCoverageWalker.java       |   4 +-
 .../indels/SomaticIndelDetectorWalker.java    |   6 +-
 .../walkers/qc/ValidatingPileupWalker.java    |   2 +-
 .../validation/ValidationAmplicons.java       |   3 +-
 .../walkers/variantutils/VariantsToVCF.java   |   3 +-
 .../codecs}/beagle/BeagleCodec.java           |   2 +-
 .../codecs}/beagle/BeagleFeature.java         |   2 +-
 .../codecs}/refseq/RefSeqCodec.java           |   3 +-
 .../codecs}/refseq/RefSeqFeature.java         |   2 +-
 .../codecs}/refseq/Transcript.java            | 106 +++++------
 .../codecs}/sampileup/SAMPileupCodec.java     |   4 +-
 .../codecs}/sampileup/SAMPileupFeature.java   |   2 +-
 .../codecs}/samread/SAMReadCodec.java         |   2 +-
 .../codecs}/samread/SAMReadFeature.java       |   2 +-
 .../codecs}/table/BedTableCodec.java          |   2 +-
 .../codecs}/table/TableCodec.java             |   2 +-
 .../codecs}/table/TableFeature.java           |   2 +-
 .../sting/utils/codecs/vcf/VCFUtils.java      |  15 ++
 .../commandline/ParsingEngineUnitTest.java    |   2 -
 .../ReferenceOrderedViewUnitTest.java         |   2 +-
 .../rmd/ReferenceOrderedDataPoolUnitTest.java |   2 +-
 .../refdata/RefMetaDataTrackerUnitTest.java   |   2 +-
 .../tracks/FeatureManagerUnitTest.java        |   4 +-
 28 files changed, 163 insertions(+), 268 deletions(-)
 delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/refdata/features/DbSNPHelper.java
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/beagle/BeagleCodec.java (99%)
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/beagle/BeagleFeature.java (97%)
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/refseq/RefSeqCodec.java (97%)
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/refseq/RefSeqFeature.java (99%)
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/refseq/Transcript.java (95%)
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/sampileup/SAMPileupCodec.java (98%)
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/sampileup/SAMPileupFeature.java (99%)
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/samread/SAMReadCodec.java (98%)
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/samread/SAMReadFeature.java (98%)
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/table/BedTableCodec.java (94%)
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/table/TableCodec.java (97%)
 rename public/java/src/org/broadinstitute/sting/{gatk/refdata/features => utils/codecs}/table/TableFeature.java (96%)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java
index 902f9d308..bf490e28c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java
@@ -1,10 +1,11 @@
 package org.broadinstitute.sting.gatk.refdata;
 
+import net.sf.samtools.util.SequenceUtil;
 import org.broad.tribble.Feature;
+import org.broad.tribble.annotation.Strand;
 import org.broad.tribble.dbsnp.DbSNPFeature;
 import org.broad.tribble.gelitext.GeliTextFeature;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
 import org.broadinstitute.sting.utils.classloader.PluginManager;
 import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
@@ -92,6 +93,67 @@ public class VariantContextAdaptors {
     // --------------------------------------------------------------------------------------------------------------
 
     private static class DBSnpAdaptor implements VCAdaptor {
+        private static boolean isSNP(DbSNPFeature feature) {
+            return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact");
+        }
+
+        private static boolean isMNP(DbSNPFeature feature) {
+            return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range");
+        }
+
+        private static boolean isInsertion(DbSNPFeature feature) {
+            return feature.getVariantType().contains("insertion");
+        }
+
+        private static boolean isDeletion(DbSNPFeature feature) {
+            return feature.getVariantType().contains("deletion");
+        }
+
+        private static boolean isIndel(DbSNPFeature feature) {
+            return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature);
+        }
+
+        public static boolean isComplexIndel(DbSNPFeature feature) {
+            return feature.getVariantType().contains("in-del");
+        }
+
+        /**
+         * gets the alternate alleles.  This method should return all the alleles present at the location,
+         * NOT including the reference base.  This is returned as a string list with no guarantee ordering
+         * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
+         * frequency).
+         *
+         * @return an alternate allele list
+         */
+        public static List<String> getAlternateAlleleList(DbSNPFeature feature) {
+            List<String> ret = new ArrayList<String>();
+            for (String allele : getAlleleList(feature))
+                if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele);
+            return ret;
+        }
+
+        /**
+         * gets the alleles.  This method should return all the alleles present at the location,
+         * including the reference base.  The first allele should always be the reference allele, followed
+         * by an unordered list of alternate alleles.
+         *
+         * @return an alternate allele list
+         */
+        public static List<String> getAlleleList(DbSNPFeature feature) {
+            List<String> alleleList = new ArrayList<String>();
+            // add ref first
+            if ( feature.getStrand() == Strand.POSITIVE )
+                alleleList = Arrays.asList(feature.getObserved());
+            else
+                for (String str : feature.getObserved())
+                    alleleList.add(SequenceUtil.reverseComplement(str));
+            if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase())
+                    && !alleleList.get(0).equals(feature.getNCBIRefBase()) )
+                Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0);
+
+            return alleleList;
+        }
+
         /**
          * Converts non-VCF formatted dbSNP records to VariantContext. 
          * @return DbSNPFeature.
@@ -102,18 +164,18 @@ public class VariantContextAdaptors {
         @Override        
         public VariantContext convert(String name, Object input, ReferenceContext ref) {
             DbSNPFeature dbsnp = (DbSNPFeature)input;
-            if ( ! Allele.acceptableAlleleBases(DbSNPHelper.getReference(dbsnp)) )
+            if ( ! Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) )
                 return null;
-            Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true);
+            Allele refAllele = Allele.create(dbsnp.getNCBIRefBase(), true);
 
-            if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || DbSNPHelper.isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
+            if ( isSNP(dbsnp) || isIndel(dbsnp) || isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
                 // add the reference allele
                 List<Allele> alleles = new ArrayList<Allele>();
                 alleles.add(refAllele);
 
                 // add all of the alt alleles
                 boolean sawNullAllele = refAllele.isNull();
-                for ( String alt : DbSNPHelper.getAlternateAlleleList(dbsnp) ) {
+                for ( String alt : getAlternateAlleleList(dbsnp) ) {
                     if ( ! Allele.acceptableAlleleBases(alt) ) {
                         //System.out.printf("Excluding dbsnp record %s%n", dbsnp);
                         return null;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/DbSNPHelper.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/DbSNPHelper.java
deleted file mode 100644
index a2132cee5..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/DbSNPHelper.java
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.refdata.features;
-
-import net.sf.samtools.util.SequenceUtil;
-import org.broad.tribble.annotation.Strand;
-import org.broad.tribble.dbsnp.DbSNPFeature;
-import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * this class contains static helper methods for DbSNP
- */
-public class DbSNPHelper {
-
-    private DbSNPHelper() {} // don't make a DbSNPHelper
-
-    public static String rsIDOfFirstRealVariant(List<VariantContext> VCs, VariantContext.Type type) {
-        if ( VCs == null )
-            return null;
-
-        String rsID = null;
-        for ( VariantContext vc : VCs ) {
-            if ( vc.getType() == type ) {
-                rsID = vc.getID();
-                break;
-            }
-        }
-
-        return rsID;
-    }
-
-    /**
-     * get the -1 * (log 10 of the error value)
-     *
-     * @return the log based error estimate
-     */
-    public static double getNegLog10PError(DbSNPFeature feature) {
-        return 4; // -log10(0.0001)
-    }
-
-    //
-    // What kind of variant are we?
-    //
-    // ----------------------------------------------------------------------
-    public static boolean isSNP(DbSNPFeature feature) {
-        return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact");
-    }
-
-    public static boolean isMNP(DbSNPFeature feature) {
-        return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range");
-    }
-
-    public static String toMediumString(DbSNPFeature feature) {
-        String s = String.format("%s:%d:%s:%s", feature.getChr(), feature.getStart(), feature.getRsID(), Utils.join("",feature.getObserved()));
-        if (isSNP(feature)) s += ":SNP";
-        if (isIndel(feature)) s += ":Indel";
-        if (isHapmap(feature)) s += ":Hapmap";
-        if (is2Hit2Allele(feature)) s += ":2Hit";
-        return s;
-    }
-
-    public static boolean isInsertion(DbSNPFeature feature) {
-        return feature.getVariantType().contains("insertion");
-    }
-
-    public static boolean isDeletion(DbSNPFeature feature) {
-        return feature.getVariantType().contains("deletion");
-    }
-
-    public static boolean isIndel(DbSNPFeature feature) {
-        return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || DbSNPHelper.isComplexIndel(feature);
-    }
-
-    public static boolean isComplexIndel(DbSNPFeature feature) {
-        return feature.getVariantType().contains("in-del");
-    }
-
-    public static boolean isHapmap(DbSNPFeature feature) {
-        return feature.getValidationStatus().contains("by-hapmap");
-    }
-
-    public static boolean is2Hit2Allele(DbSNPFeature feature) {
-        return feature.getValidationStatus().contains("by-2hit-2allele");
-    }
-
-    public static boolean is1000genomes(DbSNPFeature feature) {
-        return feature.getValidationStatus().contains("by-1000genomes");
-    }
-
-    public static boolean isMQ1(DbSNPFeature feature) {
-        return feature.getWeight() == 1;
-    }
-
-    /**
-     * gets the alternate alleles.  This method should return all the alleles present at the location,
-     * NOT including the reference base.  This is returned as a string list with no guarantee ordering
-     * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
-     * frequency).
-     *
-     * @return an alternate allele list
-     */
-    public static List<String> getAlternateAlleleList(DbSNPFeature feature) {
-        List<String> ret = new ArrayList<String>();
-        for (String allele : getAlleleList(feature))
-            if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele);
-        return ret;
-    }
-
-    public static boolean onFwdStrand(DbSNPFeature feature) {
-        return feature.getStrand() == Strand.POSITIVE;
-    }
-
-    public static String getReference(DbSNPFeature feature) {
-        return feature.getNCBIRefBase();
-    }
-
-    public static String toSimpleString(DbSNPFeature feature) {
-        return String.format("%s:%s:%s", feature.getRsID(), feature.getObserved(), (feature.getStrand() == Strand.POSITIVE) ? "+" : "-");
-    }
-
-    /**
-     * gets the alleles.  This method should return all the alleles present at the location,
-     * including the reference base.  The first allele should always be the reference allele, followed
-     * by an unordered list of alternate alleles.
-     *
-     * @return an alternate allele list
-     */
-    public static List<String> getAlleleList(DbSNPFeature feature) {
-        List<String> alleleList = new ArrayList<String>();
-            // add ref first
-            if ( onFwdStrand(feature) )
-                alleleList = Arrays.asList(feature.getObserved());
-            else
-                for (String str : feature.getObserved())
-                    alleleList.add(SequenceUtil.reverseComplement(str));
-            if ( alleleList.size() > 0 && alleleList.contains(getReference(feature)) && !alleleList.get(0).equals(getReference(feature)) )
-                Collections.swap(alleleList, alleleList.indexOf(getReference(feature)), 0);
-
-        return alleleList;
-    }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java
index 6243a6cc0..4d8be4800 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java
@@ -26,7 +26,6 @@
 package org.broadinstitute.sting.gatk.walkers;
 
 import org.broad.tribble.Feature;
-import org.broad.tribble.dbsnp.DbSNPFeature;
 import org.broadinstitute.sting.commandline.Argument;
 import org.broadinstitute.sting.commandline.Input;
 import org.broadinstitute.sting.commandline.Output;
@@ -34,9 +33,6 @@ import org.broadinstitute.sting.commandline.RodBinding;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
-import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
-import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
 import org.broadinstitute.sting.utils.Utils;
 import org.broadinstitute.sting.utils.collections.Pair;
 import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
index a7837813a..01926a7f3 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
@@ -29,15 +29,11 @@ import org.broadinstitute.sting.commandline.RodBinding;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.sting.utils.codecs.vcf.*;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.variantcontext.Genotype;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@@ -158,7 +154,7 @@ public class VariantAnnotatorEngine {
     private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
         for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
             if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
-                String rsID = DbSNPHelper.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
+                String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
                 infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null);
                 // annotate dbsnp id if available and not already there
                 if ( rsID != null && (!vc.hasID() || vc.getID().equals(VCFConstants.EMPTY_ID_FIELD)) )
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
index 51fe543df..60f0fcb0a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
@@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature;
+import org.broadinstitute.sting.utils.codecs.beagle.BeagleFeature;
 import org.broadinstitute.sting.gatk.walkers.RodWalker;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.SampleUtils;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
index 708d6bb94..90036407f 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
@@ -32,8 +32,8 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
-import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
-import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
+import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec;
+import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature;
 import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
 import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
 import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
index b2d30235a..9f6ac2a91 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
@@ -39,9 +39,9 @@ import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter;
 import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper;
 import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
 import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
-import org.broadinstitute.sting.gatk.refdata.features.refseq.Transcript;
-import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
-import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
+import org.broadinstitute.sting.utils.codecs.refseq.Transcript;
+import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec;
+import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature;
 import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
 import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
 import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java
index bd25a73e0..ca30d875b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java
@@ -32,7 +32,7 @@ import org.broadinstitute.sting.commandline.RodBinding;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature;
+import org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature;
 import org.broadinstitute.sting.gatk.walkers.*;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
index 7653f511f..61149e5d9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
@@ -14,9 +14,8 @@ import org.broadinstitute.sting.commandline.RodBinding;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
+import org.broadinstitute.sting.utils.codecs.table.TableFeature;
 import org.broadinstitute.sting.gatk.walkers.DataSource;
-import org.broadinstitute.sting.gatk.walkers.RMD;
 import org.broadinstitute.sting.gatk.walkers.Requires;
 import org.broadinstitute.sting.gatk.walkers.RodWalker;
 import org.broadinstitute.sting.utils.BaseUtils;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
index 61851abe2..12e62a249 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
@@ -33,7 +33,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
-import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
 import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
 import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
 import org.broadinstitute.sting.gatk.walkers.*;
@@ -120,7 +119,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
         if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) )
             return 0;
 
-        String rsID = dbsnp == null ? null : DbSNPHelper.rsIDOfFirstRealVariant(tracker.getValues(dbsnp.dbsnp, context.getLocation()), VariantContext.Type.SNP);
+        String rsID = dbsnp == null ? null : VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbsnp.dbsnp, context.getLocation()), VariantContext.Type.SNP);
 
         Collection<VariantContext> contexts = getVariantContexts(tracker, ref);
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java
similarity index 99%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java
index 5e536d4c1..e328c9286 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.sting.gatk.refdata.features.beagle;
+package org.broadinstitute.sting.utils.codecs.beagle;
 /*
  * Copyright (c) 2010 The Broad Institute
  *
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java
similarity index 97%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java
index e6832754d..0aa9ecba2 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java
@@ -22,7 +22,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
-package org.broadinstitute.sting.gatk.refdata.features.beagle;
+package org.broadinstitute.sting.utils.codecs.beagle;
 
 import org.broad.tribble.Feature;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
similarity index 97%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
index 43063da17..391715c63 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
@@ -1,6 +1,5 @@
-package org.broadinstitute.sting.gatk.refdata.features.refseq;
+package org.broadinstitute.sting.utils.codecs.refseq;
 
-import org.apache.commons.io.filefilter.FalseFileFilter;
 import org.broad.tribble.Feature;
 import org.broad.tribble.TribbleException;
 import org.broad.tribble.readers.LineReader;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java
similarity index 99%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java
index a38d45428..c04ca8592 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.sting.gatk.refdata.features.refseq;
+package org.broadinstitute.sting.utils.codecs.refseq;
 
 import org.broad.tribble.Feature;
 import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/Transcript.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java
similarity index 95%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/Transcript.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java
index d8bf12810..3e8a4fb34 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/Transcript.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java
@@ -1,53 +1,53 @@
-package org.broadinstitute.sting.gatk.refdata.features.refseq;
-
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.HasGenomeLocation;
-
-import java.util.List;
-
-/**
- * Created by IntelliJ IDEA.
- * User: asivache
- * Date: Sep 22, 2009
- * Time: 5:22:30 PM
- * To change this template use File | Settings | File Templates.
- */
-public interface Transcript extends HasGenomeLocation {
-
-    /** Returns id of the transcript (RefSeq NM_* id) */
-    public String getTranscriptId();
-    /** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */
-    public int getStrand();
-    /** Returns transcript's full genomic interval (includes all exons with UTRs) */
-    public GenomeLoc getLocation();
-    /** Returns genomic interval of the coding sequence (does not include
-     * UTRs, but still includes introns, since it's a single interval on the DNA)
-     */
-    public GenomeLoc getCodingLocation();
-    /** Name of the gene this transcript corresponds to (typically NOT gene id such as Entrez etc,
-     * but the implementation can decide otherwise)
-     */
-    public String getGeneName();
-    /** Number of exons in this transcript */
-    public int getNumExons();
-    /** Genomic location of the n-th exon; expected to throw an exception (runtime) if n is out of bounds */
-    public GenomeLoc getExonLocation(int n);
-
-    /** Returns the list of all exons in this transcript, as genomic intervals */
-    public List<GenomeLoc> getExons();
-
-    /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */
-    public boolean overlapsP (GenomeLoc that);
-
-    /** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript.
-      * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons,
-      * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript.
-      * @see #overlapsExonP
-      */
-    public boolean overlapsCodingP (GenomeLoc that);
-
-    /** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */
-    public boolean overlapsExonP (GenomeLoc that);
-
-
-}
+package org.broadinstitute.sting.utils.codecs.refseq;
+
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.HasGenomeLocation;
+
+import java.util.List;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Sep 22, 2009
+ * Time: 5:22:30 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public interface Transcript extends HasGenomeLocation {
+
+    /** Returns id of the transcript (RefSeq NM_* id) */
+    public String getTranscriptId();
+    /** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */
+    public int getStrand();
+    /** Returns transcript's full genomic interval (includes all exons with UTRs) */
+    public GenomeLoc getLocation();
+    /** Returns genomic interval of the coding sequence (does not include
+     * UTRs, but still includes introns, since it's a single interval on the DNA)
+     */
+    public GenomeLoc getCodingLocation();
+    /** Name of the gene this transcript corresponds to (typically NOT gene id such as Entrez etc,
+     * but the implementation can decide otherwise)
+     */
+    public String getGeneName();
+    /** Number of exons in this transcript */
+    public int getNumExons();
+    /** Genomic location of the n-th exon; expected to throw an exception (runtime) if n is out of bounds */
+    public GenomeLoc getExonLocation(int n);
+
+    /** Returns the list of all exons in this transcript, as genomic intervals */
+    public List<GenomeLoc> getExons();
+
+    /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */
+    public boolean overlapsP (GenomeLoc that);
+
+    /** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript.
+      * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons,
+      * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript.
+      * @see #overlapsExonP
+      */
+    public boolean overlapsCodingP (GenomeLoc that);
+
+    /** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */
+    public boolean overlapsExonP (GenomeLoc that);
+
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
similarity index 98%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
index 43e2c3ff5..f4048d37d 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
@@ -23,7 +23,7 @@
  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-package org.broadinstitute.sting.gatk.refdata.features.sampileup;
+package org.broadinstitute.sting.utils.codecs.sampileup;
 
 import org.broad.tribble.Feature;
 import org.broad.tribble.FeatureCodec;
@@ -35,7 +35,7 @@ import java.util.ArrayList;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import static org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature.VariantType;
+import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType;
 
 /**
  * A Tribble encoder / decoder for SAM pileup data. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java
similarity index 99%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java
index 378f26934..eb33243e3 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java
@@ -23,7 +23,7 @@
  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-package org.broadinstitute.sting.gatk.refdata.features.sampileup;
+package org.broadinstitute.sting.utils.codecs.sampileup;
 
 import net.sf.samtools.util.StringUtil;
 import org.broad.tribble.Feature;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java
similarity index 98%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java
index 039b8adde..f6861e585 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java
@@ -22,7 +22,7 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-package org.broadinstitute.sting.gatk.refdata.features.samread;
+package org.broadinstitute.sting.utils.codecs.samread;
 
 import net.sf.samtools.Cigar;
 import net.sf.samtools.TextCigarCodec;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java
similarity index 98%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java
index 7f12b2b2f..fc1bf89af 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java
@@ -22,7 +22,7 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-package org.broadinstitute.sting.gatk.refdata.features.samread;
+package org.broadinstitute.sting.utils.codecs.samread;
 
 import org.broad.tribble.Feature;
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java
similarity index 94%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java
index 745ccdd9f..6fe1907e3 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.sting.gatk.refdata.features.table;
+package org.broadinstitute.sting.utils.codecs.table;
 
 import org.broad.tribble.Feature;
 import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java
similarity index 97%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java
index 4e734294f..2ce7c679e 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.sting.gatk.refdata.features.table;
+package org.broadinstitute.sting.utils.codecs.table;
 
 import org.broad.tribble.Feature;
 import org.broad.tribble.readers.LineReader;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java
similarity index 96%
rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java
rename to public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java
index ca73ee960..a85849f0b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.sting.gatk.refdata.features.table;
+package org.broadinstitute.sting.utils.codecs.table;
 
 import org.broad.tribble.Feature;
 import org.broadinstitute.sting.utils.GenomeLoc;
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java
index f43891e77..c0a04c81f 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java
@@ -180,4 +180,19 @@ public class VCFUtils {
 
         return new HashSet<VCFHeaderLine>(map.values());
     }
+
+    public static String rsIDOfFirstRealVariant(List<VariantContext> VCs, VariantContext.Type type) {
+        if ( VCs == null )
+            return null;
+
+        String rsID = null;
+        for ( VariantContext vc : VCs ) {
+            if ( vc.getType() == type ) {
+                rsID = vc.getID();
+                break;
+            }
+        }
+
+        return rsID;
+    }
 }
\ No newline at end of file
diff --git a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java
index 79e9172dd..013a37a88 100755
--- a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java
@@ -26,7 +26,6 @@
 package org.broadinstitute.sting.commandline;
 
 import org.broad.tribble.Feature;
-import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 import org.testng.Assert;
@@ -35,7 +34,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.testng.annotations.BeforeMethod;
 import org.testng.annotations.Test;
 
-import javax.script.Bindings;
 import java.util.List;
 import java.util.EnumSet;
 /**
diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java
index f782580e2..85ae1e1f7 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java
@@ -10,7 +10,7 @@ import org.testng.Assert;
 import org.broadinstitute.sting.BaseTest;
 import org.broadinstitute.sting.gatk.datasources.reads.Shard;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
+import org.broadinstitute.sting.utils.codecs.table.TableFeature;
 import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java
index bd4f93d24..d45f6e667 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java
@@ -4,7 +4,7 @@ import org.broadinstitute.sting.commandline.Tags;
 import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
 import org.testng.Assert;
 import org.broadinstitute.sting.BaseTest;
-import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
+import org.broadinstitute.sting.utils.codecs.table.TableFeature;
 import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
 import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
 import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType;
diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java
index fbd30bc8a..1e39fd26f 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java
@@ -31,7 +31,7 @@ import org.broadinstitute.sting.BaseTest;
 import org.broadinstitute.sting.commandline.RodBinding;
 import org.broadinstitute.sting.commandline.Tags;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
+import org.broadinstitute.sting.utils.codecs.table.TableFeature;
 import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
 import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
 import org.broadinstitute.sting.utils.GenomeLoc;
diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java
index 5d662ffed..90262b9c1 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java
@@ -29,8 +29,8 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
 import org.broad.tribble.Feature;
 import org.broad.tribble.FeatureCodec;
 import org.broadinstitute.sting.BaseTest;
-import org.broadinstitute.sting.gatk.refdata.features.table.BedTableCodec;
-import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
+import org.broadinstitute.sting.utils.codecs.table.BedTableCodec;
+import org.broadinstitute.sting.utils.codecs.table.TableFeature;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;

From bb79d3edae56c8f32fb26425db86cdec94ed7d58 Mon Sep 17 00:00:00 2001
From: Ryan Poplin <rpoplin@broadinstitute.org>
Date: Thu, 18 Aug 2011 10:57:48 -0400
Subject: [PATCH 12/13] Added GATKDocs for the BQSR walkers.

---
 .../analyzecovariates/AnalyzeCovariates.java  | 75 ++++++++++++++--
 .../sting/analyzecovariates/package-info.java |  4 +
 .../CountCovariatesGatherer.java              | 28 +++++-
 .../recalibration/CountCovariatesWalker.java  | 70 +++++++++++----
 .../recalibration/RecalDataManager.java       |  7 ++
 .../RecalibrationArgumentCollection.java      | 15 ++++
 .../TableRecalibrationWalker.java             | 89 ++++++++++++++-----
 7 files changed, 241 insertions(+), 47 deletions(-)
 create mode 100644 public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java

diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
index b9e380295..9b316f077 100755
--- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
+++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
@@ -28,11 +28,13 @@ package org.broadinstitute.sting.analyzecovariates;
 import org.broadinstitute.sting.commandline.Argument;
 import org.broadinstitute.sting.commandline.CommandLineProgram;
 import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.gatk.CommandLineGATK;
 import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
 import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
 import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
 import org.broadinstitute.sting.utils.classloader.PluginManager;
 import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
+import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
 import org.broadinstitute.sting.utils.text.XReadLines;
 
 import java.io.*;
@@ -42,19 +44,71 @@ import java.util.Map;
 import java.util.regex.Pattern;
 
 /**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Dec 1, 2009
+ * Call R scripts to plot residual error versus the various covariates.
  *
- * Create collapsed versions of the recal csv file and call R scripts to plot residual error versus the various covariates.
+ * <p>
+ * After counting covariates in either the initial BAM File or again in the recalibrated BAM File, an analysis tool is available which
+ * reads the .csv file and outputs several PDF (and .dat) files for each read group in the given BAM. These PDF files graphically
+ * show the various metrics and characteristics of the reported quality scores (often in relation to the empirical qualities).
+ * In order to show that any biases in the reported quality scores have been generally fixed through recalibration one should run
+ * CountCovariates again on a bam file produced by TableRecalibration. In this way users can compare the analysis plots generated
+ * by pre-recalibration and post-recalibration .csv files. Our usual chain of commands that we use to generate plots of residual
+ * error is: CountCovariates, TableRecalibrate, samtools index on the recalibrated bam file, CountCovariates again on the recalibrated
+ * bam file, and then AnalyzeCovariates on both the before and after recal_data.csv files to see the improvement in recalibration.
+ *
+ * <p>
+ * The color coding along with the RMSE is included in the plots to give some indication of the number of observations that went into
+ * each of the quality score estimates. It is defined as follows for N, the number of observations:
+ *
+ * <ul>
+ * <li>light blue means N < 1,000</li>
+ * <li>cornflower blue means 1,000 <= N < 10,000</li>
+ * <li>dark blue means N >= 10,000</li>
+ * <li>The pink dots indicate points whose quality scores are special codes used by the aligner and which are mathematically
+ * meaningless and so aren't included in any of the numerical calculations.</li>
+ * </ul>
+ *
+ * <p>
+ * NOTE: For those running this tool externally from the Broad, it is crucial to note that both the -Rscript and -resources options
+ * must be changed from the default. -Rscript needs to point to your installation of Rscript (this is the scripting version of R,
+ * not the interactive version) while -resources needs to point to the folder holding the R scripts that are used. For those using
+ * this tool as part of the Binary Distribution the -resources should point to the resources folder that is part of the tarball.
+ * For those using this tool by building from the git repository the -resources should point to the R/ subdirectory of the Sting checkout.
+ *
+ * <p>
+ * See the GATK wiki for a tutorial and example recalibration accuracy plots.
+ * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
+ *
+ * <h2>Input</h2>
+ * <p>
+ * The recalibration table file in CSV format that was generated by the CountCovariates walker.
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ * java -Xmx4g -jar AnalyzeCovariates.jar \
+ *   -recalFile /path/to/recal.table.csv  \
+ *   -outputDir /path/to/output_dir/  \
+ *   -resources resources/  \
+ *   -ignoreQ 5
+ * </pre>
+ * 
  */
 
+@DocumentedGATKFeature(
+    groupName = "AnalyzeCovariates",
+    summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
 public class AnalyzeCovariates extends CommandLineProgram {
 
     /////////////////////////////
     // Command Line Arguments
     /////////////////////////////
-
+    /**
+     * After the header, data records occur one per line until the end of the file. The first several items on a line are the
+     * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
+     * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
+     * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
+     */
     @Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false)
     private String RECAL_FILE = "output.recal_data.csv";
     @Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
@@ -67,11 +121,20 @@ public class AnalyzeCovariates extends CommandLineProgram {
     private int IGNORE_QSCORES_LESS_THAN = 5;
     @Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)            
     private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
+
+    /**
+     * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
+     * by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
+     */
     @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 50")
     private int MAX_QUALITY_SCORE = 50;
+
+    /**
+     * This argument is useful for comparing before/after plots and you want the axes to match each other.
+     */
     @Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
     private int MAX_HISTOGRAM_VALUE = 0;
-    @Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, this value will be the max value of the histogram plots")
+    @Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting")
     private boolean DO_INDEL_QUALITY = false;
 
 
diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java
new file mode 100644
index 000000000..9350e4a66
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java
@@ -0,0 +1,4 @@
+/**
+ * Package to plot residual accuracy versus error covariates for the base quality score recalibrator.
+ */
+package org.broadinstitute.sting.analyzecovariates;
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java
index fc6b3daee..9b0824ed0 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java
@@ -1,3 +1,28 @@
+/*
+ * Copyright (c) 2011 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
 package org.broadinstitute.sting.gatk.walkers.recalibration;
 
 import org.broadinstitute.sting.commandline.Gatherer;
@@ -12,11 +37,8 @@ import java.util.List;
 import java.util.regex.Pattern;
 
 /**
- * Created by IntelliJ IDEA.
  * User: carneiro
  * Date: 3/29/11
- * Time: 3:54 PM
- * To change this template use File | Settings | File Templates.
  */
 
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
index b4739f366..5ffc61fe3 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
@@ -50,22 +50,47 @@ import java.util.List;
 import java.util.Map;
 
 /**
- * First pass of the recalibration. Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
+ * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
  *
- * This walker is designed to work as the first pass in a two-pass processing step.
- * It does a by-locus traversal operating only at sites that are not in dbSNP.
- * We assume that all reference mismatches we see are therefore errors and indicative of poor base quality.
- * This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinucleotide)
- * Since there is a large amount of data one can then calculate an empirical probability of error
- *   given the particular covariates seen at this site, where p(error) = num mismatches / num observations
- * The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score)
- * The first non-comment line of the output file gives the name of the covariates that were used for this calculation.
+ * <p>
+ * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
+ * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
+ * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
+ * reported quality score, cycle, and dinucleotide). Since there is a large amount of data one can then calculate an empirical
+ * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
+ * The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score).
+ * <p>
+ * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified.
  *
- * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified
- * Note: This walker is designed to be used in conjunction with TableRecalibrationWalker.
+ * <p>
+ * See the GATK wiki for a tutorial and example recalibration accuracy plots.
+ * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
+ *
+ * <h2>Input</h2>
+ * <p>
+ * A database of known polymorphic sites to skip over.
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * A recalibration table file in CSV format that is used by the TableRecalibration walker.
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
+ *   -knownSites another/optional/setOfSitesToMask.vcf \
+ *   -I my_reads.bam \
+ *   -T CountCovariates \
+ *   -cov ReadGroupCovariate \
+ *   -cov QualityScoreCovariate \
+ *   -cov CycleCovariate \
+ *   -cov DinucCovariate \
+ *   -recalFile my_reads.recal_data.csv
+ * </pre>
  *
- * @author rpoplin
- * @since Nov 3, 2009
  */
 
 @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
@@ -96,8 +121,13 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
      */
     @Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false)
     public List<RodBinding<Feature>> knownSites = Collections.emptyList();
-    @Output
-    PrintStream out;
+
+    /**
+     * After the header, data records occur one per line until the end of the file. The first several items on a line are the
+     * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
+     * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
+     * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
+     */
     @Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file")
     @Gather(CountCovariatesGatherer.class)
     public PrintStream RECAL_FILE;
@@ -114,6 +144,10 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
     /////////////////////////////
     @Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
     private boolean DONT_SORT_OUTPUT = false;
+
+    /**
+     * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
+     */
     @Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
     private boolean RUN_WITHOUT_DBSNP = false;
 
@@ -178,11 +212,11 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
 
         // Print and exit if that's what was requested
         if ( LIST_ONLY ) {
-            out.println( "Available covariates:" );
+            logger.info( "Available covariates:" );
             for( Class<?> covClass : covariateClasses ) {
-                out.println( covClass.getSimpleName() );
+                logger.info( covClass.getSimpleName() );
             }
-            out.println();
+            logger.info("");
 
             System.exit( 0 ); // Early exit here because user requested it
         }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
index e6d0b306c..ac25d4f13 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
@@ -66,15 +66,22 @@ public class RecalDataManager {
     private static boolean warnUserNullPlatform = false;
 
     public enum SOLID_RECAL_MODE {
+        /** Treat reference inserted bases as reference matching bases. Very unsafe! */
         DO_NOTHING,
+        /** Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. */
         SET_Q_ZERO,
+        /** In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. */
         SET_Q_ZERO_BASE_N,
+        /** Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. */
         REMOVE_REF_BIAS
     }
 
     public enum SOLID_NOCALL_STRATEGY {
+        /** When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. */
         THROW_EXCEPTION,
+        /** Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. */
         LEAVE_READ_UNRECALIBRATED,
+        /** Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. */
         PURGE_READ
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java
index 0e7f7d111..f31e2fc5b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java
@@ -51,12 +51,27 @@ public class RecalibrationArgumentCollection {
     public String FORCE_PLATFORM = null;
     @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
     public int WINDOW_SIZE = 5;
+
+    /**
+     * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
+     */
     @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false)
     public int HOMOPOLYMER_NBACK = 7;
     @Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false)
     public boolean EXCEPTION_IF_NO_TILE = false;
+
+    /**
+     * CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
+     * reads which have had the reference inserted because of color space inconsistencies.
+     */
     @Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
     public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO;
+
+    /**
+     * CountCovariates and TableRecalibration accept a --solid_nocall_strategy <MODE> flag which governs how the recalibrator handles
+     * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
+     * their color space tag can not be recalibrated.
+     */
     @Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false)
     public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
index a044abecb..af7148803 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
@@ -52,19 +52,38 @@ import java.util.ResourceBundle;
 import java.util.regex.Pattern;
 
 /**
- * Second pass of the recalibration. Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
+ * Second pass of the recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
  *
- * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal.
+ * <p>
+ * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each
+ * base in each read this walker calculates various user-specified covariates (such as read group, reported quality score,
+ * cycle, and dinuc). Using these values as a key in a large hashmap the walker calculates an empirical base quality score
+ * and overwrites the quality score currently in the read. This walker then outputs a new bam file with these updated (recalibrated) reads.
  *
- * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
- * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read.
- * This walker then outputs a new bam file with these updated (recalibrated) reads.
+ * <p>
+ * See the GATK wiki for a tutorial and example recalibration accuracy plots.
+ * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
  *
- * Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker.
- * Note: This walker is designed to be used in conjunction with CovariateCounterWalker.
+ * <h2>Input</h2>
+ * <p>
+ * The recalibration table file in CSV format that was generated by the CountCovariates walker.
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * A bam file in which the quality scores in each read have been recalibrated.
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -I my_reads.bam \
+ *   -T TableRecalibration \
+ *   -o my_reads.recal.bam \
+ *   -recalFile my_reads.recal_data.csv
+ * </pre>
  *
- * @author rpoplin
- * @since Nov 3, 2009
  */
 
 @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@@ -79,24 +98,54 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
     /////////////////////////////
     @ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
 
-    @Input(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file")
-    public File RECAL_FILE = new File("output.recal_data.csv");
-
     /////////////////////////////
     // Command Line Arguments
     /////////////////////////////
-    @Argument(fullName="output_bam", shortName="outputBam", doc="Please use --out instead", required=false)
-    @Deprecated
-    protected String outbam;
-
-    @Output(doc="The output BAM file", required=true)
+    /**
+     * After the header, data records occur one per line until the end of the file. The first several items on a line are the
+     * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
+     * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
+     * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
+     */
+    @Input(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the input covariates table recalibration .csv file")
+    public File RECAL_FILE = null;
+    /**
+     * A new bam file in which the quality scores in each read have been recalibrated. The alignment of the reads is left untouched.
+     */
+    @Output(doc="The output recalibrated BAM file", required=true)
     private StingSAMFileWriter OUTPUT_BAM = null;
-    @Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
+
+    /**
+     * TableRacalibration accepts a --preserve_qscores_less_than / -pQ <Q> flag that instructs TableRecalibration to not modify
+     * quality scores less than <Q> but rather just write them out unmodified in the recalibrated BAM file. This is useful
+     * because Solexa writes Q2 and Q3 bases when the machine has really gone wrong. This would be fine in and of itself,
+     * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect,
+     * your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases
+     * are unmodified during recalibration, so they don't get inappropriately evaluated.
+     */
+    @Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
     private int PRESERVE_QSCORES_LESS_THAN = 5;
-    @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1")
+
+    /**
+     * By default TableRecalibration applies a Yates' correction to account for overfitting when it calculates the empirical
+     * quality score, in particular, ( # mismatches + 1 ) / ( # observations + 1 ). TableRecalibration accepts a --smoothing / -sm <int>
+     * argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example,
+     * --smoothing 15 for a large amount of smoothing.
+     */
+    @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
     private int SMOOTHING = 1;
-    @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default=50")
+
+    /**
+     * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
+     * by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
+     */
+    @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores")
     private int MAX_QUALITY_SCORE = 50;
+
+    /**
+     * By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun
+     * the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag.
+     */
     @Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read")
     private boolean DO_NOT_WRITE_OQ = false;
 

From a45498150a419d1e3a0fd8814e13793cdc5f7412 Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Thu, 18 Aug 2011 11:18:29 -0400
Subject: [PATCH 13/13] Remove non-ascii char

---
 .../sting/gatk/walkers/indels/RealignerTargetCreator.java       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
index 08ed1af52..145d0327c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
@@ -57,7 +57,7 @@ import java.util.List;
  * <p>
  * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
  * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
- * or deletion (indels) in the individual�s genome with respect to the reference genome.  Such alignment artifacts result in many bases mismatching
+ * or deletion (indels) in the individual's genome with respect to the reference genome.  Such alignment artifacts result in many bases mismatching
  * the reference near the misalignment, which are easily mistaken as SNPs.  Moreover, since read mapping algorithms operate on each read independently,
  * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads.  Consequently, even when some reads are
  * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel,

	ALT	REF	Predictive Value
called alt	True Positive (TP)	False Positive (FP)	Positive PV
called ref	False Negative (FN)	True Negative (TN)	Negative PV