From 671330950d6c0249f92e37b0dd68dea58ced23e8 Mon Sep 17 00:00:00 2001
From: Guillermo del Angel <delangel@broadinstitute.org>
Date: Wed, 17 Aug 2011 15:46:31 -0400
Subject: [PATCH 1/7] Updated Beagle walker for gatkdocs format. Pushed
 unsupported, undocumented arguments to @Hidden

---
 .../beagle/BeagleOutputToVCFWalker.java       | 37 +++++++++++----
 .../beagle/ProduceBeagleInputWalker.java      | 45 ++++++++++++++++---
 2 files changed, 67 insertions(+), 15 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
index 40e6748ed..aca176bc2 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
@@ -48,6 +48,31 @@ import static java.lang.Math.log10;
 
 /**
  * Takes files produced by Beagle imputation engine and creates a vcf with modified annotations.
+ *
+ * <p>This walker is intended to be run after Beagle has successfully executed. The full calling sequence for using Beagle along with the GATK is:</p>
+ *
+ * <p>1. Run ProduceBeagleInputWalker.  </p>
+ * <p>2. Run Beagle</p>
+ * <p>3. Uncompress output files</p>
+ * <p>4. Run BeagleOutputToVCFWalker.</p>
+ *
+ *
+ * Note that this walker requires all input files produced by Beagle.
+ *
+ *
+ * <h2>Example</h2>
+ * <pre>
+ *     java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
+ *      -R reffile.fasta -T BeagleOutputToVCF \
+ *      -B:variant,VCF input_vcf.vcf \
+ *      -B:beagleR2,BEAGLE /myrun.beagle_output.r2 \
+ *      -B:beaglePhased,BEAGLE /myrun.beagle_output.phased \
+ *      -B:beagleProbs,BEAGLE /myrun.beagle_output.gprobs \
+ *      --out output_vcf.vcf
+ *      </pre>
+
+ <p> Note that Beagle produces some of these files compressed as .gz, so gunzip must be run on them before walker is run in order to decompress them </p>
+
  */
 public class BeagleOutputToVCFWalker  extends RodWalker<Integer, Integer> {
 
@@ -57,22 +82,18 @@ public class BeagleOutputToVCFWalker  extends RodWalker<Integer, Integer> {
     @Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false)
     public RodBinding<VariantContext> comp;
 
-    @Input(fullName="beagleR2", shortName = "beagleR2", doc="VCF file", required=true)
+    @Input(fullName="beagleR2", shortName = "beagleR2", doc="Beagle-produced .r2 file containing R^2 values for all markers", required=true)
     public RodBinding<BeagleFeature> beagleR2;
 
-    @Input(fullName="beagleProbs", shortName = "beagleProbs", doc="VCF file", required=true)
+    @Input(fullName="beagleProbs", shortName = "beagleProbs", doc="Beagle-produced .probs file containing posterior genotype probabilities", required=true)
     public RodBinding<BeagleFeature> beagleProbs;
 
-    @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="VCF file", required=true)
+    @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true)
     public RodBinding<BeagleFeature> beaglePhased;
 
-    @Output(doc="File to which variants should be written",required=true)
+    @Output(doc="VCF File to which variants should be written",required=true)
     protected VCFWriter vcfWriter = null;
 
-    @Argument(fullName="output_file", shortName="output", doc="Please use --out instead" ,required=false)
-    @Deprecated
-    protected String oldOutputArg;
-
     @Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic.  Useful for imputing a sample's genotypes from a reference panel" ,required=false)
     public boolean DONT_FILTER_MONOMORPHIC_SITES = false;
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
index c1508cf83..6ac817555 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
@@ -48,19 +48,45 @@ import java.io.PrintStream;
 import java.util.*;
 
 /**
- * Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input variant file
+ *  Converts the input VCF into a format accepted by the Beagle imputation/analysis program.
+ *
+ *
+ * <h2>Input</h2>
+ * <p>
+ * A VCF with variants to convert to Beagle format
+ * </p>
+ *
+ * <h2>Outputs</h2>
+ * <p>
+ * A single text file which can be fed to Beagle
+ * </p>
+ * <p>
+ * Optional: A file with a list of markers
+ * </p>
+  *
+ * <h2>Examples</h2>
+ * <pre>
+ *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
+ *      -R reffile.fasta -T ProduceBeagleInput \
+ *      -B:variant,VCF path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
+ * </pre>
+ *
  */
+
 public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
 
     @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
 
-    @Input(fullName="validation", shortName = "validation", doc="Input VCF file", required=false)
+    @Hidden
+    @Input(fullName="validation", shortName = "validation", doc="Validation VCF file", required=false)
     public RodBinding<VariantContext> validation;
 
+
     @Output(doc="File to which BEAGLE input should be written",required=true)
     protected PrintStream  beagleWriter = null;
 
-    @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false)
+    @Hidden
+     @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false)
     protected PrintStream  markers = null;
     int markerCounter = 1;
 
@@ -73,14 +99,19 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
     @Argument(doc="VQSqual key", shortName = "vqskey", required=false)
     protected String VQSLOD_KEY = "VQSqual";
 
-    @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
+    @Hidden
+     @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
     public double insertedNoCallRate  = 0;
-    @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
+    @Hidden
+     @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
     public double validationPrior = -1.0;
-    @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
+    @Hidden
+     @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
     public double bootstrap = 0.0;
-    @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
+    @Hidden
+     @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
     VCFWriter bootstrapVCFOutput = null;
+
     @Argument(fullName = "checkIsMaleOnChrX", shortName = "checkIsMaleOnChrX", doc = "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.", required = false)
     public boolean CHECK_IS_MALE_ON_CHR_X = false;
 

From 53006da9a55a58f1d7f7f66be3b0466848a6f5fc Mon Sep 17 00:00:00 2001
From: David Roazen <droazen@broadinstitute.org>
Date: Wed, 17 Aug 2011 14:55:32 -0400
Subject: [PATCH 2/7] Improved descriptions for the SnpEff annotations in the
 VCF header (based on Eric's feedback).

---
 .../sting/gatk/walkers/annotator/SnpEff.java  | 26 +++++++++----------
 .../VariantAnnotatorIntegrationTest.java      |  2 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
index fc5014885..350c683c2 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
@@ -161,19 +161,19 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
 
     public List<VCFInfoHeaderLine> getDescriptions() {
         return Arrays.asList(
-            new VCFInfoHeaderLine(GENE_ID_KEY,                  1, VCFHeaderLineType.String,  "Gene ID"),
-            new VCFInfoHeaderLine(GENE_NAME_KEY,                1, VCFHeaderLineType.String,  "Gene name"),
-            new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY,            1, VCFHeaderLineType.String,  "Transcript ID"),
-            new VCFInfoHeaderLine(EXON_ID_KEY,                  1, VCFHeaderLineType.String,  "Exon ID"),
-            new VCFInfoHeaderLine(EXON_RANK_KEY,                1, VCFHeaderLineType.Integer, "Exon rank"),
-            new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY,   0, VCFHeaderLineType.Flag,    "If present, gene is non-coding"),
-            new VCFInfoHeaderLine(EFFECT_KEY,                   1, VCFHeaderLineType.String,  "One of the most high-impact effects across all transcripts at this site"),
-            new VCFInfoHeaderLine(EFFECT_IMPACT_KEY,            1, VCFHeaderLineType.String,  "Impact of the effect " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
-            new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String,  "Additional information about the effect"),
-            new VCFInfoHeaderLine(OLD_NEW_AA_KEY,               1, VCFHeaderLineType.String,  "Old/New amino acid"),
-            new VCFInfoHeaderLine(OLD_NEW_CODON_KEY,            1, VCFHeaderLineType.String,  "Old/New codon"),
-            new VCFInfoHeaderLine(CODON_NUM_KEY,                1, VCFHeaderLineType.Integer, "Codon number"),
-            new VCFInfoHeaderLine(CDS_SIZE_KEY,                 1, VCFHeaderLineType.Integer, "CDS size")
+            new VCFInfoHeaderLine(GENE_ID_KEY,                  1, VCFHeaderLineType.String,  "Gene ID for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(GENE_NAME_KEY,                1, VCFHeaderLineType.String,  "Gene name for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY,            1, VCFHeaderLineType.String,  "Transcript ID for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(EXON_ID_KEY,                  1, VCFHeaderLineType.String,  "Exon ID for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(EXON_RANK_KEY,                1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY,   0, VCFHeaderLineType.Flag,    "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"),
+            new VCFInfoHeaderLine(EFFECT_KEY,                   1, VCFHeaderLineType.String,  "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
+            new VCFInfoHeaderLine(EFFECT_IMPACT_KEY,            1, VCFHeaderLineType.String,  "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
+            new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String,  "Additional information about the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(OLD_NEW_AA_KEY,               1, VCFHeaderLineType.String,  "Old/New amino acid for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(OLD_NEW_CODON_KEY,            1, VCFHeaderLineType.String,  "Old/New codon for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(CODON_NUM_KEY,                1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(CDS_SIZE_KEY,                 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant")
         );
     }
 }
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
index f54bfa40c..832079807 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -133,7 +133,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
             validationDataLocation + "1000G.exomes.vcf --snpEffFile  " + validationDataLocation +
             "snpEff_1.9.6_1000G.exomes.vcf_hg37.61.out -L 1:26,000,000-26,500,000",
             1,
-            Arrays.asList("c08648a078368c80530bff004b3157f1")
+            Arrays.asList("03eae1dab19a9358250890594bf53607")
         );
         executeTest("Testing SnpEff annotations", spec);
     }

From d170187896bc58200a2e827373046b03c4c260cd Mon Sep 17 00:00:00 2001
From: Matt Hanna <hanna@broadinstitute.org>
Date: Wed, 17 Aug 2011 16:16:05 -0400
Subject: [PATCH 3/7] Disable optimization that increases marginal speed of the
 GATK slightly but can produce data loss in a narrow corner case where the
 BGZF block(s) locations and offsets in the last index bucket of contig n
 overlap exactly with the BGZF block locations and offsets in the last index
 bucket of contig n+1.

A proper fix that keeps the optimization has already been introduced into
unstable, but disabling the optimization is a low-risk way to make sure that
users of stable experience no data loss.
---
 .../gatk/datasources/reads/LowMemoryIntervalSharder.java      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
index ba6321121..198f7d7d3 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
@@ -59,8 +59,8 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
      */
     public FilePointer next() {
         FilePointer current = wrappedIterator.next();
-        while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0)
-            current = current.combine(parser,wrappedIterator.next());
+        //while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0)
+        //    current = current.combine(parser,wrappedIterator.next());
         return current;
     }
 

From d59e6ed274c555fb4e8d8198449ec0a6eb90de41 Mon Sep 17 00:00:00 2001
From: Mark DePristo <depristo@broadinstitute.org>
Date: Wed, 17 Aug 2011 16:22:07 -0400
Subject: [PATCH 4/7] Fix for RefSeqCodec bug and better error messages

-- RefSeqCodec bug: getFeatureType() returned RefSeqCodec.class, not RefSeqFeature.class.  Tribble should really be changed to require Class<T extends Feature> here, to get compile-time type checking
-- Better error messages that actually list the available tribble types, when there's a type error
---
 .../commandline/ArgumentTypeDescriptor.java   |  14 +-
 .../refdata/features/refseq/RefSeqCodec.java  | 220 +++++++++---------
 .../sting/utils/text/ListFileUtils.java       |   4 +-
 3 files changed, 119 insertions(+), 119 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
index d1d4ff914..02af884a2 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
@@ -373,16 +373,16 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
                         if ( featureDescriptor != null ) {
                             tribbleType = featureDescriptor.getName();
                             logger.warn("Dynamically determined type of " + file + " to be " + tribbleType);
+                        } else {
+                            throw new UserException.CommandLineException(
+                                    String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " +
+                                            "Please add an explicit type tag :TYPE listing the correct type from among the supported types: %s",
+                                            manager.userFriendlyListOfAvailableFeatures()));
                         }
                     }
                 }
             }
 
-            if ( tribbleType == null ) // error handling
-                throw new UserException.CommandLineException(
-                        String.format("Could not parse argument %s with value %s",
-                                defaultDefinition.fullName, value));
-
             Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class);
             Class parameterType = getParameterizedTypeClass(type);
             RodBinding result = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags);
@@ -395,8 +395,8 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
                             value, source.field.getName()));
         } catch (Exception e) {
             throw new UserException.CommandLineException(
-                    String.format("Failed to parse value %s for argument %s.",
-                            value, source.field.getName()));
+                    String.format("Failed to parse value %s for argument %s. Message: %s",
+                            value, source.field.getName(), e.getMessage()));
         }
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java
index 89ee65532..cec40b5bd 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java
@@ -1,110 +1,110 @@
-package org.broadinstitute.sting.gatk.refdata.features.refseq;
-
-import org.apache.commons.io.filefilter.FalseFileFilter;
-import org.broad.tribble.Feature;
-import org.broad.tribble.TribbleException;
-import org.broad.tribble.readers.LineReader;
-import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-
-import java.util.ArrayList;
-
-/**
- * the ref seq codec
- */
-public class RefSeqCodec implements ReferenceDependentFeatureCodec<RefSeqFeature> {
-
-    /**
-     * The parser to use when resolving genome-wide locations.
-     */
-    private GenomeLocParser genomeLocParser;
-    private boolean zero_coding_length_user_warned = false;
-    /**
-     * Set the parser to use when resolving genetic data.
-     * @param genomeLocParser The supplied parser.
-     */
-    @Override
-    public void setGenomeLocParser(GenomeLocParser genomeLocParser) {
-        this.genomeLocParser =  genomeLocParser;
-    }
-
-    @Override
-    public Feature decodeLoc(String line) {
-        if (line.startsWith("#")) return null;
-        String fields[] = line.split("\t");
-        if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length);
-        String contig_name = fields[2];
-        try {
-            return new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));
-        } catch ( UserException.MalformedGenomeLoc e ) {
-            Utils.warnUser("RefSeq file is potentially incorrect, as some transcripts or exons have a negative length ("+fields[2]+")");
-            return null;
-        }
-    }
-
-    /** Fills this object from a text line in RefSeq (UCSC) text dump file */
-    @Override
-    public RefSeqFeature decode(String line) {
-        if (line.startsWith("#")) return null;
-        String fields[] = line.split("\t");
-
-        // we reference postion 15 in the split array below, make sure we have at least that many columns
-        if (fields.length < 16) throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length);
-        String contig_name = fields[2];
-        RefSeqFeature feature = new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));
-
-        feature.setTranscript_id(fields[1]);
-        if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1);
-        else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1);
-        else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line);
-
-        int coding_start = Integer.parseInt(fields[6])+1;
-        int coding_stop = Integer.parseInt(fields[7]);
-
-        if ( coding_start > coding_stop ) {
-            if ( ! zero_coding_length_user_warned ) {
-                Utils.warnUser("RefSeq file contains transcripts with zero coding length. "+
-                        "Such transcripts will be ignored (this warning is printed only once)");
-                zero_coding_length_user_warned = true;
-            }
-            return null;
-        }
-
-        feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));
-        feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, coding_start, coding_stop));
-        feature.setGene_name(fields[12]);
-        String[] exon_starts = fields[9].split(",");
-        String[] exon_stops = fields[10].split(",");
-        String[] eframes = fields[15].split(",");
-
-        if ( exon_starts.length != exon_stops.length )
-            throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line);
-        if ( exon_starts.length != eframes.length )
-            throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line);
-
-        ArrayList<GenomeLoc> exons = new ArrayList<GenomeLoc>(exon_starts.length);
-        ArrayList<Integer> exon_frames = new ArrayList<Integer>(eframes.length);
-
-        for ( int i = 0 ; i < exon_starts.length  ; i++ ) {
-            exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
-            exon_frames.add(Integer.decode(eframes[i]));
-        }
-
-        feature.setExons(exons);
-        feature.setExon_frames(exon_frames);
-        return feature;
-    }
-
-    @Override
-    public Object readHeader(LineReader reader) {
-        return null;
-    }
-
-    @Override
-    public Class getFeatureType() {
-        return RefSeqCodec.class;
-    }
-}
+package org.broadinstitute.sting.gatk.refdata.features.refseq;
+
+import org.apache.commons.io.filefilter.FalseFileFilter;
+import org.broad.tribble.Feature;
+import org.broad.tribble.TribbleException;
+import org.broad.tribble.readers.LineReader;
+import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+
+import java.util.ArrayList;
+
+/**
+ * the ref seq codec
+ */
+public class RefSeqCodec implements ReferenceDependentFeatureCodec<RefSeqFeature> {
+
+    /**
+     * The parser to use when resolving genome-wide locations.
+     */
+    private GenomeLocParser genomeLocParser;
+    private boolean zero_coding_length_user_warned = false;
+    /**
+     * Set the parser to use when resolving genetic data.
+     * @param genomeLocParser The supplied parser.
+     */
+    @Override
+    public void setGenomeLocParser(GenomeLocParser genomeLocParser) {
+        this.genomeLocParser =  genomeLocParser;
+    }
+
+    @Override
+    public Feature decodeLoc(String line) {
+        if (line.startsWith("#")) return null;
+        String fields[] = line.split("\t");
+        if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length);
+        String contig_name = fields[2];
+        try {
+            return new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));
+        } catch ( UserException.MalformedGenomeLoc e ) {
+            Utils.warnUser("RefSeq file is potentially incorrect, as some transcripts or exons have a negative length ("+fields[2]+")");
+            return null;
+        }
+    }
+
+    /** Fills this object from a text line in RefSeq (UCSC) text dump file */
+    @Override
+    public RefSeqFeature decode(String line) {
+        if (line.startsWith("#")) return null;
+        String fields[] = line.split("\t");
+
+        // we reference position 15 in the split array below, so make sure we have at least that many columns
+        if (fields.length < 16) throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length);
+        String contig_name = fields[2];
+        RefSeqFeature feature = new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));
+
+        feature.setTranscript_id(fields[1]);
+        if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1);
+        else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1);
+        else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line);
+
+        int coding_start = Integer.parseInt(fields[6])+1;
+        int coding_stop = Integer.parseInt(fields[7]);
+
+        if ( coding_start > coding_stop ) {
+            if ( ! zero_coding_length_user_warned ) {
+                Utils.warnUser("RefSeq file contains transcripts with zero coding length. "+
+                        "Such transcripts will be ignored (this warning is printed only once)");
+                zero_coding_length_user_warned = true;
+            }
+            return null;
+        }
+
+        feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));
+        feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, coding_start, coding_stop));
+        feature.setGene_name(fields[12]);
+        String[] exon_starts = fields[9].split(",");
+        String[] exon_stops = fields[10].split(",");
+        String[] eframes = fields[15].split(",");
+
+        if ( exon_starts.length != exon_stops.length )
+            throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line);
+        if ( exon_starts.length != eframes.length )
+            throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line);
+
+        ArrayList<GenomeLoc> exons = new ArrayList<GenomeLoc>(exon_starts.length);
+        ArrayList<Integer> exon_frames = new ArrayList<Integer>(eframes.length);
+
+        for ( int i = 0 ; i < exon_starts.length  ; i++ ) {
+            exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
+            exon_frames.add(Integer.decode(eframes[i]));
+        }
+
+        feature.setExons(exons);
+        feature.setExon_frames(exon_frames);
+        return feature;
+    }
+
+    @Override
+    public Object readHeader(LineReader reader) {
+        return null;
+    }
+
+    @Override
+    public Class getFeatureType() {
+        return RefSeqFeature.class;
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java
index 61d53679a..b0e25e55b 100644
--- a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java
@@ -160,8 +160,8 @@ public class ListFileUtils {
                                 rodBinding.getName(), rodBinding.getTribbleType(), builderForValidation.userFriendlyListOfAvailableFeatures()));
             if ( ! rodBinding.getType().isAssignableFrom(descriptor.getFeatureClass()) )
                 throw new UserException.BadArgumentValue(rodBinding.getName(),
-                        String.format("Field %s expected type %s, but the type of the input file provided on the command line was %s. Please make sure that you have provided the correct file type and/or that you are not binding your rod to a name matching one of the available types.",
-                                rodBinding.getName(), rodBinding.getType(), descriptor.getName()));
+                        String.format("Field %s expected type %s, but the type of the input file provided on the command line was %s producing %s. Please make sure that you have provided the correct file type and/or that you are not binding your rod to a name matching one of the available types.",
+                                rodBinding.getName(), rodBinding.getType(), descriptor.getName(), descriptor.getFeatureClass()));
 
 
             rodBindings.add(triplet);

From c193f52e5de69e8dd837f42aae299a31ed5b016e Mon Sep 17 00:00:00 2001
From: Guillermo del Angel <delangel@broadinstitute.org>
Date: Wed, 17 Aug 2011 16:29:45 -0400
Subject: [PATCH 6/7] Fixed up examples: pasting from wiki still had old rod
 syntax

---
 .../gatk/walkers/beagle/BeagleOutputToVCFWalker.java   | 10 +++++-----
 .../gatk/walkers/beagle/ProduceBeagleInputWalker.java  |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
index aca176bc2..51fe543df 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
@@ -64,11 +64,11 @@ import static java.lang.Math.log10;
  * <pre>
  *     java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
  *      -R reffile.fasta -T BeagleOutputToVCF \
- *      -B:variant,VCF input_vcf.vcf \
- *      -B:beagleR2,BEAGLE /myrun.beagle_output.r2 \
- *      -B:beaglePhased,BEAGLE /myrun.beagle_output.phased \
- *      -B:beagleProbs,BEAGLE /myrun.beagle_output.gprobs \
- *      --out output_vcf.vcf
+ *      -V input_vcf.vcf \
+ *      -beagleR2:BEAGLE /myrun.beagle_output.r2 \
+ *      -beaglePhased:BEAGLE /myrun.beagle_output.phased \
+ *      -beagleProbs:BEAGLE /myrun.beagle_output.gprobs \
+ *      -o output_vcf.vcf
  *      </pre>
 
  <p> Note that Beagle produces some of these files compressed as .gz, so gunzip must be run on them before walker is run in order to decompress them </p>
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
index 6ac817555..07793fd7b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
@@ -68,7 +68,7 @@ import java.util.*;
  * <pre>
  *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
  *      -R reffile.fasta -T ProduceBeagleInput \
- *      -B:variant,VCF path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
+ *      -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
  * </pre>
  *
  */

From 81a792afeb6f6bcdf81437a58417217924aeb61b Mon Sep 17 00:00:00 2001
From: Matt Hanna <hanna@broadinstitute.org>
Date: Wed, 17 Aug 2011 16:58:24 -0400
Subject: [PATCH 7/7] Reverting optimization disable in unstable.

---
 .../gatk/datasources/reads/LowMemoryIntervalSharder.java      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
index 198f7d7d3..ba6321121 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
@@ -59,8 +59,8 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
      */
     public FilePointer next() {
         FilePointer current = wrappedIterator.next();
-        //while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0)
-        //    current = current.combine(parser,wrappedIterator.next());
+        while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0)
+            current = current.combine(parser,wrappedIterator.next());
         return current;
     }