From 60ebe68aff12290f18527faed04f3f7bb356d962 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 12 Sep 2011 09:43:23 -0400 Subject: [PATCH 1/6] Fixing issue in VariantEval in which insertion and deletion events weren't treated symmetrically. Added new option to require strict allele matching. --- .../gatk/walkers/varianteval/VariantEvalWalker.java | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 0d09b7033..266b97af0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -55,7 +55,7 @@ import java.util.*; * *

Output

*

- * Evaluation tables. + * Evaluation tables detailing the results of the eval modules which were applied. *

* *

Examples

@@ -152,6 +152,9 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false) private File ancestralAlignmentsFile = null; + @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false) + private boolean requireStrictAlleleMatch = false; + // Variables private Set jexlExpressions = new TreeSet(); @@ -360,16 +363,16 @@ public class VariantEvalWalker extends RodWalker implements Tr if ( matchingComps.size() == 0 ) return null; - // find the comp which matches the alternate allele from eval + // find the comp which matches both the reference allele and alternate allele from eval Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0); for ( VariantContext comp : matchingComps ) { Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0); - if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp)) ) + if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())) ) return comp; } - // if none match, just return the first one - return matchingComps.get(0); + // if none match, just return the first one unless we require a strict match + return (requireStrictAlleleMatch ? null : matchingComps.get(0)); } public Integer treeReduce(Integer lhs, Integer rhs) { return null; } From 981b78ea50708139cc3157ccff990fca6cb3e7e8 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 12 Sep 2011 12:17:43 -0400 Subject: [PATCH 2/6] Changing the VQSR command line syntax back to the parsed tags approach. This cleans up the code and makes sure we won't be parsing the same rod file multiple times. I've tried to update the appropriate qscripts. --- .../variantrecalibration/TrainingSet.java | 76 ++++++++++++++++ .../VariantDataManager.java | 88 +++++++++---------- .../VariantRecalibrator.java | 66 ++++---------- ...ntRecalibrationWalkersIntegrationTest.java | 13 ++- .../MethodsDevelopmentCallingPipeline.scala | 10 +-- 5 files changed, 147 insertions(+), 106 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java new file mode 100755 index 000000000..5f688d001 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 3/12/11 + */ + +public class TrainingSet { + + public RodBinding rodBinding; + public boolean isKnown = false; + public boolean isTraining = false; + public boolean isAntiTraining = false; + public boolean isTruth = false; + public boolean isConsensus = false; + public double prior = 0.0; + + protected final static Logger logger = Logger.getLogger(TrainingSet.class); + + public TrainingSet( final RodBinding rodBinding) { + this.rodBinding = rodBinding; + + final Tags tags = rodBinding.getTags(); + final String name = rodBinding.getName(); + + // Parse the tags to decide which tracks have which properties + if( tags != null ) { + isKnown = tags.containsKey("known") && tags.getValue("known").equals("true"); + isTraining = tags.containsKey("training") && tags.getValue("training").equals("true"); + isAntiTraining = tags.containsKey("bad") && tags.getValue("bad").equals("true"); + isTruth = tags.containsKey("truth") && tags.getValue("truth").equals("true"); + isConsensus = tags.containsKey("consensus") && tags.getValue("consensus").equals("true"); + prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior ); + } + + // Report back to the user which tracks were found and the properties that were detected + if( !isConsensus && !isAntiTraining ) { + logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) ); + } else if( isConsensus ) { + logger.info( String.format( "Found consensus track: %s", name) ); + } else { + logger.info( String.format( "Found bad sites training track: %s", name) ); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 429becfc7..e04bfab76 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -51,10 +51,10 @@ public class VariantDataManager { private ExpandingArrayList data; private final double[] meanVector; private final double[] varianceVector; // this is really the standard deviation - public final ArrayList annotationKeys; + public final List annotationKeys; private final VariantRecalibratorArgumentCollection VRAC; protected final static Logger logger = Logger.getLogger(VariantDataManager.class); - + protected final List trainingSets; public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) { this.data = null; @@ -62,6 +62,7 @@ public class VariantDataManager { this.VRAC = VRAC; meanVector = new double[this.annotationKeys.size()]; varianceVector = new double[this.annotationKeys.size()]; + trainingSets = new ArrayList(); } public void setData( final ExpandingArrayList data ) { @@ -104,6 +105,31 @@ public class VariantDataManager { } } + public void addTrainingSet( final TrainingSet trainingSet ) { + trainingSets.add( trainingSet ); + } + + public boolean checkHasTrainingSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTraining ) { return true; } + } + return false; + } + + public boolean checkHasTruthSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTruth ) { return true; } + } + return false; + } + + public boolean checkHasKnownSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isKnown ) { return true; } + } + return false; + } + public ExpandingArrayList getTrainingData() { final ExpandingArrayList trainingData = new ExpandingArrayList(); for( final VariantDatum datum : data ) { @@ -232,57 +258,35 @@ public class VariantDataManager { return value; } - public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap rodToPriorMap, - final List> training, final List> truth, final List> known, final List> badSites, final List> resource) { + public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) { datum.isKnown = false; datum.atTruthSite = false; datum.atTrainingSite = false; datum.atAntiTrainingSite = false; datum.prior = 2.0; - //BUGBUG: need to clean this up - - for( final RodBinding rod : training ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { + for( final TrainingSet trainingSet : trainingSets ) { + for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) { if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.atTrainingSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); + datum.isKnown = datum.isKnown || trainingSet.isKnown; + datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth; + datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining; + datum.prior = Math.max( datum.prior, trainingSet.prior ); + datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 ); } - } - } - for( final RodBinding rod : truth ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.atTruthSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : known ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.isKnown = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : resource ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : badSites ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { if( trainVC != null ) { - datum.atAntiTrainingSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); + datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining; } } } } + private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { + return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && + ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && + (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()); + } + public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { for( final VariantDatum datum : data ) { RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s", @@ -290,10 +294,4 @@ public class VariantDataManager { (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"))); } } - - private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { - return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && - ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && - (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index df4faebd1..529d17285 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -77,16 +77,15 @@ import java.util.*; *

* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. * - *

Examples

+ *

Example

*
  * java -Xmx4g -jar GenomeAnalysisTK.jar \
  *   -T VariantRecalibrator \
  *   -R reference/human_g1k_v37.fasta \
  *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
- *   -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
- *   -known:prior=8.0 dbsnp_132.b37.vcf \
+ *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
+ *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ *   -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \
  *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
  *   -recalFile path/to/output.recal \
  *   -tranchesFile path/to/output.tranches \
@@ -112,34 +111,11 @@ public class VariantRecalibrator extends RodWalker> input;
 
     /**
-     * Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
-     */
-    @Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
-    public List> training;
-
-    /**
-     * When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
-     * Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
-     */
-    @Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
-    public List> truth;
-
-    /**
-     * The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
-     * The output metrics are stratified by known status in order to aid in comparisons with other call sets.
-     */
-    @Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
-    public List> known = Collections.emptyList();
-
-    /**
-     * In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
-     * with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
-     */
-    @Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false)
-    public List> badSites = Collections.emptyList();
-
-    /**
-     * Any set of sites for which you would like to apply a prior probability but for which you don't want to use as training, truth, or known sites.
+     * Any set of VCF files to use as lists of training, truth, or known sites.
+     * Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
+     * Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
+     * Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
+     * Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants.
      */
     @Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false)
     public List> resource = Collections.emptyList();
@@ -205,7 +181,6 @@ public class VariantRecalibrator extends RodWalker ignoreInputFilterSet = new TreeSet();
     private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
-    private final HashMap rodToPriorMap = new HashMap();
 
     //---------------------------------------------------------------------------------------------------------------
     //
@@ -227,18 +202,15 @@ public class VariantRecalibrator extends RodWalker> allInputBindings = new ArrayList>();
-        allInputBindings.addAll(truth);
-        allInputBindings.addAll(training);
-        allInputBindings.addAll(known);
-        allInputBindings.addAll(badSites);
-        allInputBindings.addAll(resource);
-        for( final RodBinding rod : allInputBindings ) {
-            try {
-                rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) );
-            } catch( NumberFormatException e ) {
-                throw new UserException.BadInput("Bad rod binding syntax. Prior key-value tag detected but isn't parsable. Expecting something like -training:prior=12.0 my.set.vcf");
-            }
+        for( RodBinding rod : resource ) {
+            dataManager.addTrainingSet( new TrainingSet( rod ) );
+        }
+
+        if( !dataManager.checkHasTrainingSet() ) {
+            throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
+        }
+        if( !dataManager.checkHasTruthSet() ) {
+            throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
         }
     }
 
@@ -270,7 +242,7 @@ public class VariantRecalibrator extends RodWalker= 10) {

From 9d9d438bc4b1804631126565ad15f4a69f649b0b Mon Sep 17 00:00:00 2001
From: David Roazen 
Date: Mon, 12 Sep 2011 12:28:23 -0400
Subject: [PATCH 3/6] New VariantAnnotatorEngine capability: an initialize()
 method for all annotation classes.

All VariantAnnotator annotation classes may now have an (optional) initialize() method
that gets called by the VariantAnnotatorEngine ONCE before annotation starts.

As an example of how this can be used, the SnpEff annotation class will use the initialize()
method to check whether the SnpEff version number stored in the vcf header is a supported
version, and also to verify that its required RodBinding is present.
---
 .../gatk/walkers/annotator/VariantAnnotator.java  |  3 +++
 .../walkers/annotator/VariantAnnotatorEngine.java | 15 +++++++++++----
 .../interfaces/AnnotatorCompatibleWalker.java     |  1 +
 .../interfaces/VariantAnnotatorAnnotation.java    |  9 +++------
 .../gatk/walkers/genotyper/UnifiedGenotyper.java  |  1 +
 5 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
index 96a400c68..971727727 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@@ -86,6 +86,7 @@ public class VariantAnnotator extends RodWalker implements Ann
 
     @ArgumentCollection
     protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
+    public RodBinding getVariantRodBinding() { return variantCollection.variants; }
 
     /**
      * The INFO field will be annotated with information on the most biologically-significant effect
@@ -208,6 +209,8 @@ public class VariantAnnotator extends RodWalker implements Ann
             engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this);
         engine.initializeExpressions(expressionsToUse);
 
+        engine.invokeAnnotationInitializationMethods();
+
         // setup the header fields
         // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
         Set hInfo = new HashSet();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
index 01926a7f3..17830f129 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
@@ -29,10 +29,7 @@ import org.broadinstitute.sting.commandline.RodBinding;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
 import org.broadinstitute.sting.utils.codecs.vcf.*;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.variantcontext.Genotype;
@@ -113,6 +110,16 @@ public class VariantAnnotatorEngine {
             dbAnnotations.put(rod, rod.getName());
     }
 
+    public void invokeAnnotationInitializationMethods() {
+        for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
+            annotation.initialize(walker);
+        }
+
+        for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
+            annotation.initialize(walker);
+        }
+    }
+
     public Set getVCFAnnotationDescriptions() {
 
         Set descriptions = new HashSet();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
index 20a2aea0e..9dda57ae3 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
@@ -9,6 +9,7 @@ import java.util.List;
 public interface AnnotatorCompatibleWalker {
 
     // getter methods for various used bindings
+    public abstract RodBinding getVariantRodBinding();
     public abstract RodBinding getSnpEffRodBinding();
     public abstract RodBinding getDbsnpRodBinding();
     public abstract List> getCompRodBindings();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
index f33d61df9..9e48de9c3 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
@@ -24,18 +24,15 @@
 
 package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
 
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
 import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 
 import java.util.List;
-import java.util.Map;
 
 @DocumentedGATKFeature(enable = true, groupName = "VariantAnnotator annotations", summary = "VariantAnnotator annotations")
 public abstract class VariantAnnotatorAnnotation {
     // return the INFO keys
     public abstract List getKeyNames();
+
+    // initialization method (optional for subclasses, and therefore non-abstract)
+    public void initialize ( AnnotatorCompatibleWalker walker ) { }
 }
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
index d5dbdedd6..4ee2d5f44 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@@ -127,6 +127,7 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; }
+    public RodBinding getVariantRodBinding() { return null; }
     public RodBinding getSnpEffRodBinding() { return null; }
     public List> getCompRodBindings() { return Collections.emptyList(); }
     public List> getResourceRodBindings() { return Collections.emptyList(); }

From ec4b30de6d8e9634ca0014c65215460bda066b64 Mon Sep 17 00:00:00 2001
From: Eric Banks 
Date: Mon, 12 Sep 2011 14:45:53 -0400
Subject: [PATCH 4/6] Patch from Laurent: typo leads to bad error messages.

---
 .../sting/commandline/ArgumentTypeDescriptor.java               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
index 16358d05f..5fff8f609 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
@@ -379,7 +379,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
                     }
 
                     if ( tribbleType == null )
-                        if ( ! file.canRead() | !! file.isFile() ) {
+                        if ( ! file.canRead() | ! file.isFile() ) {
                             throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file);
                         } else {
                             throw new UserException.CommandLineException(

From 4e116760f4f8577bfaf677ed7711cfc599a6c90d Mon Sep 17 00:00:00 2001
From: Eric Banks 
Date: Mon, 12 Sep 2011 15:09:25 -0400
Subject: [PATCH 5/6] Removing some old cruft from the packages dir.  Updating
 AnalyzeCovariates to include all Covariates.

---
 public/packages/AnalyzeCovariates.xml         |  5 +---
 .../packages/FindContaminatingReadGroups.xml  | 10 -------
 public/packages/GATKResources.xml             | 20 --------------
 public/packages/IndelGenotyper.xml            | 11 --------
 .../packages/LocalRealignmentAroundIndels.xml | 12 ---------
 .../packages/QualityScoresRecalibration.xml   | 18 -------------
 public/packages/RMDIndexer.xml                | 13 ----------
 public/packages/UnifiedGenotyper.xml          | 11 --------
 public/packages/VariantAnnotator.xml          | 26 -------------------
 public/packages/VariantEval.xml               | 18 -------------
 public/packages/VariantFiltration.xml         | 13 ----------
 public/packages/VariantRecalibration.xml      | 12 ---------
 12 files changed, 1 insertion(+), 168 deletions(-)
 delete mode 100644 public/packages/FindContaminatingReadGroups.xml
 delete mode 100755 public/packages/GATKResources.xml
 delete mode 100644 public/packages/IndelGenotyper.xml
 delete mode 100644 public/packages/LocalRealignmentAroundIndels.xml
 delete mode 100644 public/packages/QualityScoresRecalibration.xml
 delete mode 100644 public/packages/RMDIndexer.xml
 delete mode 100644 public/packages/UnifiedGenotyper.xml
 delete mode 100644 public/packages/VariantAnnotator.xml
 delete mode 100644 public/packages/VariantEval.xml
 delete mode 100644 public/packages/VariantFiltration.xml
 delete mode 100644 public/packages/VariantRecalibration.xml

diff --git a/public/packages/AnalyzeCovariates.xml b/public/packages/AnalyzeCovariates.xml
index 7e31934df..a6675a63d 100644
--- a/public/packages/AnalyzeCovariates.xml
+++ b/public/packages/AnalyzeCovariates.xml
@@ -6,10 +6,7 @@
     
       
       
-      
-      
-      
-      
+      
     
   
   
diff --git a/public/packages/FindContaminatingReadGroups.xml b/public/packages/FindContaminatingReadGroups.xml
deleted file mode 100644
index 880f64a81..000000000
--- a/public/packages/FindContaminatingReadGroups.xml
+++ /dev/null
@@ -1,10 +0,0 @@
-
-
-  
-    
-    
-    
-      
-    
-  
-
diff --git a/public/packages/GATKResources.xml b/public/packages/GATKResources.xml
deleted file mode 100755
index 87e6e0e50..000000000
--- a/public/packages/GATKResources.xml
+++ /dev/null
@@ -1,20 +0,0 @@
-
-
-  
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-   
-
diff --git a/public/packages/IndelGenotyper.xml b/public/packages/IndelGenotyper.xml
deleted file mode 100644
index c9e3ae0f6..000000000
--- a/public/packages/IndelGenotyper.xml
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-    
-  
-
diff --git a/public/packages/LocalRealignmentAroundIndels.xml b/public/packages/LocalRealignmentAroundIndels.xml
deleted file mode 100644
index 46960e69f..000000000
--- a/public/packages/LocalRealignmentAroundIndels.xml
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-    
-  
-
diff --git a/public/packages/QualityScoresRecalibration.xml b/public/packages/QualityScoresRecalibration.xml
deleted file mode 100644
index 95e8b7c63..000000000
--- a/public/packages/QualityScoresRecalibration.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-      
-      
-      
-      
-      
-      
-    
-  
-
diff --git a/public/packages/RMDIndexer.xml b/public/packages/RMDIndexer.xml
deleted file mode 100644
index 5d40876de..000000000
--- a/public/packages/RMDIndexer.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-      
-    
-  
-
diff --git a/public/packages/UnifiedGenotyper.xml b/public/packages/UnifiedGenotyper.xml
deleted file mode 100644
index 67a17640c..000000000
--- a/public/packages/UnifiedGenotyper.xml
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-    
-  
-
diff --git a/public/packages/VariantAnnotator.xml b/public/packages/VariantAnnotator.xml
deleted file mode 100644
index 88c0701f0..000000000
--- a/public/packages/VariantAnnotator.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-    
-  
-
diff --git a/public/packages/VariantEval.xml b/public/packages/VariantEval.xml
deleted file mode 100644
index 791066fb7..000000000
--- a/public/packages/VariantEval.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-      
-      
-      
-      
-      
-      
-    
-  
-
diff --git a/public/packages/VariantFiltration.xml b/public/packages/VariantFiltration.xml
deleted file mode 100644
index 48fa0ff37..000000000
--- a/public/packages/VariantFiltration.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-      
-    
-  
-
diff --git a/public/packages/VariantRecalibration.xml b/public/packages/VariantRecalibration.xml
deleted file mode 100644
index 6fe6b1eff..000000000
--- a/public/packages/VariantRecalibration.xml
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-    
-  
-

From e63d9d8f8edeaab24ad3bff06a97cc15b14b0043 Mon Sep 17 00:00:00 2001
From: Matt Hanna 
Date: Mon, 12 Sep 2011 21:50:59 -0400
Subject: [PATCH 6/6] Mauricio pointed out to me that dynamic merging the
 unmapped regions of multiple BAMs ('-L unmapped' with a BAM list) was
 completely broken.  Sorry about this!  Fixed.

---
 .../gatk/datasources/reads/BAMScheduler.java  |  3 +-
 .../gatk/datasources/reads/GATKBAMIndex.java  | 39 +++++++++++++++++++
 .../datasources/reads/ReadShardStrategy.java  | 21 ++--------
 3 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
index 467aebac5..47eb55b28 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
@@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
 
 import net.sf.picard.util.PeekableIterator;
 import net.sf.samtools.GATKBAMFileSpan;
+import net.sf.samtools.GATKChunk;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocSortedSet;
 
@@ -84,7 +85,7 @@ public class BAMScheduler implements Iterator {
             if(currentLocus == GenomeLoc.UNMAPPED) {
                 nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
                 for(SAMReaderID id: dataSource.getReaderIDs())
-                    nextFilePointer.addFileSpans(id,new GATKBAMFileSpan());
+                    nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
                 currentLocus = null;
                 continue;
             }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
index 5d0c38b78..dc703ff23 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
@@ -215,6 +215,45 @@ public class GATKBAMIndex {
         return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
     }
 
+    /**
+     * Use to get close to the unmapped reads at the end of a BAM file.
+     * @return The file offset of the first record in the last linear bin, or -1
+     * if there are no elements in linear bins (i.e. no mapped reads).
+     */
+    public long getStartOfLastLinearBin() {
+        openIndexFile();
+
+        seek(4);
+
+        final int sequenceCount = readInteger();
+        // Because no reads may align to the last sequence in the sequence dictionary,
+        // grab the last element of the linear index for each sequence, and return
+        // the last one from the last sequence that has one.
+        long lastLinearIndexPointer = -1;
+        for (int i = 0; i < sequenceCount; i++) {
+            // System.out.println("# Sequence TID: " + i);
+            final int nBins = readInteger();
+            // System.out.println("# nBins: " + nBins);
+            for (int j1 = 0; j1 < nBins; j1++) {
+                // Skip bin #
+                skipBytes(4);
+                final int nChunks = readInteger();
+                // Skip chunks
+                skipBytes(16 * nChunks);
+            }
+            final int nLinearBins = readInteger();
+            if (nLinearBins > 0) {
+                // Skip to last element of list of linear bins
+                skipBytes(8 * (nLinearBins - 1));
+                lastLinearIndexPointer = readLongs(1)[0];
+            }
+        }
+
+        closeIndexFile();
+
+        return lastLinearIndexPointer;
+    }
+
     /**
      * Gets the possible number of bins for a given reference sequence.
      * @return How many bins could possibly be used according to this indexing scheme to index a single contig.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java
index c2235ec73..5ea75dbb0 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java
@@ -134,24 +134,11 @@ public class ReadShardStrategy implements ShardStrategy {
             Map selectedReaders = new HashMap();
             while(selectedReaders.size() == 0 && currentFilePointer != null) {
                 shardPosition = currentFilePointer.fileSpans;
+
                 for(SAMReaderID id: shardPosition.keySet()) {
-                    // If the region contains location information (in other words, it is not at
-                    // the start of the unmapped region), add the region.
-                    if(currentFilePointer.isRegionUnmapped) {
-                        // If the region is unmapped and no location data exists, add a null as an indicator to
-                        // start at the next unmapped region.
-                        if(!isIntoUnmappedRegion) {
-                            selectedReaders.put(id,null);
-                            isIntoUnmappedRegion = true;
-                        }
-                        else
-                            selectedReaders.put(id,position.get(id));
-                    }
-                    else {
-                        SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
-                        if(!fileSpan.isEmpty())
-                            selectedReaders.put(id,fileSpan);
-                    }
+                    SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
+                    if(!fileSpan.isEmpty())
+                        selectedReaders.put(id,fileSpan);
                 }
 
                 if(selectedReaders.size() > 0) {