First checkin of GenomicAnnotator which annotates an input VCF file by pulling data in a generic way from an arbitrary set of TabularRODs.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3114 348d0f76-0448-11de-a6fe-93d51630548a
2010-04-02 17:49:42 +00:00 · 2010-04-02 17:49:42 +00:00 · 6b7b07f178
parent 699a0ea9d1
commit 6b7b07f178
5 changed files with 556 additions and 13 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@ -123,7 +123,7 @@ public class VariantAnnotator extends LocusWalker<Integer, Integer> {
     */
    public Integer reduceInit() { return 0; }

-    
+
    /**
     * We want reads that span deletions
     *
@ -154,16 +154,20 @@ public class VariantAnnotator extends LocusWalker<Integer, Integer> {
            return 0;

        // if the reference base is not ambiguous, we can annotate
+        Collection<VariantContext> annotatedVCs = Arrays.asList( new VariantContext[] { vc } );
        if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
            Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
            if ( stratifiedContexts != null ) {
-                vc = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
+                annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
+            }
+        }
+
+        if ( variant instanceof RodVCF ) {
+            for(VariantContext annotatedVC : annotatedVCs ) {
+                    vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()));
            }
        }

-        if ( variant instanceof RodVCF )
-            vcfWriter.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase()));
-        
        return 1;
    }

--- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.playground.gatk.walkers.annotator.GenomicAnnotation;
 import org.broadinstitute.sting.utils.PackageUtils;
 import org.broadinstitute.sting.utils.StingException;
 import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
@ -33,6 +34,12 @@ public class VariantAnnotatorEngine {
    // how about hapmap3?
    private boolean annotateHapmap3 = false;

+    // command-line option used for GenomicAnnotation.
+    private Map<String, Set<String>> requestedColumnsMap;
+
+    // command-line option used for GenomicAnnotation.
+    private boolean explode;
+

    // use this constructor if you want all possible annotations
    public VariantAnnotatorEngine(GenomeAnalysisEngine engine) {
@ -143,7 +150,7 @@ public class VariantAnnotatorEngine {
        return descriptions;
    }

-    public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
+    public Collection<VariantContext> annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {

        Map<String, Object> infoAnnotations = new HashMap<String, Object>(vc.getAttributes());

@ -166,12 +173,50 @@ public class VariantAnnotatorEngine {
            infoAnnotations.put(VCFRecord.HAPMAP3_KEY, hapmap3.size() == 0 ? "0" : "1");
        }

-        for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) {
-            Map<String, Object> result = annotation.annotate(tracker, ref, stratifiedContexts, vc);
-            if ( result != null )
-                infoAnnotations.putAll(result);
+
+        //Process the info field
+        List<Map<String, Object>> infoAnnotationOutputsList = new LinkedList<Map<String, Object>>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file
+        infoAnnotationOutputsList.add(new HashMap<String, Object>(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file gains 1 line.
+
+        //go through all the requested info annotationTypes
+        for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations )
+        {
+            Map<String, Object> annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc);
+            if ( annotationsFromCurrentType == null ) {
+                continue;
+            }
+
+            if(annotationType instanceof GenomicAnnotation)
+            {
+                //go through the annotations returned by GenericAnnotation for each -B input file.
+                for( Map.Entry<String, Object> annotationsFromInputFile : annotationsFromCurrentType.entrySet() )
+                {
+                    final String inputFileBindingName = annotationsFromInputFile.getKey();
+                    final List<Map<String, String>> matchingRecords = (List<Map<String, String>>) annotationsFromInputFile.getValue();
+
+                    if( matchingRecords.size() > 1 && explode)
+                    {
+                        //More than one record matched in this file. After this, infoAnnotationOutputsList.size() will be infoAnnotationOutputsList.size()*matchingRecords.size().
+                        infoAnnotationOutputsList = explodeInfoAnnotationOutputsList( infoAnnotationOutputsList, matchingRecords, inputFileBindingName);
+                    }
+                    else
+                    {
+                        //This doesn't change infoAnnotationOutputsList.size(). If more than one record matched, their annotations will
+                        //all be added to the same output line, with keys disambiguated by appending _i .
+                        addToExistingAnnotationOutputs( infoAnnotationOutputsList, matchingRecords, inputFileBindingName);
+                    }
+                }
+            }
+            else
+            {
+                //add the annotations to each output line.
+                for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
+                    infoAnnotationOutput.putAll(annotationsFromCurrentType);
+                }
+            }
        }

+        //Process genotypes
        Map<String, Genotype> genotypes;
        if ( requestedGenotypeAnnotations.size() == 0 ) {
            genotypes = vc.getGenotypes();
@ -195,6 +240,161 @@ public class VariantAnnotatorEngine {
            }
        }

-        return new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.getFilters(), infoAnnotations);
+      //Create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList
+        Collection<VariantContext> returnValue = new LinkedList<VariantContext>();
+        for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
+            returnValue.add( new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.getFilters(), infoAnnotationOutput) );
+        }
+
+        return returnValue;
    }
-}
+
+
+    /**
+     * Implements non-explode mode, where the output lines have a one-to-one relationship
+     * with the input variants, and all multiple-match records are collapsed into the single info field.
+     * The collapsing is done by appending an _i to each key name (where 'i' is a record counter).
+     *
+     * @param infoAnnotationOutputsList
+     * @param matchingRecords
+     * @param bindingName
+     */
+    private void addToExistingAnnotationOutputs(
+            final List<Map<String, Object>> infoAnnotationOutputsList,
+            final List<Map<String, String>> matchingRecords,
+            final String bindingName) {
+        //For each matching record, just add its annotations to all existing output lines.
+        final boolean renameKeys = matchingRecords.size() > 1;
+        for(int i = 0; i < matchingRecords.size(); i++) {
+            Map<String,String> annotationsForRecord = matchingRecords.get(i);
+            annotationsForRecord = selectColumnsFromRecord(bindingName, annotationsForRecord); //use only those columns that the user specifically requested.
+
+            if(renameKeys) {
+                //Rename keys to avoid naming conflicts (eg. if you have multiple dbsnp matches,
+                // dbSNP.avHet=value1 from record 1 and dbSNP.avHet=value2 from record 2 will become dbSNP.avHet_1=value1 and dbSNP.avHet_2=value2 )
+                Map<String,String> annotationsForRecordWithRenamedKeys = new HashMap<String, String>();
+                for(Map.Entry<String, String> annotation : annotationsForRecord.entrySet()) {
+                    annotationsForRecordWithRenamedKeys.put(annotation.getKey() + "_" + i, annotation.getValue());
+                }
+
+                annotationsForRecord = annotationsForRecordWithRenamedKeys;
+            }
+
+            //Add the annotations from this record to each output line.
+            for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
+                infoAnnotationOutput.putAll(annotationsForRecord);
+            }
+        }
+    }
+
+    /**
+     * Implements "explode" mode. Takes the current list of
+     * infoAnnotationOutputs (each element of will end up in a different line
+     * of the output VCF file), and generates/returns a new list of infoAnnotationOutputs
+     * which contain one copy of the current infoAnnotationOutputs for each record
+     * in matching records. The returned list will have size:
+     *
+     * infoAnnotationOutputsList.size() * matchingRecords.size()
+     *
+     * See class-level comments for more details.
+     *
+     * @param infoAnnotationOutputsList
+     * @param matchingRecords
+     * @param bindingName
+     * @return
+     */
+    private List<Map<String, Object>> explodeInfoAnnotationOutputsList(
+            final List<Map<String, Object>> infoAnnotationOutputsList,
+            final List<Map<String, String>> matchingRecords,
+            final String bindingName) {
+
+
+        //This is the return value. It represents the new list of lines in the output VCF file.
+        final List<Map<String, Object>> newInfoAnnotationOutputsList = new LinkedList<Map<String, Object>>();
+
+        //For each matching record, generate a new output line
+        for(int i = 0; i < matchingRecords.size(); i++) {
+            Map<String,String> annotationsForRecord = matchingRecords.get(i);
+            annotationsForRecord = selectColumnsFromRecord(bindingName, annotationsForRecord); //use only those columns that the user specifically requested.
+
+            //Add the annotations from this record to each output line.
+            for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
+                Map<String, Object> infoAnnotationOutputCopy = new HashMap<String, Object>(infoAnnotationOutput); //create a new copy of this line.
+                infoAnnotationOutputCopy.putAll(annotationsForRecord); //Adds the column-value pairs from this record to this line.
+
+                newInfoAnnotationOutputsList.add(infoAnnotationOutputCopy); //Add the line to the new list of lines.
+            }
+        }
+
+        return newInfoAnnotationOutputsList;
+    }
+
+
+
+    /**
+     * Takes a list of key-value pairs and returns a new Map containing only the columns which were requested by the user
+     * via the -s arg. If there was no -s arg that referenced the given bindingName, all annotationsForRecord returned untouched.
+     *
+     * @param bindingName The binding name for a particular ROD input file.
+     * @param annotationsForRecord The list of column_name -> value pairs for a particular record from the given input file.
+     *
+     * @return Map - see above.
+     */
+    private Map<String, String> selectColumnsFromRecord( String bindingName, Map<String, String> annotationsForRecord) {
+        if(requestedColumnsMap == null || !requestedColumnsMap.containsKey(bindingName)) {
+            return annotationsForRecord;
+        }
+
+        Set<String> requestedColumns = requestedColumnsMap.get(bindingName);
+        Map<String, String> subsettedAnnotations = new HashMap<String, String>();
+        for(Map.Entry<String, String> e : annotationsForRecord.entrySet() ) {
+            if(requestedColumns.contains(e.getKey())) {
+                subsettedAnnotations.put(e.getKey(), e.getValue());
+            }
+        }
+
+        if(subsettedAnnotations.isEmpty()) {
+            throw new StingException("Invalid -s argument for the '" + bindingName + "' input file. " +
+                    "It caused all columns in the file to be rejected. Please check to make sure the -s column " +
+                    "names match the column names in the '" + bindingName + "' file's HEADER line.");
+        }
+
+        return subsettedAnnotations;
+    }
+
+
+
+    /**
+     * Determines how the engine will handle the case where multiple records in a ROD file
+     * overlap a particular single locus. If explode is set to true, the output will be
+     * one-to-many, so that each locus in the input VCF file could result in multiple
+     * entries in the output VCF file. Otherwise, the output will be one-to-one, and
+     * all multiple-match records will be collapsed into the single info field.
+     * The collapsing is done by appending an _i to each key name (where 'i' is a
+     * record counter).
+     *
+     * See class-level comments for more details.
+     *
+     * @param explode
+     */
+    public void setExplode(boolean explode) {
+        this.explode = explode;
+    }
+
+    /**
+     * Sets the columns that will be used for the info annotation field.
+     * Column names should be of the form bindingName.columnName (eg. dbsnp.avHet).
+     *
+     * @param columns An array of strings where each string is a comma-separated list
+     * of columnNames (eg ["dbsnp.avHet,dbsnp.valid", "file2.col1,file3.col1"] ).
+     */
+    public void setRequestedColumns(String[] columns) {
+        if(columns == null) {
+            throw new IllegalArgumentException("columns arg is null. Please check the -s command-line arg.");
+        }
+
+        //System.err.println("COLUMNS:  "+Arrays.asList(columns).toString());
+
+        this.requestedColumnsMap = GenomicAnnotation.parseColumnsArg(columns);
+    }
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
 import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
@ -224,7 +225,8 @@ public class UnifiedGenotyperEngine {
                // first off, we want to use the *unfiltered* context for the annotations
                stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup());

-                call.vc = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc);
+                Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc);
+                call.vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
            }
        }

--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java
@ -0,0 +1,163 @@
+package org.broadinstitute.sting.playground.gatk.walkers.annotator;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
+import org.broadinstitute.sting.gatk.refdata.TabularROD;
+import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
+
+/**
+ * TODO comment
+ */
+public class GenomicAnnotation implements InfoFieldAnnotation {
+
+
+    /**
+     * For each ROD (aka. record) which overlaps the current locus, generates a
+     * set of annotations of the form:
+     *
+     * thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A),
+     *
+     * These annotations are stored in a Map<String, String>.
+     *
+     * Since a single input file can have multiple records that overlap the current
+     * locus (eg. dbSNP can have multiple entries for the same location), a different
+     * Map<String, String> is created for each of these, resulting in a List<Map<String, String>>
+     * for each input file.
+     *
+     * The return value of this method is a Map of the form:
+     *     rodName1 -> List<Map<String, String>>
+     *     rodName2 -> List<Map<String, String>>
+     *     rodName3 -> List<Map<String, String>>
+     *     ...
+     *
+     * The List values are guaranteed to have size > 0, and in most cases will have size == 1.
+     */
+    public Map<String, Object> annotate(final RefMetaDataTracker tracker,
+            final ReferenceContext ref,
+            final Map<String, StratifiedAlignmentContext> stratifiedContexts,
+            final VariantContext vc) {
+
+        final Map<String, Object> annotations = new HashMap<String, Object>();
+        for(final GATKFeature gatkFeature : tracker.getAllRods())
+        {
+            final ReferenceOrderedDatum rod = (ReferenceOrderedDatum) gatkFeature.getUnderlyingObject();
+            if(! (rod instanceof TabularROD) ) {
+                continue; //GenericAnnotation only works with TabularRODs so that it can select individual columns
+            }
+
+            final String name = rod.getName();
+            if(name.equals("variant") || name.equals("interval")) {
+                continue;
+            }
+
+
+            final Map<String, String> annotationsForRecord = convertRecordToAnnotations( (TabularROD) rod );
+
+            List<Map<String, String>> listOfMatchingRecords = (List<Map<String, String>>) annotations.get(name);
+            if(listOfMatchingRecords == null) {
+                listOfMatchingRecords = new LinkedList<Map<String,String>>();
+                listOfMatchingRecords.add( annotationsForRecord );
+                annotations.put(name, listOfMatchingRecords);
+            } else {
+                listOfMatchingRecords.add( annotationsForRecord );
+            }
+        }
+
+        return annotations;
+    }
+
+
+
+
+    /**
+     * Converts the ROD to a set of key-value pairs of the form:
+     *   thisRodName.fieldName1=fieldValue, thisRodName.fieldName1=fieldValue
+     *   (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A)
+     *
+     * @param rod A TabularROD corresponding to one record in one input file.
+     *
+     * @return The map of column-name -> value pairs.
+     */
+    private Map<String, String> convertRecordToAnnotations( final TabularROD rod ) {
+        final String rodName = rod.getName(); //aka the rod binding
+
+        final Map<String, String> result = new HashMap<String, String>();
+        for(final Entry<String, String> entry : rod.entrySet()) {
+            result.put( generateInfoFieldKey(rodName, entry.getKey()), entry.getValue());
+        }
+
+        return result;
+    }
+
+    public static String generateInfoFieldKey(String rodBindingName, String columnName ) {
+        return rodBindingName + "." + columnName;
+    }
+
+
+    /**
+     * Parses the columns arg and returns a Map of columns hashed by their binding name.
+     * For example:
+     *   The command line:
+     *      -s dbSnp.valid,dbsnp.avHet -s refGene.txStart,refGene.txEnd
+     *
+     *   will be passed to this method as:
+     *       ["dbSnp.valid,dbsnp.avHet", "refGene.txStart,refGene.txEnd"]
+     *
+     *   resulting in a return value of:
+     *      {
+     *       "dbSnp" -> "dbSnp.valid" ,
+     *       "dbSnp" -> "dbsnp.avHet" ,
+     *       "refGene" -> "refGene.txStart",
+     *       "refGene" -> "refGene.txEnd"
+     *      }
+     *
+     * @param columnsArg The -s command line arg value.
+     *
+     * @return Map representing a parsed version of this arg - see above.
+     */
+    public static Map<String, Set<String>> parseColumnsArg(String[] columnsArg) {
+        Map<String, Set<String>> result = new HashMap<String, Set<String>>();
+
+        for(String s : columnsArg) {
+            for(String columnSpecifier : s.split(",") ) {
+                String[] rodNameColumnName = columnSpecifier.split("\\.");
+                if(rodNameColumnName.length != 2) {
+                    throw new IllegalArgumentException("The following column specifier in the -s arg is invalid: [" + columnSpecifier + "]. It must be of the form 'bindingName.columnName'.");
+                }
+                String rodName = rodNameColumnName[0];
+                //String columnName = rodNameColumnName[1];
+
+                Set<String> requestedColumns = result.get(rodName);
+                if(requestedColumns == null) {
+                    requestedColumns = new HashSet<String>();
+                    result.put(rodName, requestedColumns);
+                }
+                requestedColumns.add(columnSpecifier);
+            }
+        }
+
+        return result;
+    }
+
+    public VCFInfoHeaderLine getDescription() {
+        return new VCFInfoHeaderLine("GenericAnnotation", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "For each variant in the 'variants' ROD, finds all entries in the other -B files that overlap the variant's position. ");
+    }
+
+    public String getKeyName() {
+        return "GenericAnnotation";
+    }
+
+}
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java
@ -0,0 +1,174 @@
+    package org.broadinstitute.sting.playground.gatk.walkers.annotator;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
+import org.broadinstitute.sting.gatk.walkers.Allows;
+import org.broadinstitute.sting.gatk.walkers.By;
+import org.broadinstitute.sting.gatk.walkers.DataSource;
+import org.broadinstitute.sting.gatk.walkers.Reference;
+import org.broadinstitute.sting.gatk.walkers.RodWalker;
+import org.broadinstitute.sting.gatk.walkers.Window;
+import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.Pair;
+import org.broadinstitute.sting.utils.SampleUtils;
+import org.broadinstitute.sting.utils.cmdLine.Argument;
+import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader;
+import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
+import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
+import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
+
+
+/**
+ * Annotates variant calls with context information.  Users can specify which of the available annotations to use.
+ */
+//@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class))
+@Allows(value={DataSource.READS, DataSource.REFERENCE})
+@Reference(window=@Window(start=-50,stop=50))
+@By(DataSource.REFERENCE)
+public class GenomicAnnotator extends RodWalker<Integer, Integer> {
+    @Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)
+    protected File VCF_OUT;
+    @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false)
+    protected String sampleName = null;
+
+    @Argument(fullName="select", shortName="s", doc="Select which columns to use for each ROD file. Column #s are 0-based. (eg. The following will select columns 5,6,2 from file1.txt and columns 3,7 from file2.txt: -B my-rod,table,/path/file1.txt -B my-rod2,table,/path/file2.txt -S my-rod={5,6,2} -S my-rod2={3,7})", required=false)
+    protected String[] COLUMNS = {};
+
+    @Argument(fullName="explode", shortName="exp", doc="If more than one record from the same file matches a particular locus, create multiple entries in the ouptut file - one for each match. WARNING: This could lead to combinatorial explotion if more than one file have more than one match at a particular locus.", required=false)
+    protected Boolean EXPLODE = false;
+
+    private VCFWriter vcfWriter;
+
+    private HashMap<String, String> nonVCFsampleName = new HashMap<String, String>();
+
+    private VariantAnnotatorEngine engine;
+
+
+    /**
+     * Prepare the output file and the list of available features.
+     */
+    public void initialize() {
+
+        // get the list of all sample names from the various VCF input rods
+        TreeSet<String> samples = new TreeSet<String>();
+        SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
+
+        // add the non-VCF sample from the command-line, if applicable
+        if ( sampleName != null  ) {
+            nonVCFsampleName.put(sampleName.toUpperCase(), "variant");
+            samples.add(sampleName.toUpperCase());
+        }
+
+        // if there are no valid samples, warn the user
+        if ( samples.size() == 0 ) {
+            logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");
+        }
+
+        engine = new VariantAnnotatorEngine(getToolkit(), new String[] { }, new String[] { "GenomicAnnotation" });
+
+        engine.setExplode( Boolean.TRUE.equals( EXPLODE ) );
+        engine.setRequestedColumns(COLUMNS);
+
+        // setup the header fields
+        Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
+        hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
+        hInfo.add(new VCFHeaderLine("source", "Annotator"));
+        hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
+        hInfo.addAll(engine.getVCFAnnotationDescriptions());
+
+        vcfWriter = new VCFWriter(VCF_OUT);
+        VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
+        vcfWriter.writeHeader(vcfHeader);
+    }
+
+    /**
+     * Initialize the number of loci processed to zero.
+     *
+     * @return 0
+     */
+    public Integer reduceInit() { return 0; }
+
+
+    /**
+     * We want reads that span deletions
+     *
+     * @return true
+     */
+    public boolean includeReadsWithDeletionAtLoci() { return true; }
+
+    /**
+     * For each site of interest, annotate based on the requested annotation types
+     *
+     * @param tracker  the meta-data tracker
+     * @param ref      the reference base
+     * @param context  the context for the given locus
+     * @return 1 if the locus was successfully processed, 0 if otherwise
+     */
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        if ( tracker == null )
+            return 0;
+
+
+        List<Object> rods = tracker.getReferenceMetaData("variant");
+        // ignore places where we don't have a variant
+        if ( rods.size() == 0 )
+            return 0;
+
+        Object variant = rods.get(0);
+        VariantContext vc = VariantContextAdaptors.toVariantContext("variant", variant);
+        if ( vc == null )
+            return 0;
+
+        // if the reference base is not ambiguous, we can annotate
+        Collection<VariantContext> annotatedVCs = null;
+        if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
+            Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
+            if ( stratifiedContexts != null ) {
+                annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
+            }
+        }
+
+        for(VariantContext annotatedVC : annotatedVCs) {
+            vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()));
+        }
+
+        return 1;
+    }
+
+    /**
+     * Increment the number of loci processed.
+     *
+     * @param value result of the map.
+     * @param sum   accumulator for the reduce.
+     * @return the new number of loci processed.
+     */
+    public Integer reduce(Integer value, Integer sum) {
+        return sum + value;
+    }
+
+    /**
+     * Tell the user the number of loci processed and close out the new variants file.
+     *
+     * @param result  the number of loci seen.
+     */
+    public void onTraversalDone(Integer result) {
+        out.printf("Processed %d loci.\n", result);
+
+        vcfWriter.close();
+    }
+}
+