First checkin of GenomicAnnotator which annotates an input VCF file by pulling data in a generic way from an arbitrary set of TabularRODs.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3114 348d0f76-0448-11de-a6fe-93d51630548a
2010-04-02 17:49:42 +00:00 · 2010-04-02 17:49:42 +00:00 · 6b7b07f178
parent 699a0ea9d1
commit 6b7b07f178
5 changed files with 556 additions and 13 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@ -123,7 +123,7 @@ public class VariantAnnotator extends LocusWalker<Integer, Integer> {
     */
    public Integer reduceInit() { return 0; }
-    
+
    /**
     * We want reads that span deletions
     *
@ -154,16 +154,20 @@ public class VariantAnnotator extends LocusWalker<Integer, Integer> {
            return 0;
        // if the reference base is not ambiguous, we can annotate
        Collection<VariantContext> annotatedVCs = Arrays.asList( new VariantContext[] { vc } );
        if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
            Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
            if ( stratifiedContexts != null ) {
-                vc = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
+                annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
            }
        }
        if ( variant instanceof RodVCF ) {
            for(VariantContext annotatedVC : annotatedVCs ) {
                    vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()));
            }
        }
        if ( variant instanceof RodVCF )
            vcfWriter.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase()));
        return 1;
    }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
 import org.broadinstitute.sting.playground.gatk.walkers.annotator.GenomicAnnotation;
 import org.broadinstitute.sting.utils.PackageUtils;
 import org.broadinstitute.sting.utils.StingException;
 import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
@ -33,6 +34,12 @@ public class VariantAnnotatorEngine {
    // how about hapmap3?
    private boolean annotateHapmap3 = false;
    // command-line option used for GenomicAnnotation.
    private Map<String, Set<String>> requestedColumnsMap;
    // command-line option used for GenomicAnnotation.
    private boolean explode;
    // use this constructor if you want all possible annotations
    public VariantAnnotatorEngine(GenomeAnalysisEngine engine) {
@ -143,7 +150,7 @@ public class VariantAnnotatorEngine {
        return descriptions;
    }
-    public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
+    public Collection<VariantContext> annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
        Map<String, Object> infoAnnotations = new HashMap<String, Object>(vc.getAttributes());
@ -166,12 +173,50 @@ public class VariantAnnotatorEngine {
            infoAnnotations.put(VCFRecord.HAPMAP3_KEY, hapmap3.size() == 0 ? "0" : "1");
        }
-        for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) {
+
-            Map<String, Object> result = annotation.annotate(tracker, ref, stratifiedContexts, vc);
+        //Process the info field
-            if ( result != null )
+        List<Map<String, Object>> infoAnnotationOutputsList = new LinkedList<Map<String, Object>>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file
-                infoAnnotations.putAll(result);
+        infoAnnotationOutputsList.add(new HashMap<String, Object>(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file gains 1 line.
        //go through all the requested info annotationTypes
        for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations )
        {
            Map<String, Object> annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc);
            if ( annotationsFromCurrentType == null ) {
                continue;
            }
            if(annotationType instanceof GenomicAnnotation)
            {
                //go through the annotations returned by GenericAnnotation for each -B input file.
                for( Map.Entry<String, Object> annotationsFromInputFile : annotationsFromCurrentType.entrySet() )
                {
                    final String inputFileBindingName = annotationsFromInputFile.getKey();
                    final List<Map<String, String>> matchingRecords = (List<Map<String, String>>) annotationsFromInputFile.getValue();
                    if( matchingRecords.size() > 1 && explode)
                    {
                        //More than one record matched in this file. After this, infoAnnotationOutputsList.size() will be infoAnnotationOutputsList.size()*matchingRecords.size().
                        infoAnnotationOutputsList = explodeInfoAnnotationOutputsList( infoAnnotationOutputsList, matchingRecords, inputFileBindingName);
                    }
                    else
                    {
                        //This doesn't change infoAnnotationOutputsList.size(). If more than one record matched, their annotations will
                        //all be added to the same output line, with keys disambiguated by appending _i .
                        addToExistingAnnotationOutputs( infoAnnotationOutputsList, matchingRecords, inputFileBindingName);
                    }
                }
            }
            else
            {
                //add the annotations to each output line.
                for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
                    infoAnnotationOutput.putAll(annotationsFromCurrentType);
                }
            }
        }
        //Process genotypes
        Map<String, Genotype> genotypes;
        if ( requestedGenotypeAnnotations.size() == 0 ) {
            genotypes = vc.getGenotypes();
@ -195,6 +240,161 @@ public class VariantAnnotatorEngine {
            }
        }
-        return new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.getFilters(), infoAnnotations);
+      //Create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList
        Collection<VariantContext> returnValue = new LinkedList<VariantContext>();
        for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
            returnValue.add( new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.getFilters(), infoAnnotationOutput) );
        }
        return returnValue;
    }
-}
+
    /**
     * Implements non-explode mode, where the output lines keep a one-to-one relationship
     * with the input variants: the annotations of every matching record are merged into
     * each existing output line instead of creating additional lines.
     *
     * When more than one record matched, every key gets a "_i" suffix, where i is the
     * 1-based position of the record within matchingRecords, so that values coming from
     * different records do not overwrite each other (eg. dbSNP.avHet_1 and dbSNP.avHet_2).
     * When exactly one record matched, the keys are left unchanged.
     *
     * @param infoAnnotationOutputsList the current output lines; each map is modified in place.
     * @param matchingRecords the records from one input file that matched the current locus.
     * @param bindingName the binding name of that input file, used to select the requested columns.
     */
    private void addToExistingAnnotationOutputs(
            final List<Map<String, Object>> infoAnnotationOutputsList,
            final List<Map<String, String>> matchingRecords,
            final String bindingName) {
        final boolean renameKeys = matchingRecords.size() > 1;

        // Iterate rather than call get(i): the caller may hand in a linked list.
        int recordNumber = 0;
        for(final Map<String, String> matchingRecord : matchingRecords) {
            recordNumber++; //1-based, so the first record produces the suffix _1

            //use only those columns that the user specifically requested.
            Map<String, String> annotationsForRecord = selectColumnsFromRecord(bindingName, matchingRecord);

            if(renameKeys) {
                //Rename keys to avoid naming conflicts (eg. if you have multiple dbsnp matches,
                // dbSNP.avHet=value1 from record 1 and dbSNP.avHet=value2 from record 2 will become
                // dbSNP.avHet_1=value1 and dbSNP.avHet_2=value2 )
                final Map<String, String> annotationsForRecordWithRenamedKeys = new HashMap<String, String>();
                for(final Map.Entry<String, String> annotation : annotationsForRecord.entrySet()) {
                    annotationsForRecordWithRenamedKeys.put(annotation.getKey() + "_" + recordNumber, annotation.getValue());
                }
                annotationsForRecord = annotationsForRecordWithRenamedKeys;
            }

            //Add the annotations from this record to each output line.
            for(final Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
                infoAnnotationOutput.putAll(annotationsForRecord);
            }
        }
    }
    /**
     * Implements "explode" mode. Every element of infoAnnotationOutputsList ends up as
     * its own line in the output VCF file; this method builds the replacement list that
     * results from crossing those lines with the given matching records.
     *
     * For each record (in order), every existing line is copied and the record's selected
     * columns are added to the copy. The input maps are never modified, and the returned
     * list has size:
     *
     * infoAnnotationOutputsList.size() * matchingRecords.size()
     *
     * @param infoAnnotationOutputsList the current output lines (left untouched).
     * @param matchingRecords the records from one input file that matched the current locus.
     * @param bindingName the binding name of that input file, used to select the requested columns.
     * @return the new list of output lines.
     */
    private List<Map<String, Object>> explodeInfoAnnotationOutputsList(
            final List<Map<String, Object>> infoAnnotationOutputsList,
            final List<Map<String, String>> matchingRecords,
            final String bindingName) {
        final List<Map<String, Object>> explodedLines = new LinkedList<Map<String, Object>>();

        for(final Map<String, String> matchingRecord : matchingRecords) {
            //Restrict the record to the columns the user asked for.
            final Map<String, String> selectedColumns = selectColumnsFromRecord(bindingName, matchingRecord);

            //One fresh copy of every existing line per record, extended with that record's columns.
            for(final Map<String, Object> existingLine : infoAnnotationOutputsList) {
                final Map<String, Object> lineForRecord = new HashMap<String, Object>(existingLine);
                lineForRecord.putAll(selectedColumns);
                explodedLines.add(lineForRecord);
            }
        }

        return explodedLines;
    }
    /**
     * Filters one record down to the columns the user requested (via the -s arg) for
     * the given binding. If no columns were requested for this binding at all, the
     * record is returned as-is (the same Map instance, not a copy).
     *
     * @param bindingName The binding name for a particular ROD input file.
     * @param annotationsForRecord The column_name -> value pairs for a particular record from the given input file.
     *
     * @return Either annotationsForRecord itself, or a new Map holding only the requested columns.
     * @throws StingException if columns were requested for this binding but none of them is present in the record.
     */
    private Map<String, String> selectColumnsFromRecord( String bindingName, Map<String, String> annotationsForRecord) {
        final boolean selectionExists = requestedColumnsMap != null && requestedColumnsMap.containsKey(bindingName);
        if(!selectionExists) {
            return annotationsForRecord;
        }

        final Set<String> wantedColumns = requestedColumnsMap.get(bindingName);
        final Map<String, String> keptAnnotations = new HashMap<String, String>();
        for(final Map.Entry<String, String> column : annotationsForRecord.entrySet() ) {
            if(!wantedColumns.contains(column.getKey())) {
                continue; //not requested by the user
            }
            keptAnnotations.put(column.getKey(), column.getValue());
        }

        //An empty result means the -s names did not match anything in this record.
        if(keptAnnotations.isEmpty()) {
            throw new StingException("Invalid -s argument for the '" + bindingName + "' input file. " +
                    "It caused all columns in the file to be rejected. Please check to make sure the -s column " +
                    "names match the column names in the '" + bindingName + "' file's HEADER line.");
        }

        return keptAnnotations;
    }
    /**
     * Chooses what happens when several records of one ROD file overlap a single locus.
     *
     * With explode == true the output is one-to-many: a locus of the input VCF file may
     * produce several entries in the output VCF file, one per matching record.
     * With explode == false the output stays one-to-one: all matching records are merged
     * into the single info field, and their keys are told apart by an appended _i
     * (where 'i' is a record counter).
     *
     * @param explode true to emit one output line per matching record, false to collapse them.
     */
    public void setExplode(boolean explode) {
        this.explode = explode;
    }
    /**
     * Sets the columns that will be used for the info annotation field, replacing any
     * previously requested columns. Column names should be of the form
     * bindingName.columnName (eg. dbsnp.avHet).
     *
     * @param columns An array of strings where each string is a comma-separated list
     * of columnNames (eg ["dbsnp.avHet,dbsnp.valid", "file2.col1,file3.col1"] ).
     * @throws IllegalArgumentException if columns is null; parsing of the individual
     * entries is delegated to GenomicAnnotation.parseColumnsArg.
     */
    public void setRequestedColumns(String[] columns) {
        if(columns == null) {
            throw new IllegalArgumentException("columns arg is null. Please check the -s command-line arg.");
        }

        requestedColumnsMap = GenomicAnnotation.parseColumnsArg(columns);
    }
 }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
 import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
@ -224,7 +225,8 @@ public class UnifiedGenotyperEngine {
                // first off, we want to use the *unfiltered* context for the annotations
                stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup());
-                call.vc = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc);
+                Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc);
                call.vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
            }
        }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotation.java
@ -0,0 +1,163 @@
 package org.broadinstitute.sting.playground.gatk.walkers.annotator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.Map.Entry;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
 import org.broadinstitute.sting.gatk.refdata.TabularROD;
 import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
 import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
 /**
 * An info-field annotation that exposes the columns of every TabularROD record
 * overlapping the current locus as key-value pairs of the form bindingName.columnName.
 *
 * Unlike other InfoFieldAnnotations, the Map returned by annotate() is not a flat
 * key -> value map: it maps each ROD binding name to the list of records that matched
 * (see annotate() for the exact shape).
 */
 public class GenomicAnnotation implements InfoFieldAnnotation {

    /**
     * For each ROD (aka. record) which overlaps the current locus, generates a
     * set of annotations of the form:
     *
     * thisRodName.fieldName1=fieldValue, thisRodName.fieldName2=fieldValue (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A),
     *
     * These annotations are stored in a Map<String, String>.
     *
     * Since a single input file can have multiple records that overlap the current
     * locus (eg. dbSNP can have multiple entries for the same location), a different
     * Map<String, String> is created for each of these, resulting in a List<Map<String, String>>
     * for each input file.
     *
     * The return value of this method is a Map of the form:
     *     rodName1 -> List<Map<String, String>>
     *     rodName2 -> List<Map<String, String>>
     *     rodName3 -> List<Map<String, String>>
     *     ...
     *
     * The List values are guaranteed to have size > 0, and in most cases will have size == 1.
     * Features that are not TabularRODs, and RODs bound to the names "variant" or "interval",
     * are skipped.
     *
     * @param tracker the meta-data tracker that supplies the RODs at the current locus.
     * @param ref not used by this annotation.
     * @param stratifiedContexts not used by this annotation.
     * @param vc not used by this annotation.
     * @return the map described above; empty (never null) if no TabularROD matched.
     */
    public Map<String, Object> annotate(final RefMetaDataTracker tracker,
            final ReferenceContext ref,
            final Map<String, StratifiedAlignmentContext> stratifiedContexts,
            final VariantContext vc) {
        //Group the records in a fully typed map so that no unchecked cast is needed.
        final Map<String, List<Map<String, String>>> recordsByBinding = new HashMap<String, List<Map<String, String>>>();

        for(final GATKFeature gatkFeature : tracker.getAllRods())
        {
            //Check the type BEFORE casting, so that features which are not RODs are skipped instead of
            //triggering a ClassCastException.
            final Object underlyingObject = gatkFeature.getUnderlyingObject();
            if(! (underlyingObject instanceof TabularROD) ) {
                continue; //This annotation only works with TabularRODs so that it can select individual columns
            }
            final TabularROD rod = (TabularROD) underlyingObject;

            final String name = rod.getName();
            if(name.equals("variant") || name.equals("interval")) {
                continue; //presumably the input being annotated and the interval list, not annotation sources
            }

            final Map<String, String> annotationsForRecord = convertRecordToAnnotations( rod );

            List<Map<String, String>> listOfMatchingRecords = recordsByBinding.get(name);
            if(listOfMatchingRecords == null) {
                listOfMatchingRecords = new LinkedList<Map<String,String>>();
                recordsByBinding.put(name, listOfMatchingRecords);
            }
            listOfMatchingRecords.add( annotationsForRecord );
        }

        return new HashMap<String, Object>(recordsByBinding);
    }

    /**
     * Converts the ROD to a set of key-value pairs of the form:
     *   thisRodName.fieldName1=fieldValue, thisRodName.fieldName2=fieldValue
     *   (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A)
     *
     * @param rod A TabularROD corresponding to one record in one input file.
     *
     * @return The map of column-name -> value pairs.
     */
    private Map<String, String> convertRecordToAnnotations( final TabularROD rod ) {
        final String rodName = rod.getName(); //aka the rod binding

        final Map<String, String> result = new HashMap<String, String>();
        for(final Entry<String, String> entry : rod.entrySet()) {
            result.put( generateInfoFieldKey(rodName, entry.getKey()), entry.getValue());
        }

        return result;
    }

    /**
     * Builds the info-field key for one column of one ROD binding.
     *
     * @param rodBindingName the binding name of the ROD input file.
     * @param columnName the name of the column within that file.
     * @return the key, of the form rodBindingName.columnName
     */
    public static String generateInfoFieldKey(String rodBindingName, String columnName ) {
        return rodBindingName + "." + columnName;
    }

    /**
     * Parses the columns arg and returns the requested columns grouped by their binding name.
     * For example:
     *   The command line:
     *      -s dbSnp.valid,dbSnp.avHet -s refGene.txStart,refGene.txEnd
     *
     *   will be passed to this method as:
     *       ["dbSnp.valid,dbSnp.avHet", "refGene.txStart,refGene.txEnd"]
     *
     *   resulting in a return value of:
     *      {
     *       "dbSnp"   -> { "dbSnp.valid", "dbSnp.avHet" } ,
     *       "refGene" -> { "refGene.txStart", "refGene.txEnd" }
     *      }
     *
     * Note that the Set values hold the full bindingName.columnName specifiers (the same
     * form that generateInfoFieldKey produces), and that binding names are case-sensitive.
     * Whitespace around each specifier is ignored.
     *
     * @param columnsArg The -s command line arg value.
     *
     * @return Map representing a parsed version of this arg - see above.
     * @throws IllegalArgumentException if a specifier is not of the form bindingName.columnName.
     */
    public static Map<String, Set<String>> parseColumnsArg(String[] columnsArg) {
        Map<String, Set<String>> result = new HashMap<String, Set<String>>();

        for(String s : columnsArg) {
            for(String rawColumnSpecifier : s.split(",") ) {
                //Tolerate spaces after the commas (eg. "dbsnp.avHet, dbsnp.valid").
                final String columnSpecifier = rawColumnSpecifier.trim();

                String[] rodNameColumnName = columnSpecifier.split("\\.");
                //split() keeps a leading empty string, so ".col" must be rejected explicitly.
                if(rodNameColumnName.length != 2 || rodNameColumnName[0].length() == 0 || rodNameColumnName[1].length() == 0) {
                    throw new IllegalArgumentException("The following column specifier in the -s arg is invalid: [" + columnSpecifier + "]. It must be of the form 'bindingName.columnName'.");
                }
                String rodName = rodNameColumnName[0];

                Set<String> requestedColumns = result.get(rodName);
                if(requestedColumns == null) {
                    requestedColumns = new HashSet<String>();
                    result.put(rodName, requestedColumns);
                }
                requestedColumns.add(columnSpecifier);
            }
        }

        return result;
    }

    /**
     * @return the VCF INFO header line that describes this annotation.
     */
    public VCFInfoHeaderLine getDescription() {
        // NOTE(review): the values produced by annotate() end up under bindingName.columnName keys,
        // not under a single Integer-valued "GenericAnnotation" key - confirm this header line is intended.
        return new VCFInfoHeaderLine("GenericAnnotation", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "For each variant in the 'variants' ROD, finds all entries in the other -B files that overlap the variant's position. ");
    }

    /**
     * @return the key name of this annotation.
     */
    public String getKeyName() {
        return "GenericAnnotation";
    }
 }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java
@ -0,0 +1,174 @@
    package org.broadinstitute.sting.playground.gatk.walkers.annotator;
 import java.io.File;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
 import org.broadinstitute.sting.gatk.walkers.Allows;
 import org.broadinstitute.sting.gatk.walkers.By;
 import org.broadinstitute.sting.gatk.walkers.DataSource;
 import org.broadinstitute.sting.gatk.walkers.Reference;
 import org.broadinstitute.sting.gatk.walkers.RodWalker;
 import org.broadinstitute.sting.gatk.walkers.Window;
 import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.Pair;
 import org.broadinstitute.sting.utils.SampleUtils;
 import org.broadinstitute.sting.utils.cmdLine.Argument;
 import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader;
 import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
 import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
 import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
 /**
 * Annotates the variants bound to the "variant" ROD with columns pulled from the other
 * ROD inputs, and writes the annotated variants to a VCF file. The annotation itself is
 * delegated to a VariantAnnotatorEngine that is set up with the "GenomicAnnotation" annotation.
 */
 //@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class))
@Allows(value={DataSource.READS, DataSource.REFERENCE})
@Reference(window=@Window(start=-50,stop=50))
@By(DataSource.REFERENCE)
 public class GenomicAnnotator extends RodWalker<Integer, Integer> {
    @Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)
    protected File VCF_OUT;

    @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false)
    protected String sampleName = null;

    // The help text describes the bindingName.columnName syntax that GenomicAnnotation.parseColumnsArg actually accepts.
    @Argument(fullName="select", shortName="s", doc="Select which columns to use for each ROD file. Each column is given as bindingName.columnName, several columns can be separated by commas, and the argument can be repeated. (eg. The following will select columns avHet,valid from file1.txt and column txStart from file2.txt: -B my-rod,table,/path/file1.txt -B my-rod2,table,/path/file2.txt -s my-rod.avHet,my-rod.valid -s my-rod2.txStart). If no columns are selected for a ROD file, all of its columns are used.", required=false)
    protected String[] COLUMNS = {};

    @Argument(fullName="explode", shortName="exp", doc="If more than one record from the same file matches a particular locus, create multiple entries in the ouptut file - one for each match. WARNING: This could lead to combinatorial explotion if more than one file have more than one match at a particular locus.", required=false)
    protected Boolean EXPLODE = false;

    private VCFWriter vcfWriter;

    private HashMap<String, String> nonVCFsampleName = new HashMap<String, String>();

    private VariantAnnotatorEngine engine;

    /**
     * Prepare the output file and the list of available features.
     */
    public void initialize() {
        // get the list of all sample names from the various VCF input rods
        TreeSet<String> samples = new TreeSet<String>();
        SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());

        // add the non-VCF sample from the command-line, if applicable
        if ( sampleName != null  ) {
            nonVCFsampleName.put(sampleName.toUpperCase(), "variant");
            samples.add(sampleName.toUpperCase());
        }

        // if there are no valid samples, warn the user
        if ( samples.size() == 0 ) {
            logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");
        }

        engine = new VariantAnnotatorEngine(getToolkit(), new String[] { }, new String[] { "GenomicAnnotation" });
        engine.setExplode( Boolean.TRUE.equals( EXPLODE ) ); // null-safe: EXPLODE is a boxed Boolean
        engine.setRequestedColumns(COLUMNS);

        // setup the header fields
        Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
        hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
        hInfo.add(new VCFHeaderLine("source", "Annotator"));
        hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
        hInfo.addAll(engine.getVCFAnnotationDescriptions());

        vcfWriter = new VCFWriter(VCF_OUT);
        VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
        vcfWriter.writeHeader(vcfHeader);
    }

    /**
     * Initialize the number of loci processed to zero.
     *
     * @return 0
     */
    public Integer reduceInit() { return 0; }

    /**
     * We want reads that span deletions
     *
     * @return true
     */
    public boolean includeReadsWithDeletionAtLoci() { return true; }

    /**
     * For each site of interest, annotate based on the requested annotation types.
     * Only the first object bound to "variant" at the locus is used. If the variant
     * cannot be annotated (ambiguous reference base, or no stratified contexts), it is
     * written to the output unannotated.
     *
     * @param tracker  the meta-data tracker
     * @param ref      the reference base
     * @param context  the context for the given locus
     * @return 1 if the locus was successfully processed, 0 if otherwise
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null )
            return 0;

        List<Object> rods = tracker.getReferenceMetaData("variant");
        // ignore places where we don't have a variant
        if ( rods.size() == 0 )
            return 0;

        Object variant = rods.get(0);
        VariantContext vc = VariantContextAdaptors.toVariantContext("variant", variant);
        if ( vc == null )
            return 0;

        // if the reference base is not ambiguous, we can annotate
        Collection<VariantContext> annotatedVCs = null;
        if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
            Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
            if ( stratifiedContexts != null ) {
                annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
            }
        }

        if ( annotatedVCs == null ) {
            // Annotation was skipped: pass the variant through unchanged rather than
            // iterating a null collection (which would throw a NullPointerException).
            vcfWriter.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase()));
            return 1;
        }

        // one output record per annotated context (more than one only in explode mode)
        for(VariantContext annotatedVC : annotatedVCs) {
            vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()));
        }

        return 1;
    }

    /**
     * Increment the number of loci processed.
     *
     * @param value result of the map.
     * @param sum   accumulator for the reduce.
     * @return the new number of loci processed.
     */
    public Integer reduce(Integer value, Integer sum) {
        return sum + value;
    }

    /**
     * Tell the user the number of loci processed and close out the new variants file.
     *
     * @param result  the number of loci seen.
     */
    public void onTraversalDone(Integer result) {
        out.printf("Processed %d loci.\n", result);
        vcfWriter.close();
    }
 }