Preliminary commit of new VCFCombine, soon to be called CombineVariants (next commit), that supports merging any number of VCF files via a general VC merge routine that supports prioritization and merging of samples! It's now possible to merge the pilot1/2/3 call sets into a single (monster) VCF, taking genotypes from pilot2, then pilot3, then pilot1 as needed.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3690 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a46e22ed13
commit
b8d6a95e7a
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broad.tribble.vcf;
|
||||
|
||||
/**
 * A codec that carries a mutable, human-readable name.  Track builders use
 * this to stamp a codec with the name of the source track so that the
 * records it emits can be attributed back to their input file.
 */
public interface NameAwareCodec {
    /** @return the name currently assigned to this codec */
    public String getName();

    /** @param name the name to assign to this codec */
    public void setName(String name);
}
|
||||
|
|
@ -12,6 +12,11 @@ public class MutableGenotype extends Genotype {
|
|||
super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.genotypesArePhased());
|
||||
}
|
||||
|
||||
    /**
     * Re-labeling copy constructor: takes every field from {@code parent}
     * (alleles, error, filters, attributes, phasing) but substitutes
     * {@code sampleName} as the sample name.  Used when merging tracks whose
     * sample names must be uniquified.
     */
    public MutableGenotype(String sampleName, Genotype parent) {
        super(sampleName, parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.genotypesArePhased());
    }


    /**
     * Full constructor; forwards all fields unchanged to the Genotype
     * superclass.
     */
    public MutableGenotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {
        super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased);
    }
|
||||
|
|
|
|||
|
|
@ -27,6 +27,8 @@ import java.util.*;
|
|||
import org.apache.commons.jexl2.*;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.genotype.HardyWeinbergCalculation;
|
||||
import org.broad.tribble.vcf.VCFRecord;
|
||||
|
||||
|
|
@ -158,34 +160,59 @@ public class VariantContextUtils {
|
|||
return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount());
|
||||
}
|
||||
|
||||
    /**
     * Options controlling how {@code simpleMerge} combines VariantContexts.
     * UNION_VARIANTS / INTERSECT_VARIANTS select which records survive the
     * merge; the *_GENOTYPES values select how genotypes for overlapping
     * samples are resolved (uniquified names, priority order, or input order).
     */
    public enum MergeType {
        UNION_VARIANTS, INTERSECT_VARIANTS, UNIQUIFY_GENOTYPES, PRIORITIZE_GENOTYPES, UNSORTED_GENOTYPES
    }
|
||||
|
||||
public static VariantContext simpleMerge(Set<VariantContext> VCs) {
|
||||
if ( VCs == null || VCs.size() == 0 )
|
||||
    /**
     * Convenience overload: merges the given VariantContexts with
     * INTERSECT_VARIANTS + UNSORTED_GENOTYPES semantics, no priority list,
     * and no origin annotation.
     *
     * @param unsortedVCs the contexts to merge, in any order
     * @return the merged context (null for a null/empty input, per the
     *         four-argument overload)
     */
    public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs) {
        return simpleMerge(unsortedVCs, null, EnumSet.of(MergeType.INTERSECT_VARIANTS, MergeType.UNSORTED_GENOTYPES), false);
    }
|
||||
|
||||
|
||||
/**
|
||||
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
|
||||
* If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
|
||||
* the sample name
|
||||
*
|
||||
* @param unsortedVCs
|
||||
* @param priorityListOfVCs
|
||||
* @param mergeOptions
|
||||
* @return
|
||||
*/
|
||||
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions, boolean annotateOrigin ) {
|
||||
if ( unsortedVCs == null || unsortedVCs.size() == 0 )
|
||||
return null;
|
||||
|
||||
Iterator<VariantContext> iter = VCs.iterator();
|
||||
if ( annotateOrigin && priorityListOfVCs == null )
|
||||
throw new IllegalArgumentException("Cannot merge calls and annotate their origins with a complete priority list of VariantContexts");
|
||||
|
||||
List<VariantContext> VCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, mergeOptions);
|
||||
|
||||
// establish the baseline info from the first VC
|
||||
VariantContext first = iter.next();
|
||||
VariantContext first = VCs.get(0);
|
||||
String name = first.getName();
|
||||
GenomeLoc loc = first.getLocation();
|
||||
Set<Allele> alleles = new HashSet<Allele>(first.getAlleles());
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(first.getGenotypes());
|
||||
double negLog10PError = first.isVariant() ? first.getNegLog10PError() : -1;
|
||||
Set<String> filters = new HashSet<String>(first.getFilters());
|
||||
|
||||
Set<Allele> alleles = new HashSet<Allele>();
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>();
|
||||
double negLog10PError = -1;
|
||||
Set<String> filters = new HashSet<String>();
|
||||
Map<String, String> attributes = new HashMap<String, String>();
|
||||
int depth = 0;
|
||||
if ( first.hasAttribute(VCFRecord.DEPTH_KEY) )
|
||||
depth = Integer.valueOf(first.getAttribute(VCFRecord.DEPTH_KEY).toString());
|
||||
|
||||
// filtering values
|
||||
int nFiltered = 0;
|
||||
|
||||
// cycle through and add info from the other VCs, making sure the loc/reference matches
|
||||
while ( iter.hasNext() ) {
|
||||
VariantContext vc = iter.next();
|
||||
if ( !loc.equals(vc.getLocation()) || !first.getReference().equals(vc.getReference()) )
|
||||
return null;
|
||||
for ( VariantContext vc : VCs ) {
|
||||
if ( !loc.equals(vc.getLocation()) ) // || !first.getReference().equals(vc.getReference()) )
|
||||
throw new StingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString());
|
||||
|
||||
nFiltered += vc.isFiltered() ? 1 : 0;
|
||||
|
||||
alleles.addAll(vc.getAlleles());
|
||||
genotypes.putAll(vc.getGenotypes());
|
||||
|
||||
mergeGenotypes(genotypes, vc, mergeOptions.contains(MergeType.UNIQUIFY_GENOTYPES));
|
||||
|
||||
negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1);
|
||||
|
||||
|
|
@ -194,8 +221,74 @@ public class VariantContextUtils {
|
|||
depth += Integer.valueOf(vc.getAttribute(VCFRecord.DEPTH_KEY).toString());
|
||||
}
|
||||
|
||||
// if at least one record was unfiltered and we want a union, clear all of the filters
|
||||
if ( mergeOptions.contains(MergeType.UNION_VARIANTS) && nFiltered != VCs.size() )
|
||||
filters.clear();
|
||||
|
||||
// we care about where the call came from
|
||||
if ( annotateOrigin ) {
|
||||
String setValue = "";
|
||||
if ( nFiltered == 0 && VCs.size() == priorityListOfVCs.size() ) // nothing was unfiltered
|
||||
setValue = "Intersection";
|
||||
else if ( nFiltered == VCs.size() ) // everything was filtered out
|
||||
setValue = "FilteredInAll";
|
||||
else { // we are filtered in some subset
|
||||
List<String> s = new ArrayList<String>();
|
||||
for ( VariantContext vc : VCs )
|
||||
s.add( vc.isFiltered() ? "filterIn" + vc.getName() : vc.getName() );
|
||||
setValue = Utils.join("-", s);
|
||||
}
|
||||
|
||||
attributes.put("set", setValue);
|
||||
}
|
||||
|
||||
if ( depth > 0 )
|
||||
attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth));
|
||||
return new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
|
||||
}
|
||||
|
||||
static class CompareByPriority implements Comparator<VariantContext> {
|
||||
List<String> priorityListOfVCs;
|
||||
public CompareByPriority(List<String> priorityListOfVCs) {
|
||||
this.priorityListOfVCs = priorityListOfVCs;
|
||||
}
|
||||
|
||||
private int getIndex(VariantContext vc) {
|
||||
int i = priorityListOfVCs.indexOf(vc.getName());
|
||||
if ( i == -1 ) throw new StingException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getName());
|
||||
return i;
|
||||
}
|
||||
|
||||
public int compare(VariantContext vc1, VariantContext vc2) {
|
||||
return new Integer(getIndex(vc1)).compareTo(getIndex(vc2));
|
||||
}
|
||||
}
|
||||
|
||||
public static List<VariantContext> sortVariantContextsByPriority(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, EnumSet<MergeType> mergeOptions ) {
|
||||
if ( mergeOptions.contains(MergeType.PRIORITIZE_GENOTYPES) && priorityListOfVCs == null )
|
||||
throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list");
|
||||
|
||||
if ( priorityListOfVCs == null || mergeOptions.contains(MergeType.UNSORTED_GENOTYPES) )
|
||||
return new ArrayList<VariantContext>(unsortedVCs);
|
||||
else {
|
||||
ArrayList<VariantContext> sorted = new ArrayList<VariantContext>(unsortedVCs);
|
||||
Collections.sort(sorted, new CompareByPriority(priorityListOfVCs));
|
||||
return sorted;
|
||||
}
|
||||
}
|
||||
|
||||
private static void mergeGenotypes(Map<String, Genotype> mergedGenotypes, VariantContext oneVC, boolean uniqifySamples) {
|
||||
for ( Genotype g : oneVC.getGenotypes().values() ) {
|
||||
String name = mergedSampleName(oneVC.getName(), g.getSampleName(), uniqifySamples);
|
||||
if ( ! mergedGenotypes.containsKey(name) ) {
|
||||
// only add if the name is new
|
||||
Genotype newG = uniqifySamples ? g : new MutableGenotype(name, g);
|
||||
mergedGenotypes.put(name, newG);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) {
|
||||
return uniqify ? sampleName + "." + trackName : sampleName;
|
||||
}
|
||||
}
|
||||
|
|
@ -199,6 +199,16 @@ public class RefMetaDataTracker {
|
|||
return getAllVariantContexts(ref, null, null, false, false);
|
||||
}
|
||||
|
||||
    /**
     * Returns all of the variant contexts that start at the current location
     * (delegates to the five-argument overload with requireStartHere = true
     * and no type filter).
     *
     * @param ref         the reference context at the current locus
     * @param curLocation the location that returned records must start at
     * @return the variant contexts starting at curLocation
     */
    public Collection<VariantContext> getAllVariantContexts(ReferenceContext ref, GenomeLoc curLocation) {
        return getAllVariantContexts(ref, null, curLocation, true, false);
    }
|
||||
|
||||
/**
|
||||
* Converts all possible ROD tracks to VariantContexts objects. If allowedTypes != null, then only
|
||||
* VariantContexts in the allow set of types will be returned. If requireStartsHere is true, then curLocation
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ import java.util.*;
|
|||
* a feature codec for the VCF 4 specification. Our aim is to read in the records and convert to VariantContext as
|
||||
* quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
|
||||
*/
|
||||
public class VCF4Codec implements FeatureCodec {
|
||||
public class VCF4Codec implements FeatureCodec, NameAwareCodec {
|
||||
|
||||
// a variant context flag for original allele strings
|
||||
public static final String ORIGINAL_ALLELE_LIST = "ORIGINAL_ALLELE_LIST";
|
||||
|
|
@ -62,7 +62,7 @@ public class VCF4Codec implements FeatureCodec {
|
|||
private final boolean validateFromHeader = false;
|
||||
|
||||
// we store a name to give to each of the variant contexts we emit
|
||||
private String name = "Unkown";
|
||||
private String name = "Unknown";
|
||||
|
||||
private int lineNo = 0;
|
||||
|
||||
|
|
@ -224,12 +224,12 @@ public class VCF4Codec implements FeatureCodec {
|
|||
String[] split = str.split(",");
|
||||
for (String substring : split) {
|
||||
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
|
||||
objects.add(type.convert(substring));
|
||||
objects.add(type != null ? type.convert(substring) : substring);
|
||||
}
|
||||
value = objects;
|
||||
} else {
|
||||
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
|
||||
value = type.convert(str);
|
||||
value = type != null ? type.convert(str) : str;
|
||||
}
|
||||
//System.out.printf("%s %s%n", key, value);
|
||||
} else {
|
||||
|
|
@ -366,7 +366,7 @@ public class VCF4Codec implements FeatureCodec {
|
|||
* @return a variant context object
|
||||
*/
|
||||
private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) {
|
||||
try {
|
||||
// try {
|
||||
// increment the line count
|
||||
lineNo++;
|
||||
|
||||
|
|
@ -403,9 +403,10 @@ public class VCF4Codec implements FeatureCodec {
|
|||
}
|
||||
|
||||
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
||||
} catch ( Exception e ) {
|
||||
throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e);
|
||||
}
|
||||
// } catch ( Exception e ) {
|
||||
// // todo -- we need a local exception so that we can remember the location of the throw but also see the line
|
||||
// throw new RuntimeException("Line " + lineNo + " generated parser exception " + e.getMessage() + "\nLine: " + Utils.join("\t", parts), e);
|
||||
// }
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -545,7 +546,7 @@ public class VCF4Codec implements FeatureCodec {
|
|||
if (clazz != VCFHeader.class) throw new ClassCastException("expecting class " + clazz + " but VCF4Codec provides " + VCFHeader.class);
|
||||
return this.header;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* get the name of this codec
|
||||
* @return our set name
|
||||
|
|
|
|||
|
|
@ -120,4 +120,8 @@ public abstract class RMDTrack {
|
|||
    /**
     * Returns this track's header as the requested type.  This base
     * implementation carries no header and always returns null; subclasses
     * that have a header are expected to override it.
     *
     * @param clazz the expected header class
     * @return the header, or null if this track has none
     * @throws ClassCastException if the stored header is not of type clazz
     */
    public <HeaderType> HeaderType getHeader(Class<HeaderType> clazz) throws ClassCastException {
        return null;
    }

    /** Untyped variant of {@link #getHeader(Class)}; base implementation returns null. */
    public Object getHeader() {
        return null;
    }
|
||||
}
|
||||
|
|
@ -32,6 +32,7 @@ import org.broad.tribble.index.Index;
|
|||
import org.broad.tribble.index.linear.LinearIndex;
|
||||
import org.broad.tribble.index.linear.LinearIndexCreator;
|
||||
import org.broad.tribble.readers.BasicFeatureReader;
|
||||
import org.broad.tribble.vcf.NameAwareCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.TribbleTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException;
|
||||
|
|
@ -95,9 +96,13 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
|
|||
@Override
|
||||
public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException {
|
||||
// return a feature reader track
|
||||
Pair<BasicFeatureReader, SAMSequenceDictionary> pair = createFeatureReader(targetClass, inputFile);
|
||||
Pair<BasicFeatureReader, SAMSequenceDictionary> pair = createFeatureReader(targetClass, name, inputFile);
|
||||
if (pair == null) throw new StingException("Unable to make the feature reader for input file " + inputFile);
|
||||
return new TribbleTrack(targetClass, this.createByType(targetClass).getFeatureType(), name, inputFile, pair.first, pair.second);
|
||||
return new TribbleTrack(targetClass, createCodec(targetClass, name).getFeatureType(), name, inputFile, pair.first, pair.second);
|
||||
}
|
||||
|
||||
    /**
     * Backwards-compatible overload: creates a feature reader using the
     * placeholder track name "anonymous".
     *
     * @param targetClass the codec class to instantiate for this track
     * @param inputFile   the file to read features from
     * @return the reader / sequence-dictionary pair
     */
    public Pair<BasicFeatureReader, SAMSequenceDictionary> createFeatureReader(Class targetClass, File inputFile) {
        return createFeatureReader(targetClass, "anonymous", inputFile);
    }
|
||||
|
||||
/**
|
||||
|
|
@ -106,12 +111,12 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
|
|||
* @param inputFile the input file to create the track from (of the codec type)
|
||||
* @return the FeatureReader instance
|
||||
*/
|
||||
public Pair<BasicFeatureReader, SAMSequenceDictionary> createFeatureReader(Class targetClass, File inputFile) {
|
||||
public Pair<BasicFeatureReader, SAMSequenceDictionary> createFeatureReader(Class targetClass, String name, File inputFile) {
|
||||
Pair<BasicFeatureReader, SAMSequenceDictionary> pair = null;
|
||||
if (inputFile.getAbsolutePath().endsWith(".gz"))
|
||||
pair = createBasicFeatureReaderNoAssumedIndex(targetClass, inputFile);
|
||||
pair = createBasicFeatureReaderNoAssumedIndex(targetClass, name, inputFile);
|
||||
else
|
||||
pair = getLinearFeatureReader(targetClass, inputFile);
|
||||
pair = getLinearFeatureReader(targetClass, name, inputFile);
|
||||
return pair;
|
||||
}
|
||||
|
||||
|
|
@ -124,27 +129,34 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
|
|||
* @param inputFile the file to load
|
||||
* @return a feature reader implementation
|
||||
*/
|
||||
private Pair<BasicFeatureReader, SAMSequenceDictionary> createBasicFeatureReaderNoAssumedIndex(Class targetClass, File inputFile) {
|
||||
private Pair<BasicFeatureReader, SAMSequenceDictionary> createBasicFeatureReaderNoAssumedIndex(Class targetClass, String name, File inputFile) {
|
||||
// we might not know the index type, try loading with the default reader constructor
|
||||
logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file");
|
||||
try {
|
||||
return new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(),this.createByType(targetClass)),null);
|
||||
return new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(), createCodec(targetClass, name)),null);
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Unable to create feature reader from file " + inputFile);
|
||||
}
|
||||
}
|
||||
|
||||
private FeatureCodec createCodec(Class targetClass, String name) {
|
||||
FeatureCodec codex = this.createByType(targetClass);
|
||||
if ( codex instanceof NameAwareCodec )
|
||||
((NameAwareCodec)codex).setName(name);
|
||||
return codex;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a linear feature reader, where we create the index ahead of time
|
||||
* @param targetClass the target class
|
||||
* @param inputFile the tribble file to parse
|
||||
* @return the input file as a FeatureReader
|
||||
*/
|
||||
private Pair<BasicFeatureReader, SAMSequenceDictionary> getLinearFeatureReader(Class targetClass, File inputFile) {
|
||||
private Pair<BasicFeatureReader, SAMSequenceDictionary> getLinearFeatureReader(Class targetClass, String name, File inputFile) {
|
||||
Pair<BasicFeatureReader, SAMSequenceDictionary> reader;
|
||||
try {
|
||||
Index index = loadIndex(inputFile, this.createByType(targetClass), true);
|
||||
reader = new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(), index, this.createByType(targetClass)),index.getSequenceDictionary());
|
||||
Index index = loadIndex(inputFile, createCodec(targetClass, name), true);
|
||||
reader = new Pair<BasicFeatureReader, SAMSequenceDictionary>(new BasicFeatureReader(inputFile.getAbsolutePath(), index, createCodec(targetClass, name)),index.getSequenceDictionary());
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Unable to create reader with file " + inputFile, e);
|
||||
} catch (IOException e) {
|
||||
|
|
|
|||
|
|
@ -27,21 +27,18 @@ package org.broadinstitute.sting.gatk.walkers.vcf;
|
|||
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broad.tribble.vcf.VCFHeaderLine;
|
||||
import org.broad.tribble.vcf.VCFRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -51,191 +48,86 @@ import java.util.*;
|
|||
* priority list (if provided), emits a single record instance at every position represented in the rods.
|
||||
*/
|
||||
@Requires(value={})
|
||||
public class VCFCombine extends RodWalker<VCFRecord, VCFWriter> {
|
||||
public class VCFCombine extends RodWalker<Integer, Integer> {
|
||||
// the types of combinations we currently allow
|
||||
public enum COMBINATION_TYPE {
|
||||
UNION, MERGE
|
||||
}
|
||||
public enum ComboType { UNION, MERGE }
|
||||
@Argument(fullName="combination_type", shortName="type", doc="combination type; MERGE are supported", required=true)
|
||||
protected ComboType COMBO_TYPE;
|
||||
|
||||
@Argument(fullName="vcf_output_file", shortName="O", doc="VCF file to write results", required=false)
|
||||
protected File OUTPUT_FILE = null;
|
||||
|
||||
@Argument(fullName="combination_type", shortName="type", doc="combination type; currently UNION and MERGE are supported", required=true)
|
||||
protected COMBINATION_TYPE COMBO_TYPE;
|
||||
|
||||
@Argument(fullName="rod_priority_list", shortName="priority", doc="For the UNION combination type: a comma-separated string describing the priority ordering for the rods as far as which record gets emitted; a complete priority list MUST be provided", required=false)
|
||||
@Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=true)
|
||||
protected String PRIORITY_STRING = null;
|
||||
|
||||
@Argument(fullName="annotateUnion", shortName="A", doc="For the UNION combination type: if provided, the output union VCF will contain venn information about where each call came from", required=false)
|
||||
protected boolean annotateUnion = false;
|
||||
|
||||
private VCFWriter vcfWriter = null;
|
||||
private String[] priority = null;
|
||||
|
||||
// a map of rod name to uniquified sample name
|
||||
private HashMap<Pair<String, String>, String> rodNamesToSampleNames;
|
||||
private List<String> priority = null;
|
||||
protected EnumSet<VariantContextUtils.MergeType> mergeOptions;
|
||||
|
||||
protected final static EnumSet<VariantContextUtils.MergeType> mergeTypeOptions = EnumSet.of(VariantContextUtils.MergeType.UNION_VARIANTS, VariantContextUtils.MergeType.UNIQUIFY_GENOTYPES);
|
||||
protected final static EnumSet<VariantContextUtils.MergeType> unionTypeOptions = EnumSet.of(VariantContextUtils.MergeType.UNION_VARIANTS, VariantContextUtils.MergeType.PRIORITIZE_GENOTYPES);
|
||||
|
||||
public void initialize() {
|
||||
|
||||
//Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
//hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
|
||||
vcfWriter = new VCFWriter(out);
|
||||
priority = new ArrayList<String>(Arrays.asList(PRIORITY_STRING.split(",")));
|
||||
|
||||
validateAnnotateUnionArguments(priority);
|
||||
mergeOptions = COMBO_TYPE == ComboType.MERGE ? mergeTypeOptions : unionTypeOptions;
|
||||
Set<String> samples = getSampleList(SampleUtils.getRodsWithVCFHeader(getToolkit(), null), mergeOptions);
|
||||
|
||||
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
|
||||
metaData.add(new VCFHeaderLine("source", "VCF"));
|
||||
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
|
||||
if ( OUTPUT_FILE != null )
|
||||
vcfWriter = new VCFWriter(OUTPUT_FILE);
|
||||
else
|
||||
vcfWriter = new VCFWriter(out);
|
||||
|
||||
if ( PRIORITY_STRING != null )
|
||||
priority = PRIORITY_STRING.split(",");
|
||||
|
||||
Set<String> samples;
|
||||
switch (COMBO_TYPE ) {
|
||||
case MERGE:
|
||||
samples = new TreeSet<String>();
|
||||
rodNamesToSampleNames = new HashMap<Pair<String, String>, String>();
|
||||
// get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap)
|
||||
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames);
|
||||
break;
|
||||
case UNION:
|
||||
if ( annotateUnion ) {
|
||||
validateAnnotateUnionArguments(priority);
|
||||
}
|
||||
|
||||
samples = SampleUtils.getUniqueSamplesFromRods(getToolkit());
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("Unsupported combination type: " + COMBO_TYPE);
|
||||
}
|
||||
|
||||
vcfWriter.writeHeader(new VCFHeader(hInfo, samples));
|
||||
metaData.add(new VCFHeaderLine("source", "VCFCombine"));
|
||||
vcfWriter.writeHeader(new VCFHeader(metaData, samples));
|
||||
}
|
||||
|
||||
private void validateAnnotateUnionArguments(String[] priority) {
|
||||
Set<RMDTrack> rods = VCFUtils.getRodVCFs(getToolkit());
|
||||
if ( priority == null || rods.size() != priority.length ) {
|
||||
throw new StingException("A complete priority list must be provided when annotateUnion is provided");
|
||||
}
|
||||
if ( priority.length != 2 ) {
|
||||
throw new StingException("When annotateUnion is provided only 2 VCF files can be merged");
|
||||
}
|
||||
|
||||
for ( String p : priority ) {
|
||||
boolean good = false;
|
||||
for ( RMDTrack data : rods ) {
|
||||
if ( p.equals(data.getName()) )
|
||||
good = true;
|
||||
private Set<String> getSampleList(Map<String, VCFHeader> headers, EnumSet<VariantContextUtils.MergeType> mergeOptions ) {
|
||||
Set<String> samples = new HashSet<String>();
|
||||
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
|
||||
VCFHeader header = val.getValue();
|
||||
for ( String sample : header.getGenotypeSamples() ) {
|
||||
samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOptions.contains(VariantContextUtils.MergeType.UNIQUIFY_GENOTYPES)));
|
||||
}
|
||||
if ( ! good ) throw new StingException("Priority item not provided as a ROD! " + p);
|
||||
}
|
||||
|
||||
return samples;
|
||||
}
|
||||
|
||||
public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
private void validateAnnotateUnionArguments(List<String> priority) {
|
||||
Set<String> rodNames = SampleUtils.getRodsNamesWithVCFHeader(getToolkit(), null);
|
||||
if ( priority == null || rodNames.size() != priority.size() )
|
||||
throw new StingException("A complete priority list must be provided when annotateUnion is provided");
|
||||
|
||||
if ( ! rodNames.containsAll(rodNames) )
|
||||
throw new StingException("Not all priority elements provided as input RODs: " + PRIORITY_STRING);
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null ) // RodWalkers can make funky map calls
|
||||
return null;
|
||||
return 0;
|
||||
|
||||
// get all of the vcf rods at this locus
|
||||
Map<VCFRecord, String> vcfRods = new LinkedHashMap<VCFRecord,String>();
|
||||
Iterator<GATKFeature> rods = tracker.getAllRods().iterator();
|
||||
while (rods.hasNext()) {
|
||||
GATKFeature feat = rods.next();
|
||||
Object rod = feat.getUnderlyingObject();
|
||||
if ( rod instanceof VCFRecord )
|
||||
vcfRods.put((VCFRecord)rod,feat.getName());
|
||||
}
|
||||
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation());
|
||||
VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, mergeOptions, true);
|
||||
if ( mergedVC != null ) // only operate at the start of events
|
||||
if ( ! mergedVC.isMixed() ) // todo remove restriction when VCF4 writer is fixed
|
||||
vcfWriter.add(mergedVC, ref.getBases());
|
||||
else
|
||||
logger.info(String.format("Ignoring complex event: " + mergedVC));
|
||||
|
||||
if ( vcfRods.size() == 0 )
|
||||
return null;
|
||||
|
||||
VCFRecord record = null;
|
||||
switch (COMBO_TYPE ) {
|
||||
case MERGE:
|
||||
record = VCFUtils.mergeRecords(vcfRods, rodNamesToSampleNames);
|
||||
break;
|
||||
case UNION:
|
||||
record = vcfUnion(vcfRods);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return record;
|
||||
return vcs.isEmpty() ? 0 : 1;
|
||||
}
|
||||
|
||||
public VCFWriter reduceInit() {
|
||||
return vcfWriter;
|
||||
    /** @return the initial value (zero) for the Integer reduce accumulator. */
    public Integer reduceInit() {
        return 0;
    }
|
||||
|
||||
private VCFRecord vcfUnion(Map<VCFRecord, String> rods) {
|
||||
if ( priority == null )
|
||||
return rods.keySet().iterator().next();
|
||||
|
||||
if ( annotateUnion ) {
|
||||
Map<String, VCFRecord> rodMap = new HashMap<String, VCFRecord>();
|
||||
for ( VCFRecord vcf : rods.keySet() ) {
|
||||
rodMap.put(rods.get(vcf),vcf);
|
||||
}
|
||||
|
||||
String priority1 = priority[0];
|
||||
String priority2 = priority[1];
|
||||
VCFRecord vcf1 = rodMap.containsKey(priority1) ? rodMap.get(priority1) : null;
|
||||
VCFRecord vcf2 = rodMap.containsKey(priority2) ? rodMap.get(priority2) : null;
|
||||
|
||||
// for simplicity, we are setting set and call for vcf1
|
||||
String set = priority1;
|
||||
VCFRecord call = vcf1;
|
||||
|
||||
if ( vcf1 == null ) {
|
||||
if ( vcf2 == null )
|
||||
//return null;
|
||||
throw new StingException("BUG: VCF1 and VCF2 are both null!");
|
||||
else {
|
||||
set = priority2;
|
||||
call = vcf2;
|
||||
}
|
||||
} else if ( vcf1.isFiltered() ) {
|
||||
if ( vcf2 != null ) {
|
||||
if ( vcf2.isFiltered() ) {
|
||||
set = "filteredInBoth";
|
||||
} else {
|
||||
set = priority2 + "-filteredInOther";
|
||||
call = vcf2;
|
||||
}
|
||||
}
|
||||
} else { // good call
|
||||
if ( vcf2 != null ) {
|
||||
if ( vcf2.isFiltered() )
|
||||
set = priority1 + "-filteredInOther";
|
||||
else
|
||||
set = "Intersection";
|
||||
}
|
||||
}
|
||||
|
||||
call.addInfoField("set", set);
|
||||
return call;
|
||||
} else {
|
||||
for ( String rodname : priority ) {
|
||||
for ( VCFRecord rod : rods.keySet() ) {
|
||||
if ( rods.get(rod).equals(rodname) )
|
||||
return rod;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
public Integer reduce(Integer counter, Integer sum) {
|
||||
return counter + sum;
|
||||
}
|
||||
|
||||
public VCFWriter reduce(VCFRecord record, VCFWriter writer) {
|
||||
if ( record != null )
|
||||
writer.addRecord(record);
|
||||
return writer;
|
||||
}
|
||||
|
||||
public void onTraversalDone(VCFWriter writer) {
|
||||
if ( writer != null )
|
||||
writer.close();
|
||||
    /**
     * Traversal cleanup: closes the output VCF writer if one was opened.
     *
     * @param sum the final reduce total (unused here)
     */
    public void onTraversalDone(Integer sum) {
        if ( vcfWriter != null )
            vcfWriter.close();
    }
|
||||
}
|
||||
|
|
@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils;
|
|||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broad.tribble.vcf.VCFRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
|
|
@ -87,6 +86,19 @@ public class SampleUtils {
|
|||
public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
|
||||
Set<String> samples = new TreeSet<String>();
|
||||
|
||||
for ( VCFHeader header : getRodsWithVCFHeader(toolkit, rodNames).values() )
|
||||
samples.addAll(header.getGenotypeSamples());
|
||||
|
||||
return samples;
|
||||
}
|
||||
|
||||
public static Set<String> getRodsNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
|
||||
return getRodsWithVCFHeader(toolkit, rodNames).keySet();
|
||||
}
|
||||
|
||||
public static Map<String, VCFHeader> getRodsWithVCFHeader(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
|
||||
Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
|
|
@ -95,13 +107,23 @@ public class SampleUtils {
|
|||
continue;
|
||||
|
||||
RMDTrack rod = source.getReferenceOrderedData();
|
||||
if ( rod.getRecordType().equals(VCFRecord.class) )
|
||||
samples.addAll(rod.getHeader(VCFHeader.class).getGenotypeSamples());
|
||||
if ( containsVCFHeader(rod) )
|
||||
data.put(rod.getName(), rod.getHeader(VCFHeader.class));
|
||||
}
|
||||
|
||||
return samples;
|
||||
return data;
|
||||
}
|
||||
|
||||
    // todo -- remove when we can actually just get the header itself from tribble
    /**
     * Reports whether the rod can produce a VCFHeader.  Implemented by
     * probing the typed getHeader call: a null result or a
     * ClassCastException (header present but of another type) both mean
     * "no VCF header".
     */
    private static boolean containsVCFHeader(RMDTrack rod) {
        try {
            return rod.getHeader(VCFHeader.class) != null;
        } catch ( ClassCastException e ) {
            // header exists but is not a VCFHeader
            return false;
        }
    }
|
||||
|
||||
|
||||
/**
|
||||
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
|
||||
* (e.g. sampleX.1, sampleX.2, ...)
|
||||
|
|
@ -118,14 +140,11 @@ public class SampleUtils {
|
|||
HashMap<String, Integer> sampleOverlapMap = new HashMap<String, Integer>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
RMDTrack rod = source.getReferenceOrderedData();
|
||||
if ( rod.getRecordType().equals(VCFRecord.class) ) {
|
||||
Set<String> vcfSamples = rod.getHeader(VCFHeader.class).getGenotypeSamples();
|
||||
for ( String sample : vcfSamples )
|
||||
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName());
|
||||
}
|
||||
|
||||
for ( Map.Entry<String, VCFHeader> pair : getRodsWithVCFHeader(toolkit, null).entrySet() ) {
|
||||
Set<String> vcfSamples = pair.getValue().getGenotypeSamples();
|
||||
for ( String sample : vcfSamples )
|
||||
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue