Totally revamped the MNP annotation and put it in its own walker: AnnotateMNPsWalker

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4730 348d0f76-0448-11de-a6fe-93d51630548a
2010-11-24 18:05:10 +00:00 · 2010-11-24 18:05:10 +00:00 · ca70ed611c
parent 6b70cde0b9
commit ca70ed611c
5 changed files with 961 additions and 159 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java
@ -0,0 +1,827 @@
+/*
+ * Copyright (c) 2010, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.phasing;
+
+import org.broad.tribble.util.variantcontext.Allele;
+import org.broad.tribble.util.variantcontext.VariantContext;
+import org.broad.tribble.vcf.*;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
+import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcid;
+import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcidTable;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.vcf.VCFUtils;
+
+import java.util.*;
+
+import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods;
+
+
+/**
+ * Walks along all variant ROD loci, and dynamically annotates alleles at MNP records.
+ */
+@Allows(value = {DataSource.REFERENCE})
+@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = AnnotateMNPsWalker.REFSEQ_ROD_NAME, type = AnnotatorInputTableFeature.class), @RMD(name = AnnotateMNPsWalker.VARIANT_ROD_NAME, type = ReferenceOrderedDatum.class)})
+
+public class AnnotateMNPsWalker extends RodWalker<Integer, Integer> {
+
+    @Output(doc = "File to which variants should be written", required = true)
+    protected VCFWriter writer = null;
+    private ManualSortingVCFWriter sortingWriter = null;
+
+    private LinkedList<String> rodNames = null;
+    private GenomeLocParser locParser = null;
+    private TreeMap<GenomeLoc, Set<GenomeLoc>> MNPstartToStops = null; // Must be TreeMap sorted by START sites!
+
+    public final static String REFSEQ_ROD_NAME = "refseq";
+    public final static String VARIANT_ROD_NAME = "variant";
+
+    private LocusToFeatures locusToRefSeqFeatures = null;
+
+
+    protected final static String MNP_ANNOTATION_KEY_PREFIX = "MNP.refseq.";
+
+    protected final static String REFSEQ_NAME = "name";
+    protected final static String REFSEQ_NAME2 = "name2";
+
+    protected final static String REFSEQ_POSITION_TYPE = "positionType";
+    protected final static String REFSEQ_CDS = "CDS";
+
+    protected final static String REFSEQ_STRAND = "transcriptStrand";
+    protected final static String REFSEQ_POS_STRAND = "+";
+    protected final static String REFSEQ_NEG_STRAND = "-";
+
+    protected final static String REFSEQ_CODON_COORD = "codonCoord";
+    protected final static String REFSEQ_CODING_FRAME = "frame";
+
+    protected final static String REFSEQ_REF_CODON = "referenceCodon";
+    protected final static String REFSEQ_REF_AA = "referenceAA";
+
+    protected final static String REFSEQ_ALT_BASE = "haplotypeAlternate";
+
+    protected final static String REFSEQ_VARIANT_CODON = "variantCodon";
+    protected final static String REFSEQ_VARIANT_AA = "variantAA";
+    protected final static String REFSEQ_CHANGES_AA = "changesAA";
+    protected final static String REFSEQ_FUNCTIONAL_CLASS = "functionalClass";
+    protected final static String REFSEQ_PROTEIN_COORD_DESCRIPTION = "proteinCoordStr";
+
+    protected final static String REFSEQ_CODING_ANNOTATIONS = "codingVariants";
+
+    public void initialize() {
+        rodNames = new LinkedList<String>();
+        rodNames.add(VARIANT_ROD_NAME);
+
+        locParser = getToolkit().getGenomeLocParser();
+        MNPstartToStops = new TreeMap<GenomeLoc, Set<GenomeLoc>>(); // sorted by start sites
+
+        initializeVcfWriter();
+
+        locusToRefSeqFeatures = new LocusToFeatures();
+    }
+
+    private void initializeVcfWriter() {
+        sortingWriter = new ManualSortingVCFWriter(writer);
+        writer = sortingWriter;
+
+        // setup the header fields:
+        Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
+        hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
+        hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
+
+        Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames);
+        writer.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples())));
+    }
+
+    public boolean generateExtendedEvents() {
+        return false;
+    }
+
+    public Integer reduceInit() {
+        return 0;
+    }
+
+    /**
+     * For each site of interest, annotate it if it's a MNP.
+     *
+     * @param tracker the meta-data tracker
+     * @param ref     the reference base
+     * @param context the context for the given locus
+     * @return count of MNPs observed
+     */
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        if (tracker == null)
+            return null;
+
+        int numMNPsObserved = 0;
+        GenomeLoc curLocus = ref.getLocus();
+        clearOldLocusFeatures(curLocus);
+
+        boolean requireStartHere = false; // see EVERY site of the MNP
+        boolean takeFirstOnly = false; // take as many entries as the VCF file has
+        for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) {
+            if (vc.getType() == VariantContext.Type.MNP) {
+                GenomeLoc MNPloc = VariantContextUtils.getLocation(locParser, vc);
+                boolean atStartOfMNP = curLocus.getStart() == MNPloc.getStart();
+                boolean atEndOfMNP = curLocus.getStart() == MNPloc.getStop();
+                logger.debug("Observed MNP at " + MNPloc);
+
+                if (isChrM(vc)) {
+                    if (atStartOfMNP) {
+                        logger.warn("Skipping mitochondrial MNP at " + MNPloc + " due to complexity of coding table [need to know if first codon, etc.]...");
+                        writeVCF(vc);
+                    }
+                    continue;
+                }
+
+                GenomeLoc stopLoc = locParser.createGenomeLoc(curLocus.getContig(), MNPloc.getStop());
+                final List<Object> refSeqRODs = tracker.getReferenceMetaData(REFSEQ_ROD_NAME);
+                for (Object refSeqObject : refSeqRODs) {
+                    AnnotatorInputTableFeature refSeqAnnotation = (AnnotatorInputTableFeature) refSeqObject;
+                    locusToRefSeqFeatures.putLocusFeatures(curLocus, refSeqAnnotation, stopLoc);
+                }
+
+                if (atStartOfMNP) { // MNP is starting here, so register that we're waiting for it
+                    Set<GenomeLoc> stopLocs = MNPstartToStops.get(curLocus);
+                    if (stopLocs == null) {
+                        stopLocs = new HashSet<GenomeLoc>();
+                        MNPstartToStops.put(curLocus, stopLocs);
+                    }
+                    stopLocs.add(stopLoc);
+                }
+
+                if (atEndOfMNP) {
+                    numMNPsObserved++; // only count a MNP at its stop site
+                    logger.debug("Observed end of MNP at " + curLocus);
+                    logger.debug("Current list of per-locus features\n" + locusToRefSeqFeatures);
+
+                    Map<String, Object> MNPannotations = annotateMNP(vc);
+                    MNPannotations.putAll(vc.getAttributes());
+                    vc = VariantContext.modifyAttributes(vc, MNPannotations);
+                    writeVCF(vc);
+
+                    GenomeLoc startLoc = locParser.createGenomeLoc(curLocus.getContig(), MNPloc.getStart());
+                    Set<GenomeLoc> stopLocs = MNPstartToStops.get(startLoc);
+                    if (stopLocs != null) { // otherwise, just removed stopLocs due to another MNP that has the same (start, stop)
+                        stopLocs.remove(stopLoc);
+                        if (stopLocs.isEmpty()) // no longer waiting for startLoc
+                            MNPstartToStops.remove(startLoc);
+                    }
+                }
+            }
+            else {
+                writeVCF(vc);
+            }
+        }
+
+        Integer mostUpstreamWritableLoc = null;
+        if (!MNPstartToStops.isEmpty()) {
+            GenomeLoc waitingForLoc = MNPstartToStops.entrySet().iterator().next().getKey();
+            mostUpstreamWritableLoc = waitingForLoc.getStart() - 1;
+        }
+        sortingWriter.setmostUpstreamWritableLocus(mostUpstreamWritableLoc);
+
+        return numMNPsObserved;
+    }
+
+    private static boolean isChrM(final VariantContext vc) {
+        return vc.getChr().equals("chrM") || vc.getChr().equals("MT");
+    }
+
+    private Map<String, Object> annotateMNP(VariantContext vc) {
+        Map<String, Object> annotations = new HashMap<String, Object>();
+
+        RefSeqNameToFeatures nameToPositionalFeatures = new RefSeqNameToFeatures(vc);
+        MNPannotationKeyBuilder kb = new MNPannotationKeyBuilder(nameToPositionalFeatures);
+
+        for (Map.Entry<String, RefSeqFeatureList> nameToFeatureEntry : nameToPositionalFeatures.entrySet()) {
+            String featureName = nameToFeatureEntry.getKey();
+            RefSeqFeatureList feature = nameToFeatureEntry.getValue();
+            CodonAnnotationsForAltAlleles codonAnnotationsForAlleles = new CodonAnnotationsForAltAlleles(vc, feature);
+
+            annotations.put(kb.getKey(REFSEQ_CODING_ANNOTATIONS), codonAnnotationsForAlleles.toString());
+            annotations.put(kb.getKey(REFSEQ_NAME), featureName);
+            annotations.put(kb.getKey(REFSEQ_NAME2), feature.name2);
+            annotations.put(kb.getKey(REFSEQ_POSITION_TYPE), REFSEQ_CDS);
+            annotations.put(kb.getKey(REFSEQ_STRAND), (feature.positiveStrand ? REFSEQ_POS_STRAND : REFSEQ_NEG_STRAND));
+            annotations.put(kb.getKey(REFSEQ_CODON_COORD), feature.getCodonCoordString());
+
+            kb.incrementFeatureIndex();
+        }
+
+        return annotations;
+    }
+
+    private static class MNPannotationKeyBuilder {
+        private int featureIndex;
+        private boolean multipleEntries;
+
+        public MNPannotationKeyBuilder(RefSeqNameToFeatures nameToPositionalFeatures) {
+            this.featureIndex = 1;
+            this.multipleEntries = nameToPositionalFeatures.nameToFeatures.size() > 1;
+        }
+
+        public void incrementFeatureIndex() {
+            featureIndex++;
+        }
+
+        public String getKey(String type) {
+            String annotationKey = MNP_ANNOTATION_KEY_PREFIX + type;
+            if (multipleEntries)
+                annotationKey += "_" + featureIndex;
+            return annotationKey;
+        }
+    }
+
+    private static byte[] ByteArrayToPrimitive(Byte[] nonNullArray) {
+        byte[] primArray = new byte[nonNullArray.length];
+
+        for (int i = 0; i < nonNullArray.length; i++) {
+            if (nonNullArray[i] == null)
+                throw new ReviewedStingException("nonNullArray[i] == null");
+            primArray[i] = nonNullArray[i];
+        }
+
+        return primArray;
+    }
+
+    private void clearOldLocusFeatures(GenomeLoc curLoc) {
+        Iterator<Map.Entry<GenomeLoc, PositionalRefSeqFeatures>> locusFeaturesIt = locusToRefSeqFeatures.entrySet().iterator();
+        while (locusFeaturesIt.hasNext()) {
+            Map.Entry<GenomeLoc, PositionalRefSeqFeatures> locusFeaturesEntry = locusFeaturesIt.next();
+            if (curLoc.isPast(locusFeaturesEntry.getValue().getFurthestLocusUsingFeatures()))
+                locusFeaturesIt.remove();
+        }
+    }
+
+    public Integer reduce(Integer count, Integer total) {
+        if (count != null)
+            total = total + count;
+
+        return total;
+    }
+
+    /**
+     * @param result the number of MNPs processed.
+     */
+    public void onTraversalDone(Integer result) {
+        System.out.println("Number of MNPs observed: " + result);
+        writer.close();
+    }
+
+    private void writeVCF(VariantContext vc) {
+        byte refBase;
+        if (!vc.isIndel()) {
+            Allele varAllele = vc.getReference();
+            refBase = SNPallelePair.getSingleBase(varAllele);
+        }
+        else {
+            refBase = vc.getReferenceBaseForIndel();
+        }
+
+        writer.add(vc, refBase);
+    }
+
+    /*
+     Inner classes:
+     */
+
+    // Maps: RefSeq entry name -> features for ALL positions of a particular VariantContext MNP:
+
+    private class RefSeqNameToFeatures {
+        private Map<String, RefSeqFeatureList> nameToFeatures;
+
+        public RefSeqNameToFeatures(VariantContext vc) {
+            this.nameToFeatures = new HashMap<String, RefSeqFeatureList>();
+
+            int MNPstart = vc.getStart();
+            int MNPstop = vc.getEnd();
+            int MNPlength = MNPstop - MNPstart + 1;
+
+            for (int i = 0; i < MNPlength; i++) {
+                int genomicPosition = MNPstart + i;
+                GenomeLoc posLoc = locParser.createGenomeLoc(vc.getChr(), genomicPosition);
+
+                PositionalRefSeqFeatures locFeatures = locusToRefSeqFeatures.getLocusFeatures(posLoc);
+                if (locFeatures == null) // no features for posLoc
+                    continue;
+
+                for (Map.Entry<String, PositionalRefSeqFeature> nameToFeatureEntry : locFeatures.entrySet()) {
+                    String name = nameToFeatureEntry.getKey();
+                    PositionalRefSeqFeature posFeature = nameToFeatureEntry.getValue();
+
+                    RefSeqFeatureList featureList = nameToFeatures.get(name);
+                    if (featureList == null) {
+                        featureList = new RefSeqFeatureList(MNPlength);
+                        nameToFeatures.put(name, featureList);
+                    }
+                    featureList.updateFeatureAtPosition(i, posFeature);
+                }
+            }
+        }
+
+        public Set<Map.Entry<String, RefSeqFeatureList>> entrySet() {
+            return nameToFeatures.entrySet();
+        }
+    }
+
+    // For a particular RefSeq entry, contains the features for ALL positions of a particular VariantContext MNP
+
+    private static class RefSeqFeatureList {
+        private final static String CODON_FRAME_START = "(";
+        private final static String CODON_FRAME_END = ")";
+        private final static String CODON_DELIM = "|";
+
+        private CodingRefSeqFeature[] refSeqFeatures;
+        private String name2;
+        private Boolean positiveStrand;
+
+        private Map<Integer, List<Integer>> codonToIndices; // Map of: codon index -> MNP indices that refer to codon
+
+        public RefSeqFeatureList(int MNPlength) {
+            this.refSeqFeatures = new CodingRefSeqFeature[MNPlength];
+            for (int i = 0; i < MNPlength; i++)
+                this.refSeqFeatures[i] = null;
+
+            this.name2 = null;
+            this.positiveStrand = null;
+            this.codonToIndices = new TreeMap<Integer, List<Integer>>();
+        }
+
+        public void updateFeatureAtPosition(int index, PositionalRefSeqFeature feature) {
+            if (name2 == null) {
+                name2 = feature.name2;
+                positiveStrand = feature.positiveStrand;
+            }
+            else if (!name2.equals(feature.name2) || positiveStrand != feature.positiveStrand) {
+                throw new UserException("Inconsistency between previous RefSeq entry and: " + feature);
+            }
+
+            CodingRefSeqFeature crsf = new CodingRefSeqFeature(feature);
+            refSeqFeatures[index] = crsf;
+
+            List<Integer> indicesWithCodon = codonToIndices.get(crsf.codonCoord);
+            if (indicesWithCodon == null) {
+                indicesWithCodon = new LinkedList<Integer>();
+                codonToIndices.put(crsf.codonCoord, indicesWithCodon);
+            }
+            indicesWithCodon.add(index);
+        }
+
+        public Set<Map.Entry<Integer, List<Integer>>> codonIndicesEntrySet() {
+            return codonToIndices.entrySet();
+        }
+
+        public String getCodonCoordString() {
+            StringBuilder sb = new StringBuilder();
+
+            for (int i = 0; i < refSeqFeatures.length; i++) {
+                CodingRefSeqFeature crsf = refSeqFeatures[i];
+                if (crsf != null)
+                    sb.append(crsf.codonCoord).append(CODON_FRAME_START).append(crsf.codingFrame).append(CODON_FRAME_END);
+                if (i < refSeqFeatures.length - 1)
+                    sb.append(CODON_DELIM);
+            }
+
+            return sb.toString();
+        }
+    }
+
+    private static class CodingRefSeqFeature {
+        protected int codonCoord;
+        protected int codingFrame;
+        protected String referenceCodon;
+        protected String referenceAA;
+
+        public CodingRefSeqFeature(PositionalRefSeqFeature feature) {
+            this.codonCoord = feature.codonCoord;
+            this.codingFrame = feature.codingFrame;
+            this.referenceCodon = feature.referenceCodon.toUpperCase();
+            this.referenceAA = feature.referenceAA;
+        }
+    }
+
+    private static class CodonAnnotationsForAltAlleles {
+        protected final static int MIN_CODON_INDEX = 0;
+        protected final static int NUM_CODON_INDICES = 3;
+        private final static String CODON_ANNOTATION_DELIM = ",";
+
+        private List<SingleCodonAnnotationsForAlleles> alleleAnnotations;
+
+        public CodonAnnotationsForAltAlleles(VariantContext vc, RefSeqFeatureList feature) {
+            this.alleleAnnotations = new LinkedList<SingleCodonAnnotationsForAlleles>();
+
+            int MNPstart = vc.getStart();
+            int MNPstop = vc.getEnd();
+            int MNPlength = MNPstop - MNPstart + 1;
+
+            for (Map.Entry<Integer, List<Integer>> codonToIndicesEntry : feature.codonIndicesEntrySet()) {
+                int codonIndex = codonToIndicesEntry.getKey();
+                List<Integer> indices = codonToIndicesEntry.getValue();
+                if (indices.isEmpty())
+                    throw new ReviewedStingException("indices should not exist if it's empty!");
+
+                for (int index : indices) {
+                    int frame = feature.refSeqFeatures[index].codingFrame;
+                    if (feature.refSeqFeatures[index].codonCoord != codonIndex)
+                        throw new ReviewedStingException("LOGICAL ERROR: feature.refSeqFeatures[index].codonCoord != codonIndex");
+                    if (frame < MIN_CODON_INDEX || frame >= NUM_CODON_INDICES)
+                        throw new UserException("RefSeq codon frame not one of {0,1,2}");
+                }
+                CodingRefSeqFeature firstFeatureForCodon = feature.refSeqFeatures[indices.get(0)];
+                String refCodon = firstFeatureForCodon.referenceCodon;
+
+                SingleCodonAnnotationsForAlleles codonAnnotation = new SingleCodonAnnotationsForAlleles(codonIndex, vc.getAlternateAlleles(), MNPlength, refCodon, firstFeatureForCodon, indices, feature);
+                alleleAnnotations.add(codonAnnotation);
+            }
+        }
+
+        public String toString() {
+            StringBuilder sb = new StringBuilder();
+
+            int index = 0;
+            for (SingleCodonAnnotationsForAlleles codonToAlleles : alleleAnnotations) {
+                sb.append(codonToAlleles);
+                if (index < alleleAnnotations.size() - 1)
+                    sb.append(CODON_ANNOTATION_DELIM);
+                index++;
+            }
+
+            return sb.toString();
+        }
+    }
+
+    private static class SingleCodonAnnotationsForAlleles {
+        private final static String CODON_MAP_SYMBOL = "->";
+        private final static String CODON_ANNOTATION_START = "[";
+        private final static String CODON_ANNOTATION_END = "]";
+        private final static String REF_CODON_INFO_DELIM = "|";
+        private final static String ALLELE_ANNOTATION_DELIM = ",";
+        private final static String ASSIGNMENT = ":";
+
+        private int codonIndex;
+        private String refCodon;
+        private String refAA;
+
+        private List<SingleCodonAnnotationsForAllele> annotationsForAlleles;
+
+        public SingleCodonAnnotationsForAlleles(int codonIndex, Collection<Allele> altAlleles, int MNPlength, String refCodon, CodingRefSeqFeature firstFeatureForCodon, List<Integer> indices, RefSeqFeatureList feature) {
+            if (refCodon.length() != CodonAnnotationsForAltAlleles.NUM_CODON_INDICES)
+                throw new UserException("RefSeq reference codon " + refCodon + " is not of length " + CodonAnnotationsForAltAlleles.NUM_CODON_INDICES);
+
+            AminoAcid refAA = AminoAcidTable.getEukaryoticAA(refCodon);
+            if (!refAA.getCode().equals(firstFeatureForCodon.referenceAA))
+                throw new UserException("RefSeq: translated reference codon= " + refAA + " != " + firstFeatureForCodon.referenceAA + " = reference AA");
+
+            this.codonIndex = codonIndex;
+            this.refCodon = refCodon;
+            this.refAA = refAA.getCode();
+            this.annotationsForAlleles = new LinkedList<SingleCodonAnnotationsForAllele>();
+
+            for (Allele altAllele : altAlleles) {
+                if (altAllele.length() != MNPlength)
+                    throw new ReviewedStingException("length(altAllele) != length(MNP)");
+                byte[] altBases = altAllele.getBases();
+
+                Byte[] variantCodonArr = new Byte[CodonAnnotationsForAltAlleles.NUM_CODON_INDICES];
+                for (int i = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; i < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; i++)
+                    variantCodonArr[i] = null;
+
+                for (int index : indices) {
+                    int frame = feature.refSeqFeatures[index].codingFrame;
+                    if (variantCodonArr[frame] != null)
+                        throw new UserException("RefSeq assigns codon " + codonIndex + " twice at same frame: " + frame);
+
+                    byte base = altBases[index];
+                    if (!feature.positiveStrand) // negative strand codon
+                        base = BaseUtils.simpleComplement(base);
+
+                    variantCodonArr[frame] = base;
+                }
+
+                /* For missing frames, there MUST exist AT LEAST one index that refers to this codon,
+                  so use it to derive the missing bases [ALREADY complemented if on the negative strand]:
+                */
+                for (int frame = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; frame < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; frame++) {
+                    if (variantCodonArr[frame] == null)
+                        variantCodonArr[frame] = (byte) refCodon.charAt(frame);
+                }
+                String variantCodon = new String(ByteArrayToPrimitive(variantCodonArr)).toUpperCase();
+
+                SingleCodonAnnotationsForAllele alleleAnnotation = new SingleCodonAnnotationsForAllele(variantCodon, refCodon, refAA, codonIndex);
+                annotationsForAlleles.add(alleleAnnotation);
+            }
+        }
+
+        public String toString() {
+            StringBuilder sb = new StringBuilder();
+
+            sb.append(codonIndex).append(CODON_MAP_SYMBOL).append(CODON_ANNOTATION_START);
+            sb.append(REFSEQ_REF_CODON).append(ASSIGNMENT).append(refCodon).append(REF_CODON_INFO_DELIM);
+            sb.append(REFSEQ_REF_AA).append(ASSIGNMENT).append(refAA).append(REF_CODON_INFO_DELIM);
+
+            int index = 0;
+            for (SingleCodonAnnotationsForAllele annotation : annotationsForAlleles) {
+                sb.append(annotation);
+                if (index < annotationsForAlleles.size() - 1)
+                    sb.append(ALLELE_ANNOTATION_DELIM);
+                index++;
+            }
+            sb.append(CODON_ANNOTATION_END);
+
+            return sb.toString();
+        }
+    }
+
+    private static class SingleCodonAnnotationsForAllele {
+        private final static String ALLELE_START = "{";
+        private final static String ALLELE_END = "}";
+        private final static String CODON_INFO_DELIM = "|";
+        private final static String ASSIGNMENT = ":";
+
+        private String variantCodon;
+        private String variantAA;
+        private boolean changesAA;
+        private String functionalClass;
+        private String proteinCoordStr;
+
+        public SingleCodonAnnotationsForAllele(String variantCodon, String refCodon, AminoAcid refAA, int codonIndex) {
+            this.variantCodon = variantCodon;
+            AminoAcid variantAA = AminoAcidTable.getEukaryoticAA(this.variantCodon);
+            this.variantAA = variantAA.getCode();
+
+            this.changesAA = !refAA.equals(variantAA);
+
+            if (!this.variantCodon.equals(refCodon)) {
+                if (changesAA) {
+                    if (variantAA.isStop()) {
+                        functionalClass = "nonsense";
+                    }
+                    else if (refAA.isStop()) {
+                        functionalClass = "readthrough";
+                    }
+                    else {
+                        functionalClass = "missense";
+                    }
+                }
+                else { // the same aa:
+                    functionalClass = "silent";
+                }
+            }
+            else { // the same codon:
+                functionalClass = "no_change";
+            }
+
+            this.proteinCoordStr = "p." + refAA.getLetter() + codonIndex + variantAA.getLetter();
+        }
+
+        public String toString() {
+            StringBuilder sb = new StringBuilder();
+
+            sb.append(ALLELE_START);
+            sb.append(REFSEQ_VARIANT_CODON).append(ASSIGNMENT).append(variantCodon).append(CODON_INFO_DELIM);
+            sb.append(REFSEQ_VARIANT_AA).append(ASSIGNMENT).append(variantAA).append(CODON_INFO_DELIM);
+            sb.append(REFSEQ_CHANGES_AA).append(ASSIGNMENT).append(changesAA).append(CODON_INFO_DELIM);
+            sb.append(REFSEQ_FUNCTIONAL_CLASS).append(ASSIGNMENT).append(functionalClass).append(CODON_INFO_DELIM);
+            sb.append(REFSEQ_PROTEIN_COORD_DESCRIPTION).append(ASSIGNMENT).append(proteinCoordStr);
+            sb.append(ALLELE_END);
+
+            return sb.toString();
+        }
+    }
+}
+
+
+// External classes:
+
+class LocusToFeatures {
+    private Map<GenomeLoc, PositionalRefSeqFeatures> locusToFeatures;
+
+    public LocusToFeatures() {
+        this.locusToFeatures = new TreeMap<GenomeLoc, PositionalRefSeqFeatures>();
+    }
+
+    public PositionalRefSeqFeatures getLocusFeatures(GenomeLoc loc) {
+        return locusToFeatures.get(loc);
+    }
+
+    public void putLocusFeatures(GenomeLoc loc, AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) {
+        PositionalRefSeqFeatures locFeatures = locusToFeatures.get(loc);
+        if (locFeatures == null) {
+            locFeatures = new PositionalRefSeqFeatures(locusUsingThis);
+            locusToFeatures.put(loc, locFeatures);
+        }
+        locFeatures.putFeature(refSeqAnnotation, locusUsingThis);
+    }
+
+    public Set<Map.Entry<GenomeLoc, PositionalRefSeqFeatures>> entrySet() {
+        return locusToFeatures.entrySet();
+    }
+
+    public String toString() { // INTERNAL use only
+        StringBuilder sb = new StringBuilder();
+
+        for (Map.Entry<GenomeLoc, PositionalRefSeqFeatures> locFeatures : entrySet()) {
+            GenomeLoc loc = locFeatures.getKey();
+            PositionalRefSeqFeatures features = locFeatures.getValue();
+            sb.append("Locus: ").append(loc).append("\n").append(features);
+        }
+
+        return sb.toString();
+    }
+}
+
+class PositionalRefSeqFeatures {
+    private final static String[] REQUIRE_COLUMNS =
+            {AnnotateMNPsWalker.REFSEQ_NAME, AnnotateMNPsWalker.REFSEQ_POSITION_TYPE};
+
+    private Map<String, PositionalRefSeqFeature> nameToFeature;
+    private GenomeLoc furthestLocusUsingFeatures;
+
+    public PositionalRefSeqFeatures(GenomeLoc locusUsingThis) {
+        this.nameToFeature = new HashMap<String, PositionalRefSeqFeature>();
+        this.furthestLocusUsingFeatures = locusUsingThis;
+    }
+
+    public void putFeature(AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) {
+        for (String column : REQUIRE_COLUMNS) {
+            if (!refSeqAnnotation.containsColumnName(column))
+                throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column);
+        }
+
+        if (locusUsingThis.isPast(furthestLocusUsingFeatures))
+            furthestLocusUsingFeatures = locusUsingThis;
+
+        String posType = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_POSITION_TYPE);
+        if (!posType.equals(AnnotateMNPsWalker.REFSEQ_CDS)) // only interested in coding sequence annotations
+            return;
+
+        PositionalRefSeqFeature newLocusFeature = new PositionalRefSeqFeature(refSeqAnnotation);
+
+        String refSeqName = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME);
+        PositionalRefSeqFeature locusFeature = nameToFeature.get(refSeqName);
+        if (locusFeature == null) {
+            locusFeature = newLocusFeature;
+            nameToFeature.put(refSeqName, locusFeature);
+        }
+        else if (!locusFeature.equals(newLocusFeature)) {
+            throw new UserException("Inconsistency between previous RefSeq entry and: " + refSeqAnnotation);
+        }
+
+        locusFeature.updateFeature(refSeqAnnotation);
+    }
+
+    public GenomeLoc getFurthestLocusUsingFeatures() {
+        return furthestLocusUsingFeatures;
+    }
+
+    public Set<Map.Entry<String, PositionalRefSeqFeature>> entrySet() {
+        return nameToFeature.entrySet();
+    }
+
+    public String toString() { // INTERNAL use only
+        StringBuilder sb = new StringBuilder();
+
+        for (Map.Entry<String, PositionalRefSeqFeature> nameFeatureEntry : entrySet()) {
+            String name = nameFeatureEntry.getKey();
+            PositionalRefSeqFeature feature = nameFeatureEntry.getValue();
+            sb.append(name).append(" -> [").append(feature).append("]\n");
+        }
+
+        return sb.toString();
+    }
+}
+
+class PositionalRefSeqFeature {
+    private final static String[] REQUIRE_COLUMNS =
+            {AnnotateMNPsWalker.REFSEQ_NAME2, AnnotateMNPsWalker.REFSEQ_STRAND,
+                    AnnotateMNPsWalker.REFSEQ_CODON_COORD, AnnotateMNPsWalker.REFSEQ_CODING_FRAME,
+                    AnnotateMNPsWalker.REFSEQ_REF_CODON, AnnotateMNPsWalker.REFSEQ_REF_AA};
+
+    protected String name2;
+    protected boolean positiveStrand;
+    protected int codonCoord;
+    protected int codingFrame;
+    protected String referenceCodon;
+    protected String referenceAA;
+
+    private Map<String, BaseAnnotations> baseToAnnotations;
+
+    public PositionalRefSeqFeature(AnnotatorInputTableFeature refSeqAnnotation) {
+        for (String column : REQUIRE_COLUMNS) {
+            if (!refSeqAnnotation.containsColumnName(column))
+                throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column);
+        }
+        this.name2 = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME2);
+        this.positiveStrand = (refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_STRAND).equals(AnnotateMNPsWalker.REFSEQ_POS_STRAND));
+        this.codonCoord = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODON_COORD));
+        this.codingFrame = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODING_FRAME));
+        this.referenceCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_CODON);
+        this.referenceAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_AA);
+
+        this.baseToAnnotations = new HashMap<String, BaseAnnotations>();
+    }
+
+    public boolean equals(PositionalRefSeqFeature that) {
+        return this.name2.equals(that.name2) && this.positiveStrand == that.positiveStrand && this.codonCoord == that.codonCoord && this.codingFrame == that.codingFrame
+                && this.referenceCodon.equals(that.referenceCodon) && this.referenceAA.equals(that.referenceAA);
+    }
+
+    public void updateFeature(AnnotatorInputTableFeature refSeqAnnotation) {
+        if (!refSeqAnnotation.containsColumnName(AnnotateMNPsWalker.REFSEQ_ALT_BASE))
+            throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + AnnotateMNPsWalker.REFSEQ_ALT_BASE);
+        String base = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_ALT_BASE);
+
+        baseToAnnotations.put(base, new BaseAnnotations(refSeqAnnotation));
+    }
+
+    public String toString() { // INTERNAL use only
+        StringBuilder sb = new StringBuilder();
+
+        sb.append("name2= ").append(name2);
+        sb.append(", positiveStrand= ").append(positiveStrand);
+        sb.append(", codonCoord= ").append(codonCoord);
+        sb.append(", codingFrame= ").append(codingFrame);
+        sb.append(", referenceCodon= ").append(referenceCodon);
+        sb.append(", referenceAA= ").append(referenceAA);
+
+        sb.append(", baseAnnotations= {");
+        for (Map.Entry<String, BaseAnnotations> baseToAnnotationsEntry : baseToAnnotations.entrySet()) {
+            String base = baseToAnnotationsEntry.getKey();
+            BaseAnnotations annotations = baseToAnnotationsEntry.getValue();
+            sb.append(" ").append(base).append(" -> {").append(annotations).append("}");
+        }
+        sb.append(" }");
+
+        return sb.toString();
+    }
+}
+
+class BaseAnnotations {
+    private final static String[] REQUIRE_COLUMNS =
+            {AnnotateMNPsWalker.REFSEQ_VARIANT_CODON, AnnotateMNPsWalker.REFSEQ_VARIANT_AA,
+                    AnnotateMNPsWalker.REFSEQ_CHANGES_AA, AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS,
+                    AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION};
+
+    protected String variantCodon;
+    protected String variantAA;
+    protected boolean changesAA;
+    protected String functionalClass;
+    protected String proteinCoordStr;
+
+    public BaseAnnotations(AnnotatorInputTableFeature refSeqAnnotation) {
+        for (String column : REQUIRE_COLUMNS) {
+            if (!refSeqAnnotation.containsColumnName(column))
+                throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column);
+        }
+        this.variantCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_CODON);
+        this.variantAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_AA);
+        this.changesAA = Boolean.parseBoolean(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CHANGES_AA));
+        this.functionalClass = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS);
+        this.proteinCoordStr = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION);
+    }
+
+
+    public String toString() { // INTERNAL use only
+        StringBuilder sb = new StringBuilder();
+
+        sb.append("variantCodon= ").append(variantCodon);
+        sb.append(", variantAA= ").append(variantAA);
+        sb.append(", changesAA= ").append(changesAA);
+        sb.append(", functionalClass= ").append(functionalClass);
+        sb.append(", proteinCoordStr= ").append(proteinCoordStr);
+
+        return sb.toString();
+    }
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingAlternateAllelesVCFWriter.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingAlternateAllelesVCFWriter.java
@ -169,14 +169,8 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
                if (shouldMerge) {
                    numRecordsSatisfyingMergeRule++;
                    VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging);
-
                    if (mergedVc != null) {
                        mergedRecords = true;
-
-                        Map<String, Object> updatedAttribs = RefSeqData.getMergedRefSeqAttributes(vcfrWaitingToMerge.vc, vc);
-                        updatedAttribs.putAll(mergedVc.getAttributes());
-                        mergedVc = VariantContext.modifyAttributes(mergedVc, updatedAttribs);
-
                        vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase, true);
                        numMergedRecords++;
                    }
@ -446,132 +440,4 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
            return sb.toString();
        }
    }
-}
-
-
-/* Some methods for extracting and merging RefSeq-related data from annotated VCF INFO fields:
- */
-
-class RefSeqData {
-    private static String REFSEQ_PREFIX = "refseq.";
-
-    private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords";
-    private static String NAME_KEY = REFSEQ_PREFIX + "name";
-    private static String NAME2_KEY = REFSEQ_PREFIX + "name2";
-
-    private static String CODON_KEY = REFSEQ_PREFIX + "codonCoord";
-
-    private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
-        Map<String, String> entriesToNames = new HashMap<String, String>();
-
-        Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
-        if (numRecords != null) {
-            for (int i = 1; i <= numRecords; i++) {
-                String key = NAME_KEY + "_" + i;
-                String name = vc.getAttributeAsStringNoException(key);
-                if (name != null)
-                    entriesToNames.put(key, name);
-            }
-        }
-        else {
-            String name = vc.getAttributeAsStringNoException(NAME_KEY);
-            if (name != null) {
-                entriesToNames.put(NAME_KEY, name);
-            }
-            else { // Check all INFO fields for a match:
-                for (Map.Entry<String, Object> entry : vc.getAttributes().entrySet()) {
-                    String key = entry.getKey();
-                    if (getName2 && key.startsWith(NAME2_KEY))
-                        entriesToNames.put(key, entry.getValue().toString());
-                    else if (key.startsWith(NAME_KEY) && !key.startsWith(NAME2_KEY))
-                        entriesToNames.put(key, entry.getValue().toString());
-                }
-            }
-        }
-
-        return entriesToNames;
-    }
-
-    private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc) {
-        return getRefSeqEntriesToNames(vc, false);
-    }
-
-    public static Set<String> getRefSeqNames(VariantContext vc, boolean getName2) {
-        return new TreeSet<String>(getRefSeqEntriesToNames(vc, getName2).values());
-    }
-
-    public static Set<String> getRefSeqNames(VariantContext vc) {
-        return getRefSeqNames(vc, false);
-    }
-
-    public static Map<String, Object> getMergedRefSeqAttributes(VariantContext vc1, VariantContext vc2) {
-        Map<String, Object> refSeqAttribs = new HashMap<String, Object>();
-
-        List<RefSeqEntry> list1 = getAllRefSeqEntries(vc1);
-        List<RefSeqEntry> list2 = getAllRefSeqEntries(vc2);
-        boolean addSuffix = list1.size() > 1 || list2.size() > 1;
-        int count = 1;
-
-        for (RefSeqEntry refseq1 : list1) {
-            for (RefSeqEntry refseq2 : list2) {
-                Set<String> keys = new HashSet<String>();
-                keys.addAll(refseq1.info.keySet());
-                keys.addAll(refseq2.info.keySet());
-
-                String keySuffix = "";
-                if (addSuffix)
-                    keySuffix = "_" + count++;
-
-                Object name1 = refseq1.info.get(NAME_KEY);
-                Object name2 = refseq2.info.get(NAME_KEY);
-                boolean sameGene = name1 != null && name2 != null && name1.equals(name2);
-
-                for (String key : keys) {
-                    Object obj1 = refseq1.info.get(key);
-                    Object obj2 = refseq2.info.get(key);
-                    if (obj1 == null)
-                        obj1 = "";
-                    if (obj2 == null)
-                        obj2 = "";
-
-                    if (sameGene && key.equals(CODON_KEY) && obj1.equals(obj2)) // vc1 and vc2 have variants in the same codon in the same gene
-                        System.out.println(vc1.getChr() + ":" + vc1.getStart() + " --> CODON: obj1 = " + obj1);
-
-                    String useKey = key + keySuffix;
-                    String mergedVal = obj1 + "\\" + obj2;
-                    refSeqAttribs.put(useKey, mergedVal);
-                }
-            }
-        }
-
-        return refSeqAttribs;
-    }
-
-    private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
-        List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
-
-        for (Map.Entry<String, String> entryToName : getRefSeqEntriesToNames(vc).entrySet()) {
-            String entry = entryToName.getKey();
-            String entrySuffix = entry.replaceFirst(NAME_KEY, "");
-            allRefSeq.add(new RefSeqEntry(vc, entrySuffix));
-        }
-
-        return allRefSeq;
-    }
-
-    private static class RefSeqEntry {
-        public Map<String, Object> info;
-
-        public RefSeqEntry(VariantContext vc, String entrySuffix) {
-            this.info = new HashMap<String, Object>();
-
-            for (Map.Entry<String, Object> attribEntry : vc.getAttributes().entrySet()) {
-                String key = attribEntry.getKey();
-                if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) {
-                    String genericKey = key.replaceAll(entrySuffix, "");
-                    this.info.put(genericKey, attribEntry.getValue());
-                }
-            }
-        }
-    }
 }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java
@ -70,12 +70,12 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
    @Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false)
    protected boolean disablePrintAlternateAlleleStatistics = false;

-    public final static String IGNORE_CODING = "IGNORE";
-    public final static String UNION_CODING = "UNION";
-    public final static String INTERSECT_CODING = "INTERSECT";
+    public final static String IGNORE_REFSEQ = "IGNORE";
+    public final static String UNION_REFSEQ = "UNION";
+    public final static String INTERSECT_REFSEQ = "INTERSECT";

-    @Argument(fullName = "mergeBasedOnCodingAnnotation", shortName = "mergeBasedOnCodingAnnotation", doc = "'Should merging be performed if two sites lie on the same coding sequence in the INFO field {" + IGNORE_CODING + ", " + UNION_CODING + ", " + INTERSECT_CODING + "}; [default:"+ IGNORE_CODING + "]", required = false)
-    protected String mergeBasedOnCodingAnnotation = IGNORE_CODING;
+    @Argument(fullName = "mergeBasedOnRefSeqAnnotation", shortName = "mergeBasedOnRefSeqAnnotation", doc = "'Should merging be performed if two sites lie on the same RefSeq sequence in the INFO field {" + IGNORE_REFSEQ + ", " + UNION_REFSEQ + ", " + INTERSECT_REFSEQ + "}; [default:"+ IGNORE_REFSEQ + "]", required = false)
+    protected String mergeBasedOnRefSeqAnnotation = IGNORE_REFSEQ;

    private LinkedList<String> rodNames = null;

@ -90,10 +90,10 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
        GenomeLocParser genomeLocParser = getToolkit().getGenomeLocParser();

        MergeRule mergeRule = null;
-        if (mergeBasedOnCodingAnnotation.equals(IGNORE_CODING))
+        if (mergeBasedOnRefSeqAnnotation.equals(IGNORE_REFSEQ))
            mergeRule = new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser);
        else
-            mergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser, mergeBasedOnCodingAnnotation);
+            mergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser, mergeBasedOnRefSeqAnnotation);

        // false <-> don't take control of writer, since didn't create it:
        vcMergerWriter = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer,genomeLocParser, getToolkit().getArguments().referenceFile, mergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
@ -175,7 +175,7 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
 }


-enum MergeBasedOnCodingAnnotation {
+enum MergeBasedOnRefSeqAnnotation {
    UNION_WITH_DIST, INTERSECT_WITH_DIST
 }

@ -206,45 +206,45 @@ class DistanceMergeRule implements MergeRule {
 }

 class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule {
-    private MergeBasedOnCodingAnnotation mergeBasedOnCodingAnnotation;
+    private MergeBasedOnRefSeqAnnotation mergeBasedOnRefSeqAnnotation;

-    public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnCodingAnnotation) {
+    public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnRefSeqAnnotation) {
        super(maxGenomicDistanceForMNP, genomeLocParser);

-        if (mergeBasedOnCodingAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_CODING))
-            this.mergeBasedOnCodingAnnotation = MergeBasedOnCodingAnnotation.UNION_WITH_DIST;
-        else if (mergeBasedOnCodingAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_CODING))
-            this.mergeBasedOnCodingAnnotation = MergeBasedOnCodingAnnotation.INTERSECT_WITH_DIST;
+        if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ))
+            this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST;
+        else if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ))
+            this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST;
        else
-            throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_CODING + ", " + MergeSegregatingAlternateAllelesWalker.UNION_CODING + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_CODING + " as argument to mergeBasedOnCodingAnnotation!");
+            throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_REFSEQ + ", " + MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ + " as argument to mergeBasedOnRefSeqAnnotation!");
    }

    public boolean shouldMerge(VariantContext vc1, VariantContext vc2) {
        boolean withinDistance = super.shouldMerge(vc1, vc2);

-        if (mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.UNION_WITH_DIST)
+        if (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST)
            return withinDistance || sameGene(vc1, vc2);
-        else // mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.INTERSECT_WITH_DIST
+        else // mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST
            return withinDistance && sameGene(vc1, vc2);
    }

    private boolean sameGene(VariantContext vc1, VariantContext vc2) {
-        Set<String> names_vc1 = RefSeqData.getRefSeqNames(vc1);
-        Set<String> names_vc2 = RefSeqData.getRefSeqNames(vc2);
+        Set<String> names_vc1 = RefSeqDataParser.getRefSeqNames(vc1);
+        Set<String> names_vc2 = RefSeqDataParser.getRefSeqNames(vc2);
        names_vc1.retainAll(names_vc2);

        if (!names_vc1.isEmpty())
            return true;

        // Check refseq.name2:
-        Set<String> names2_vc1 = RefSeqData.getRefSeqNames(vc1, true);
-        Set<String> names2_vc2 = RefSeqData.getRefSeqNames(vc2, true);
+        Set<String> names2_vc1 = RefSeqDataParser.getRefSeqNames(vc1, true);
+        Set<String> names2_vc2 = RefSeqDataParser.getRefSeqNames(vc2, true);
        names2_vc1.retainAll(names2_vc2);

        return !names2_vc1.isEmpty();
    }

    public String toString() {
-        return super.toString() + " " + (mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
+        return super.toString() + " " + (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
    }
 }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
@ -543,7 +543,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
        }

        private class EdgeToReads {
-            private Map<GraphEdge, List<String>> edgeReads;
+            private TreeMap<GraphEdge, List<String>> edgeReads;

            public EdgeToReads() {
                this.edgeReads = new TreeMap<GraphEdge, List<String>>(); // implemented GraphEdge.compareTo()
@ -1196,7 +1196,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
        }

        public static PhasingTable marginalizeAsNewTable(PhasingTable table) {
-            Map<Haplotype, PreciseNonNegativeDouble> hapMap = new TreeMap<Haplotype, PreciseNonNegativeDouble>();
+            TreeMap<Haplotype, PreciseNonNegativeDouble> hapMap = new TreeMap<Haplotype, PreciseNonNegativeDouble>();
            for (PhasingTable.PhasingTableEntry pte : table) {
                Haplotype rep = pte.getHaplotypeClass().getRepresentative();
                PreciseNonNegativeDouble score = hapMap.get(rep);
@ -1383,7 +1383,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph

    private class MultipleBaseCountsWriter {
        private BufferedWriter writer = null;
-        private Map<SampleReadLocus, MultipleBaseCounts> multipleBaseCounts = null;
+        private TreeMap<SampleReadLocus, MultipleBaseCounts> multipleBaseCounts = null;

        public MultipleBaseCountsWriter(File outputMultipleBaseCountsFile) {
            FileOutputStream output;
--- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java
@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2010, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.phasing;
+
+import org.broad.tribble.util.variantcontext.VariantContext;
+import java.util.*;
+
+/* Some methods for extracting RefSeq-related data from annotated VCF INFO fields:
+ */
+public class RefSeqDataParser {
+    private static String REFSEQ_PREFIX = "refseq.";
+
+    private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords";
+    private static String NAME_KEY = REFSEQ_PREFIX + "name";
+    private static String NAME2_KEY = REFSEQ_PREFIX + "name2";
+
+    private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
+        String nameKeyToUse = getName2 ? NAME2_KEY : NAME_KEY;
+        String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
+
+        Map<String, String> entriesToNames = new HashMap<String, String>();
+        Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
+        if (numRecords != null) {
+            for (int i = 1; i <= numRecords; i++) {
+                String key = nameKeyToUseMultiplePrefix + i;
+                String name = vc.getAttributeAsStringNoException(key);
+                if (name != null)
+                    entriesToNames.put(key, name);
+            }
+        }
+        else { // no entry with the # of records:
+            String name = vc.getAttributeAsStringNoException(nameKeyToUse);
+            if (name != null) {
+                entriesToNames.put(nameKeyToUse, name);
+            }
+            else { // Check all INFO fields for a match (if there are multiple entries):
+                for (Map.Entry<String, Object> entry : vc.getAttributes().entrySet()) {
+                    String key = entry.getKey();
+                    if (key.startsWith(nameKeyToUseMultiplePrefix))
+                        entriesToNames.put(key, entry.getValue().toString());
+                }
+            }
+        }
+        return entriesToNames;
+    }
+
+    private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc) {
+        return getRefSeqEntriesToNames(vc, false);
+    }
+
+    public static Set<String> getRefSeqNames(VariantContext vc, boolean getName2) {
+        return new TreeSet<String>(getRefSeqEntriesToNames(vc, getName2).values());
+    }
+
+    public static Set<String> getRefSeqNames(VariantContext vc) {
+        return getRefSeqNames(vc, false);
+    }
+
+    // Returns a List of SEPARATE Map<refseq.ENTRY, refseq.VALUE> for EACH RefSeq annotation (i.e., each gene), stripping out the "_1", "_2", etc.
+    private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
+        List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
+
+        for (Map.Entry<String, String> entryToName : getRefSeqEntriesToNames(vc).entrySet()) {
+            String entry = entryToName.getKey();
+            String entrySuffix = entry.replaceFirst(NAME_KEY, "");
+            allRefSeq.add(new RefSeqEntry(vc, entrySuffix));
+        }
+
+        return allRefSeq;
+    }
+
+    private static class RefSeqEntry {
+        public Map<String, Object> info;
+
+        public RefSeqEntry(VariantContext vc, String entrySuffix) {
+            this.info = new HashMap<String, Object>();
+
+            for (Map.Entry<String, Object> attribEntry : vc.getAttributes().entrySet()) {
+                String key = attribEntry.getKey();
+                if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) {
+                    String genericKey = key.replaceAll(entrySuffix, "");
+                    this.info.put(genericKey, attribEntry.getValue());
+                }
+            }
+        }
+    }
+}