As per Eric and Mark's suggestions, separated the segregating MNP merger (MergeMNPs) from the more general merger employed for annotation purposes (MergeSegregatingAlternateAlleles). Both use the same core MergePhasedSegregatingAlternateAllelesVCFWriter

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4763 348d0f76-0448-11de-a6fe-93d51630548a
2010-12-01 16:42:08 +00:00 · 2010-12-01 16:42:08 +00:00 · b4ef716aaf
parent 984e94521e
commit b4ef716aaf
8 changed files with 461 additions and 78 deletions
--- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java
+++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java
@ -701,13 +701,27 @@ public class VariantContextUtils {
        return genomeLocParser.createGenomeLoc(vc.getChr(),(int)vc.getStart(),(int)vc.getEnd());
    }

-    // NOTE: returns null if vc1 and vc2 are not mergeable into a single MNP record
-    public static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser,VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) {
+    public abstract static class AlleleMergeRule {
+        // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2):
+        abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2);
+
+        public String toString() {
+            return "all samples are mergeable";
+        }
+    }
+
+    // NOTE: returns null if vc1 and vc2 are not merged into a single MNP record
+
+    public static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) {
        if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2))
            return null;

-        // Check that it's logically possible to merge the VCs, and that there's a point in doing so (e.g., annotations could be changed):
-        if (!allSamplesAreMergeable(vc1, vc2) || !someSampleHasDoubleNonReferenceAllele(vc1, vc2))
+        // Check that it's logically possible to merge the VCs:
+        if (!allSamplesAreMergeable(vc1, vc2))
+            return null;
+
+        // Check if there's a "point" in merging the VCs (e.g., annotations could be changed)
+        if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2))
            return null;

        return reallyMergeIntoMNP(vc1, vc2, referenceFile);
@ -788,7 +802,7 @@ public class VariantContextUtils {
            if (!(other instanceof AlleleOneAndTwo))
                return false;

-            AlleleOneAndTwo otherAot = (AlleleOneAndTwo)other;
+            AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other;
            return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2));
        }
    }
@ -877,9 +891,9 @@ public class VariantContextUtils {
        return mergedAttribs;
    }

-    private static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser,VariantContext vc1, VariantContext vc2) {
-        GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser,vc1);
-        GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser,vc2);
+    private static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) {
+        GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1);
+        GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2);

        if (!loc1.onSameContig(loc2))
            throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome");
@ -910,6 +924,7 @@ public class VariantContextUtils {
    }

    // Assumes that vc1 and vc2 were already checked to have the same sample names:
+
    private static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) {
        // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1:
        for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) {
@ -948,6 +963,7 @@ public class VariantContextUtils {
    }

    // Assumes that alleleSegregationIsKnown(gt1, gt2):
+
    private static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) {
        if (gt2.genotypesArePhased() || gt2.isHom())
            return new PhaseAndQuality(gt1); // maintain the phase of gt1
@ -972,7 +988,8 @@ public class VariantContextUtils {
    /* Checks if any sample has a MNP of ALT alleles (segregating together):
     [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)]
     */
-    private static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
+
+    public static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
        for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) {
            String sample = gt1Entry.getKey();
            Genotype gt1 = gt1Entry.getValue();
@ -992,4 +1009,47 @@ public class VariantContextUtils {

        return false;
    }
+
+    /* Checks if all samples are consistent in their haplotypes:
+     [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)]
+     */
+
+    public static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) {
+        // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference):
+        Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>();
+        Map<Allele, Allele> allele2ToAllele1 = new HashMap<Allele, Allele>();
+
+        // Note the segregation of the alleles for the reference genome:
+        allele1ToAllele2.put(vc1.getReference(), vc2.getReference());
+        allele2ToAllele1.put(vc2.getReference(), vc1.getReference());
+
+        // Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples).
+        for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) {
+            String sample = gt1Entry.getKey();
+            Genotype gt1 = gt1Entry.getValue();
+            Genotype gt2 = vc2.getGenotype(sample);
+
+            List<Allele> site1Alleles = gt1.getAlleles();
+            List<Allele> site2Alleles = gt2.getAlleles();
+
+            Iterator<Allele> all2It = site2Alleles.iterator();
+            for (Allele all1 : site1Alleles) {
+                Allele all2 = all2It.next();
+
+                Allele all1To2 = allele1ToAllele2.get(all1);
+                if (all1To2 == null)
+                    allele1ToAllele2.put(all1, all2);
+                else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2
+                    return false;
+
+                Allele all2To1 = allele2ToAllele1.get(all2);
+                if (all2To1 == null)
+                    allele2ToAllele1.put(all2, all1);
+                else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1
+                    return false;
+            }
+        }
+
+        return true;
+    }
 }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java
@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2010, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.phasing;
+
+import org.broad.tribble.util.variantcontext.Allele;
+import org.broad.tribble.util.variantcontext.VariantContext;
+import org.broad.tribble.vcf.VCFHeader;
+import org.broad.tribble.vcf.VCFHeaderLine;
+import org.broad.tribble.vcf.VCFWriter;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.utils.vcf.VCFUtils;
+
+import java.util.*;
+
+import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods;
+
+
+/**
+ * Walks along all variant ROD loci, and merges consecutive sites if they segregate in all samples in the ROD.
+ */
+@Allows(value = {DataSource.REFERENCE})
+@Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class))
+@By(DataSource.REFERENCE_ORDERED_DATA)
+
+public class MergeMNPsWalker extends RodWalker<Integer, Integer> {
+
+    @Output(doc = "File to which variants should be written", required = true)
+    protected VCFWriter writer = null;
+    private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null;
+
+    @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false)
+    protected int maxGenomicDistanceForMNP = 1;
+
+    private LinkedList<String> rodNames = null;
+
+    public void initialize() {
+        rodNames = new LinkedList<String>();
+        rodNames.add("variant");
+
+        initializeVcfWriter();
+    }
+
+    private void initializeVcfWriter() {
+        // false <-> don't take control of writer, since didn't create it:
+        vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, false);
+        writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]
+
+        // setup the header fields:
+        Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
+        hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
+        hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
+
+        Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames);
+        vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples())));
+    }
+
+    public boolean generateExtendedEvents() {
+        return false;
+    }
+
+    public Integer reduceInit() {
+        return 0;
+    }
+
+    /**
+     * For each site, send it to be (possibly) merged with previously observed sites.
+     *
+     * @param tracker the meta-data tracker
+     * @param ref     the reference base
+     * @param context the context for the given locus
+     * @return dummy Integer
+     */
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        if (tracker == null)
+            return null;
+
+        boolean requireStartHere = true; // only see each VariantContext once
+        boolean takeFirstOnly = false; // take as many entries as the VCF file has
+        for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly))
+            writeVCF(vc);
+
+        return 0;
+    }
+
+    private void writeVCF(VariantContext vc) {
+        byte refBase;
+        if (!vc.isIndel()) {
+            Allele varAllele = vc.getReference();
+            refBase = SNPallelePair.getSingleBase(varAllele);
+        }
+        else {
+            refBase = vc.getReferenceBaseForIndel();
+        }
+
+        vcMergerWriter.add(vc, refBase);
+    }
+
+    public Integer reduce(Integer result, Integer total) {
+        if (result == null)
+            return total;
+
+        return total + result;
+    }
+
+    /**
+     * Release any VariantContexts not yet processed.
+     *
+     * @param result Empty for now...
+     */
+    public void onTraversalDone(Integer result) {
+        vcMergerWriter.close();
+
+        System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge());
+        System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule());
+        System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords());
+        System.out.println(vcMergerWriter.getAltAlleleStats());
+    }
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingAlternateAllelesVCFWriter.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingAlternateAllelesVCFWriter.java
@ -42,13 +42,15 @@ import java.util.*;

 // Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts

-public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWriter {
+public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
    private VCFWriter innerWriter;

    private GenomeLocParser genomeLocParser;

    private ReferenceSequenceFile referenceFileForMNPmerging;
-    private MergeRule mergeRule;
+
+    private VariantContextMergeRule vcMergeRule;
+    private VariantContextUtils.AlleleMergeRule alleleMergeRule;

    private String useSingleSample = null;

@ -67,11 +69,12 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
    // Should we call innerWriter.close() in close()
    private boolean takeOwnershipOfInner;

-    public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, MergeRule mergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
+    public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
        this.innerWriter = innerWriter;
        this.genomeLocParser = genomeLocParser;
        this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile);
-        this.mergeRule = mergeRule;
+        this.vcMergeRule = vcMergeRule;
+        this.alleleMergeRule = alleleMergeRule;
        this.useSingleSample = singleSample;
        this.emitOnlyMergedRecords = emitOnlyMergedRecords;

@ -87,8 +90,8 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
        this.takeOwnershipOfInner = takeOwnershipOfInner;
    }

-    public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, int maxGenomicDistanceForMNP, Logger logger) {
-        this(innerWriter, genomeLocParser, referenceFile, new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser), null, false, logger, false, false); // by default: consider all samples, emit all records, don't own inner, don't keep track of alt allele statistics
+    public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, int maxGenomicDistanceForMNP, Logger logger, boolean takeOwnershipOfInner) {
+        this(innerWriter, genomeLocParser, referenceFile, new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser), new SegregatingMNPmergeAllelesRule(), null, false, logger, takeOwnershipOfInner, true); // by default: consider all samples, emit all records, keep track of alt allele statistics
    }

    public void writeHeader(VCFHeader header) {
@ -146,7 +149,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
            }
            else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. So, attempt to merge them:
                numRecordsAttemptToMerge++;
-                boolean shouldMerge = mergeRule.shouldMerge(vcfrWaitingToMerge.vc, vc);
+                boolean shouldAttemptToMerge = vcMergeRule.shouldAttemptToMerge(vcfrWaitingToMerge.vc, vc);

                /*
                TODO: -- CONSIDER THE FOLLOWING EXAMPLE: WHAT DO WE WANT HERE??? --
@ -163,14 +166,20 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
                But, since we merged sites 1 and 2, we get that sites 1-2 and 3 are counted as two haplotypes of: ALT-REF and ALT-ALT
                 */
                if (altAlleleStats != null)
-                    altAlleleStats.updateSampleStats(vcfrWaitingToMerge.vc, vc, shouldMerge);
+                    altAlleleStats.updateSampleStats(vcfrWaitingToMerge.vc, vc, shouldAttemptToMerge);

                boolean mergedRecords = false;
-                if (shouldMerge) {
+                if (shouldAttemptToMerge) {
                    numRecordsSatisfyingMergeRule++;
-                    VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging);
+                    VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule);
+
                    if (mergedVc != null) {
                        mergedRecords = true;
+
+                        Map<String, Object> addedAttribs = vcMergeRule.addToMergedAttributes(vcfrWaitingToMerge.vc, vc);
+                        addedAttribs.putAll(mergedVc.getAttributes());
+                        mergedVc = VariantContext.modifyAttributes(mergedVc, addedAttribs);
+
                        vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase, true);
                        numMergedRecords++;
                    }
@ -213,8 +222,12 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
        return numMergedRecords;
    }

-    public MergeRule getMergeRule() {
-        return mergeRule;
+    public VariantContextMergeRule getVcMergeRule() {
+        return vcMergeRule;
+    }
+
+    public VariantContextUtils.AlleleMergeRule getAlleleMergeRule() {
+        return alleleMergeRule;
    }

    /**
@ -248,7 +261,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite

    private class AltAlleleStats {
        public int numSuccessiveGenotypes;
-        public int numSuccessiveGenotypesThatCouldBeMerged;
+        public int numSuccessiveGenotypesAttemptedToBeMerged;

        public int oneSampleMissing;
        public int atLeastOneSampleNotCalledOrFiltered;
@ -267,7 +280,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite

        public AltAlleleStats() {
            this.numSuccessiveGenotypes = 0;
-            this.numSuccessiveGenotypesThatCouldBeMerged = 0;
+            this.numSuccessiveGenotypesAttemptedToBeMerged = 0;

            this.oneSampleMissing = 0;
            this.atLeastOneSampleNotCalledOrFiltered = 0;
@ -292,11 +305,11 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
            sb.append("Not called or filtered:\t" + atLeastOneSampleNotCalledOrFiltered + "\n");

            sb.append("* Number of successive pairs of genotypes:\t" + numSuccessiveGenotypes + "\n");
-            sb.append("Number of successive pairs of genotypes with " + mergeRule + ":\t" + numSuccessiveGenotypesThatCouldBeMerged + "\n");
+            sb.append("Number of successive pairs of genotypes with " + vcMergeRule + ":\t" + numSuccessiveGenotypesAttemptedToBeMerged + "\n");

-            sb.append("Unknown segregation, " + mergeRule + ":\t" + segregationUnknown + "\n");
-            sb.append("Not variant at least one of pair, segregation known, " + mergeRule + ":\t" + eitherNotVariant + "\n");
-            sb.append("* Variant at both, segregation known, " + mergeRule + ":\t" + percentageString(bothInPairHaveVariant, numSuccessiveGenotypes) + "\n");
+            sb.append("Unknown segregation, " + vcMergeRule + ":\t" + segregationUnknown + "\n");
+            sb.append("Not variant at least one of pair, segregation known, " + vcMergeRule + ":\t" + eitherNotVariant + "\n");
+            sb.append("* Variant at both, segregation known, " + vcMergeRule + ":\t" + percentageString(bothInPairHaveVariant, numSuccessiveGenotypes) + "\n");

            sb.append("[Total haplotypes at pairs:\t" + (ref_ref_pair + ref_alt_pair + alt_ref_pair + alt_alt_pair) + "\n");
            sb.append("REF-REF:\t" + ref_ref_pair + "\n");
@ -326,7 +339,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
            this.sampleStats = new HashMap<String, AltAlleleStats>();
        }

-        public void updateSampleStats(VariantContext vc1, VariantContext vc2, boolean shouldMerge) {
+        public void updateSampleStats(VariantContext vc1, VariantContext vc2, boolean shouldAttemptToMerge) {
            if (vc1.isFiltered() || vc2.isFiltered())
                return;

@ -351,8 +364,8 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
                else {
                    aas.numSuccessiveGenotypes++;

-                    if (shouldMerge) {
-                        aas.numSuccessiveGenotypesThatCouldBeMerged++;
+                    if (shouldAttemptToMerge) {
+                        aas.numSuccessiveGenotypesAttemptedToBeMerged++;

                        if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) {
                            aas.segregationUnknown++;
@ -428,7 +441,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("-------------------------------------------------------------------------\n");
-            sb.append("Per-sample alternate allele statistics [" + mergeRule + "]\n");
+            sb.append("Per-sample alternate allele statistics [" + vcMergeRule + "]\n");
            sb.append("-------------------------------------------------------------------------");

            for (Map.Entry<String, AltAlleleStats> sampAltAllStatsEntry : sampleStats.entrySet()) {
@ -440,4 +453,66 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
            return sb.toString();
        }
    }
+}
+
+
+
+/*
+ External classes:
+ */
+
+abstract class VariantContextMergeRule {
+    abstract public boolean shouldAttemptToMerge(VariantContext vc1, VariantContext vc2);
+
+    public Map<String, Object> addToMergedAttributes(VariantContext vc1, VariantContext vc2) {
+        return new HashMap<String, Object>();
+    }
+}
+
+class DistanceMergeRule extends VariantContextMergeRule {
+    private int maxGenomicDistanceForMNP;
+    private GenomeLocParser genomeLocParser;
+
+    public DistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser) {
+        this.maxGenomicDistanceForMNP = maxGenomicDistanceForMNP;
+        this.genomeLocParser = genomeLocParser;
+    }
+
+    public boolean shouldAttemptToMerge(VariantContext vc1, VariantContext vc2) {
+        return minDistance(vc1, vc2) <= maxGenomicDistanceForMNP;
+    }
+
+    public String toString() {
+        return "Merge distance <= " + maxGenomicDistanceForMNP;
+    }
+
+    public int minDistance(VariantContext vc1, VariantContext vc2) {
+        return VariantContextUtils.getLocation(genomeLocParser, vc1).minDistance(VariantContextUtils.getLocation(genomeLocParser, vc2));
+    }
+}
+
+
+class ExistsDoubleAltAlleleMergeRule extends VariantContextUtils.AlleleMergeRule {
+    public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
+        return VariantContextUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2);
+    }
+
+    public String toString() {
+        return super.toString() + ", some sample has a MNP of ALT alleles";
+    }
+}
+
+class SegregatingMNPmergeAllelesRule extends ExistsDoubleAltAlleleMergeRule {
+    public SegregatingMNPmergeAllelesRule() {
+        super();
+    }
+
+    public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
+        // Must be interesting AND consistent:
+        return super.allelesShouldBeMerged(vc1, vc2) && VariantContextUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2);
+    }
+
+    public String toString() {
+        return super.toString() + ", all alleles segregate consistently";
+    }
 }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java
@ -45,7 +45,7 @@ import java.util.*;
 import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods;

 /**
- * Walks along all variant ROD loci, and merges consecutive sites if they segregate in all samples in the ROD.
+ * Walks along all variant ROD loci, and merges consecutive sites if some sample has segregating alt alleles in the ROD.
 */
@Allows(value = {DataSource.REFERENCE})
@Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class))
@ -55,10 +55,10 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I

    @Output(doc = "File to which variants should be written", required = true)
    protected VCFWriter writer = null;
-    private MergePhasedSegregatingAlternateAllelesVCFWriter vcMergerWriter = null;
+    private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null;

-    @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false)
-    protected int maxGenomicDistanceForMNP = 1;
+    @Argument(fullName = "maxGenomicDistance", shortName = "maxDist", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records; [default:1]", required = false)
+    protected int maxGenomicDistance = 1;

    @Argument(fullName = "useSingleSample", shortName = "useSample", doc = "Only output genotypes for the single sample given; [default:use all samples]", required = false)
    protected String useSingleSample = null;
@ -74,9 +74,12 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
    public final static String UNION_REFSEQ = "UNION";
    public final static String INTERSECT_REFSEQ = "INTERSECT";

-    @Argument(fullName = "mergeBasedOnRefSeqAnnotation", shortName = "mergeBasedOnRefSeqAnnotation", doc = "'Should merging be performed if two sites lie on the same RefSeq sequence in the INFO field {" + IGNORE_REFSEQ + ", " + UNION_REFSEQ + ", " + INTERSECT_REFSEQ + "}; [default:"+ IGNORE_REFSEQ + "]", required = false)
+    @Argument(fullName = "mergeBasedOnRefSeqAnnotation", shortName = "mergeBasedOnRefSeqAnnotation", doc = "'Should merging be performed if two sites lie on the same RefSeq sequence in the INFO field {" + IGNORE_REFSEQ + ", " + UNION_REFSEQ + ", " + INTERSECT_REFSEQ + "}; [default:" + IGNORE_REFSEQ + "]", required = false)
    protected String mergeBasedOnRefSeqAnnotation = IGNORE_REFSEQ;

+    @Argument(fullName = "dontRequireSomeSampleHasDoubleAltAllele", shortName = "dontRequireSomeSampleHasDoubleAltAllele", doc = "Should the requirement, that SUCCESSIVE records to be merged have at least one sample with a double alternate allele, be relaxed?; [default:false]", required = false)
+    protected boolean dontRequireSomeSampleHasDoubleAltAllele = false;
+
    private LinkedList<String> rodNames = null;

    public void initialize() {
@ -89,14 +92,20 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
    private void initializeVcfWriter() {
        GenomeLocParser genomeLocParser = getToolkit().getGenomeLocParser();

-        MergeRule mergeRule = null;
+        VariantContextMergeRule vcMergeRule;
        if (mergeBasedOnRefSeqAnnotation.equals(IGNORE_REFSEQ))
-            mergeRule = new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser);
+            vcMergeRule = new DistanceMergeRule(maxGenomicDistance, genomeLocParser);
        else
-            mergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser, mergeBasedOnRefSeqAnnotation);
+            vcMergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistance, genomeLocParser, mergeBasedOnRefSeqAnnotation);
+
+        VariantContextUtils.AlleleMergeRule alleleMergeRule;
+        if (dontRequireSomeSampleHasDoubleAltAllele) // if a pair of VariantContext passes the vcMergeRule, then always merge them:
+            alleleMergeRule = new AlwaysMergeAllelesRule();
+        else
+            alleleMergeRule = new ExistsDoubleAltAlleleMergeRule();

        // false <-> don't take control of writer, since didn't create it:
-        vcMergerWriter = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer,genomeLocParser, getToolkit().getArguments().referenceFile, mergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
+        vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, genomeLocParser, getToolkit().getArguments().referenceFile, vcMergeRule, alleleMergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
        writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]

        // setup the header fields:
@ -168,8 +177,8 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
            System.out.println("Only considered single sample: " + useSingleSample);

        System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge());
-        System.out.println("Number of potentially merged records (" + vcMergerWriter.getMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule());
-        System.out.println("Number of records merged [all samples are mergeable, some sample has a MNP of ALT alleles]: " + vcMergerWriter.getNumMergedRecords());
+        System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule());
+        System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords());
        System.out.println(vcMergerWriter.getAltAlleleStats());
    }
 }
@ -179,32 +188,6 @@ enum MergeBasedOnRefSeqAnnotation {
    UNION_WITH_DIST, INTERSECT_WITH_DIST
 }

-interface MergeRule {
-    public boolean shouldMerge(VariantContext vc1, VariantContext vc2);
-}
-
-class DistanceMergeRule implements MergeRule {
-    private int maxGenomicDistanceForMNP;
-    private GenomeLocParser genomeLocParser;
-
-    public DistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser) {
-        this.maxGenomicDistanceForMNP = maxGenomicDistanceForMNP;
-        this.genomeLocParser = genomeLocParser;
-    }
-
-    public boolean shouldMerge(VariantContext vc1, VariantContext vc2) {
-        return minDistance(vc1, vc2) <= maxGenomicDistanceForMNP;
-    }
-
-    public String toString() {
-        return "Merge distance <= " + maxGenomicDistanceForMNP;
-    }
-
-    public int minDistance(VariantContext vc1, VariantContext vc2) {
-        return VariantContextUtils.getLocation(genomeLocParser,vc1).minDistance(VariantContextUtils.getLocation(genomeLocParser,vc2));
-    }
-}
-
 class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule {
    private MergeBasedOnRefSeqAnnotation mergeBasedOnRefSeqAnnotation;

@ -219,8 +202,8 @@ class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule {
            throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_REFSEQ + ", " + MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ + " as argument to mergeBasedOnRefSeqAnnotation!");
    }

-    public boolean shouldMerge(VariantContext vc1, VariantContext vc2) {
-        boolean withinDistance = super.shouldMerge(vc1, vc2);
+    public boolean shouldAttemptToMerge(VariantContext vc1, VariantContext vc2) {
+        boolean withinDistance = super.shouldAttemptToMerge(vc1, vc2);

        if (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST)
            return withinDistance || sameGene(vc1, vc2);
@ -247,4 +230,18 @@ class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule {
    public String toString() {
        return super.toString() + " " + (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
    }
+
+    public Map<String, Object> addToMergedAttributes(VariantContext vc1, VariantContext vc2) {
+        Map<String, Object> addedAttribs = super.addToMergedAttributes(vc1, vc2);
+        addedAttribs.putAll(RefSeqDataParser.getMergedRefSeqNameAttributes(vc1, vc2));
+        return addedAttribs;
+    }
+}
+
+
+
+class AlwaysMergeAllelesRule extends VariantContextUtils.AlleleMergeRule {
+    public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
+        return true;
+    }
 }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
@ -138,8 +138,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
        // Wrapper VCFWriters will take ownership of inner writers iff: inner writer != origWriter [which wasn't created here]
        VCFWriter origWriter = writer;

-        if (enableMergePhasedSegregatingPolymorphismsToMNP) // null <-> use ALL samples, false <-> emit all records, false <-> don't track the statistics of alternate alleles being merged:
-            writer = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, new DistanceMergeRule(maxGenomicDistanceForMNP, getToolkit().getGenomeLocParser()), null, false, logger, writer != origWriter, false);
+        if (enableMergePhasedSegregatingPolymorphismsToMNP)
+            writer = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, writer != origWriter);

        /* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow
           Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow
--- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java
@ -25,6 +25,7 @@
 package org.broadinstitute.sting.gatk.walkers.phasing;

 import org.broad.tribble.util.variantcontext.VariantContext;
+
 import java.util.*;

 /* Some methods for extracting RefSeq-related data from annotated VCF INFO fields:
@ -36,6 +37,8 @@ public class RefSeqDataParser {
    private static String NAME_KEY = REFSEQ_PREFIX + "name";
    private static String NAME2_KEY = REFSEQ_PREFIX + "name2";

+    private static String[] NAME_KEYS = {NAME_KEY, NAME2_KEY};
+
    private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
        String nameKeyToUse = getName2 ? NAME2_KEY : NAME_KEY;
        String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
@ -78,6 +81,57 @@ public class RefSeqDataParser {
        return getRefSeqNames(vc, false);
    }

+    public static Map<String, Object> getMergedRefSeqNameAttributes(VariantContext vc1, VariantContext vc2) {
+        Map<String, Object> refSeqNameAttribs = new HashMap<String, Object>();
+
+        Map<String, RefSeqEntry> entriesMap1 = getAllRefSeqEntriesByName(vc1);
+        Map<String, RefSeqEntry> entriesMap2 = getAllRefSeqEntriesByName(vc2);
+
+        Set<String> commonNames = entriesMap1.keySet();
+        commonNames.retainAll(entriesMap2.keySet());
+        boolean addSuffix = commonNames.size() > 1;
+        int count = 1;
+
+        for (String name : commonNames) {
+            RefSeqEntry refseq1 = entriesMap1.get(name);
+            RefSeqEntry refseq2 = entriesMap2.get(name);
+
+            String keySuffix = "";
+            if (addSuffix)
+                keySuffix = "_" + count;
+
+            boolean added = false;
+            for (String key : NAME_KEYS) {
+                Object obj1 = refseq1.info.get(key);
+                Object obj2 = refseq2.info.get(key);
+                if (obj1 != null && obj2 != null && obj1.equals(obj2)) {
+                    added = true;
+                    String useKey = key + keySuffix;
+                    refSeqNameAttribs.put(useKey, obj1);
+                }
+            }
+            if (added)
+                count++;
+        }
+        if (count > 1)
+            refSeqNameAttribs.put(NUM_RECORDS_KEY, count - 1); // since incremented count one extra time
+
+        return refSeqNameAttribs;
+    }
+
+    private static Map<String, RefSeqEntry> getAllRefSeqEntriesByName(VariantContext vc) {
+        Map<String, RefSeqEntry> nameToEntries = new TreeMap<String, RefSeqEntry>();
+
+        List<RefSeqEntry> allEntries = getAllRefSeqEntries(vc);
+        for (RefSeqEntry entry : allEntries) {
+            Object name = entry.info.get(NAME_KEY);
+            if (name != null)
+                nameToEntries.put(name.toString(), entry);
+        }
+
+        return nameToEntries;
+    }
+
    // Returns a List of SEPARATE Map<refseq.ENTRY, refseq.VALUE> for EACH RefSeq annotation (i.e., each gene), stripping out the "_1", "_2", etc.
    private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
        List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
--- a/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java
@ -0,0 +1,51 @@
+package org.broadinstitute.sting.gatk.walkers.phasing;
+
+import org.broadinstitute.sting.WalkerTest;
+import org.testng.annotations.Test;
+
+import java.util.Arrays;
+
+public class MergeMNPsIntegrationTest extends WalkerTest {
+
+    public static String baseTestString(String reference, String VCF, int maxDistMNP) {
+        return "-T MergeMNPs" +
+                " -R " + reference +
+                " -B:variant,VCF " + validationDataLocation + VCF +
+                " --maxGenomicDistanceForMNP " + maxDistMNP +
+                " -o %s" +
+                " -NO_HEADER";
+    }
+
+
+    @Test
+    public void test1() {
+        WalkerTestSpec spec = new WalkerTestSpec(
+                baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1)
+                        + " -L chr20:556259-756570",
+                1,
+                Arrays.asList("19d0b2361367024bb9a83b9c15ef2453"));
+        executeTest("Merge MNP sites within genomic distance of 1 [TEST ONE]", spec);
+    }
+
+    @Test
+    public void test2() {
+        WalkerTestSpec spec = new WalkerTestSpec(
+                baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10)
+                        + " -L chr20:556259-756570",
+                1,
+                Arrays.asList("f25a6403579dab1395773b3ba365c327"));
+        executeTest("Merge MNP sites within genomic distance of 10 [TEST TWO]", spec);
+    }
+
+    @Test
+    public void test3() {
+        WalkerTestSpec spec = new WalkerTestSpec(
+                baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100)
+                        + " -L chr20:556259-756570",
+                1,
+                Arrays.asList("a064955ffeea7fc4e09512f3e9cdbb9e"));
+        executeTest("Merge MNP sites within genomic distance of 100 [TEST THREE]", spec);
+    }
+
+
+}
--- a/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java
@ -7,11 +7,11 @@ import java.util.Arrays;

 public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest {

-    public static String baseTestString(String reference, String VCF, int maxDistMNP) {
+    public static String baseTestString(String reference, String VCF, int maxDist) {
        return "-T MergeSegregatingAlternateAlleles" +
                " -R " + reference +
                " -B:variant,VCF " + validationDataLocation + VCF +
-                " --maxGenomicDistanceForMNP " + maxDistMNP +
+                " --maxGenomicDistance " + maxDist +
                " -o %s" +
                " -NO_HEADER";
    }
@ -24,7 +24,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest
                        + " -L chr20:556259-756570",
                1,
                Arrays.asList("e6a14fc97dbd0aaa8e6a4d9a7f1616a6"));
-        executeTest("Merge MNP het sites within genomic distance of 1 [TEST ONE]", spec);
+        executeTest("Merge sites within genomic distance of 1 [TEST ONE]", spec);
    }

    @Test
@ -34,7 +34,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest
                        + " -L chr20:556259-756570",
                1,
                Arrays.asList("cc2b45c85a51b4998e30758c48f61940"));
-        executeTest("Merge MNP het sites within genomic distance of 10 [TEST TWO]", spec);
+        executeTest("Merge sites within genomic distance of 10 [TEST TWO]", spec);
    }

    @Test
@ -44,8 +44,8 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest
                        + " -L chr20:556259-756570",
                1,
                Arrays.asList("47300cc7a5a7d84b3c279f04c4567739"));
-        executeTest("Merge MNP het sites within genomic distance of 100 [TEST THREE]", spec);
+        executeTest("Merge sites within genomic distance of 100 [TEST THREE]", spec);
    }


-}
+}