Reworking of the dangling tails merging code.

We now run Smith-Waterman on the dangling tail against the corresponding reference tail. If we can generate a reasonable, low entropy alignment then we trigger the merge to the reference path; otherwise we abort. Also, we put in a check for low-complexity of graphs and don't let those pass through. Added tests for this implementation that checks exact SW results and correct edges added.
2013-06-05 14:26:23 -04:00 · 2013-06-05 14:26:23 -04:00 · dadcfe296d
parent c0030f3f2d
commit dadcfe296d
11 changed files with 339 additions and 60 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
@ -676,6 +676,24 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
                '}';
    }

+    /**
+     * The base sequence for the given path.
+     * Note, this assumes that the path does not start with a source node.
+     *
+     * @param path the list of vertexes that make up the path
+     * @return  non-null sequence of bases corresponding to the given path
+     */
+    @Ensures({"result != null"})
+    public byte[] getBasesForPath(final List<? extends DeBruijnVertex> path) {
+        if ( path == null ) throw new IllegalArgumentException("Path cannot be null");
+
+        final StringBuffer sb = new StringBuffer();
+        for ( final DeBruijnVertex v : path )
+            sb.append((char)v.getSuffix());
+
+        return sb.toString().getBytes();
+    }
+
    /**
     * Get the set of vertices within distance edges of source, regardless of edge direction
     *
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java
@ -171,7 +171,15 @@ final public class GraphUtils {
        return foundDup ? null : new PrimitivePair.Int(longestPos, length);
    }

-    private static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) {
+    /**
+     * calculates the longest suffix match between a sequence and a smaller kmer
+     *
+     * @param seq         the (reference) sequence
+     * @param kmer        the smaller kmer sequence
+     * @param seqStart    the index (inclusive) on seq to start looking backwards from
+     * @return the longest matching suffix
+     */
+    public static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) {
        for ( int len = 1; len <= kmer.length; len++ ) {
            final int seqI = seqStart - len + 1;
            final int kmerI = kmer.length - len;
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java
@ -47,7 +47,6 @@
 package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;

 import com.google.java.contract.Ensures;
-import com.google.java.contract.Requires;
 import net.sf.samtools.Cigar;
 import net.sf.samtools.CigarElement;
 import net.sf.samtools.CigarOperator;
@ -92,7 +91,7 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
    /**
     * Create a new Path containing no edges and starting at initialVertex
     * @param initialVertex the starting vertex of the path
-     * @param graph the graph this path with follow through
+     * @param graph the graph this path will follow through
     */
    public Path(final T initialVertex, final BaseGraph<T, E> graph) {
        if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null");
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
@ -55,7 +55,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

 import java.io.File;
 import java.util.Arrays;
-import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;

@ -89,7 +88,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
    }

    public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
-        this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, false);
+        this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, true);
    }

    /** for testing purposes */
@ -103,7 +102,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {

        // first, try using the requested kmer sizes
        for ( final int kmerSize : kmerSizes ) {
-            final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes);
+            final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles);
            if ( graph != null )
                graphs.add(graph);
        }
@ -113,7 +112,8 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
            int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE;
            int numIterations = 1;
            while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
-                final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes);
+                // on the last attempt we will allow low complexity graphs
+                final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT);
                if ( graph != null )
                    graphs.add(graph);
                kmerSize += KMER_SIZE_ITERATION_INCREASE;
@ -131,9 +131,14 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
     * @param refHaplotype     reference haplotype
     * @param kmerSize         kmer size
     * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph
-     * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths)
+     * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
+     * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
     */
-    protected SeqGraph createGraph(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final int kmerSize, final List<Haplotype> activeAlleleHaplotypes) {
+    protected SeqGraph createGraph(final List<GATKSAMRecord> reads,
+                                   final Haplotype refHaplotype,
+                                   final int kmerSize,
+                                   final List<Haplotype> activeAlleleHaplotypes,
+                                   final boolean allowLowComplexityGraphs) {
        final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);

        // add the reference sequence to the graph
@ -157,7 +162,13 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {

        // sanity check: make sure there are no cycles in the graph
        if ( rtgraph.hasCycles() ) {
-            if ( debug ) logger.info("Not using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler because it contains a cycle");
+            if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle");
+            return null;
+        }
+
+        // sanity check: make sure the graph had enough complexity with the given kmer
+        if ( ! allowLowComplexityGraphs && rtgraph.isLowComplexity() ) {
+            if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity");
            return null;
        }

@ -169,8 +180,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
        rtgraph.pruneLowWeightChains(pruneFactor);

        // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
-        // we can recover them by merging some N bases from the chain back into the reference uniquely, for
-        // N < kmerSize
+        // we can recover them by merging some N bases from the chain back into the reference
        if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();

        // remove all heading and trailing paths
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
@ -46,14 +46,19 @@

 package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;

+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
 import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.collections.Pair;
-import org.broadinstitute.sting.utils.collections.PrimitivePair;
+import org.broadinstitute.sting.utils.sam.AlignmentUtils;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
+import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman;
 import org.jgrapht.EdgeFactory;
 import org.jgrapht.alg.CycleDetector;

@ -80,9 +85,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
    /** for debugging info printing */
    private static int counter = 0;

-    /** we require at least this many bases to be uniquely matching to merge a dangling tail */
-    private final static int MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL = 5;
-
    /**
     * Sequences added for read threading before we've actually built the graph
     */
@ -96,7 +98,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
    /**
     * A map from kmers -> their corresponding vertex in the graph
     */
-    private Map<Kmer, MultiDeBruijnVertex> uniqueKmers = new LinkedHashMap<Kmer, MultiDeBruijnVertex>();
+    private Map<Kmer, MultiDeBruijnVertex> uniqueKmers = new LinkedHashMap<>();

    /**
     *
@ -113,8 +115,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
    // --------------------------------------------------------------------------------
    private Kmer refSource;
    private boolean alreadyBuilt;
-    byte[] refSeq;
-    MultiDeBruijnVertex[] refKmers;

    public ReadThreadingGraph() {
        this(25, false, (byte)6);
@ -148,8 +148,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
        uniqueKmers.clear();
        refSource = null;
        alreadyBuilt = false;
-        refSeq = null;
-        refKmers = null;
    }

    /**
@ -226,13 +224,10 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp

        if ( debugGraphTransformations ) startingVertex.addRead(seqForKmers.name);

-        // keep track of information about the reference kmers for merging dangling tails
+        // keep track of information about the reference source
        if ( seqForKmers.isRef ) {
-            if ( refSource != null ) throw new IllegalStateException("Found two refSources! prev " + refSource + " new is " + startingVertex);
+            if ( refSource != null ) throw new IllegalStateException("Found two refSources! prev: " + refSource + ", new: " + startingVertex);
            refSource = new Kmer(seqForKmers.sequence, seqForKmers.start, kmerSize);
-            refSeq = seqForKmers.sequence;
-            refKmers = new MultiDeBruijnVertex[refSeq.length];
-            for ( int i = 0; i < kmerSize; i++ ) refKmers[i] = null;
        }

        // loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate
@ -242,54 +237,161 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp

            vertex = extendChainByOne(vertex, seqForKmers.sequence, i, count, seqForKmers.isRef);
            if ( debugGraphTransformations ) vertex.addRead(seqForKmers.name);
-
-            // keep track of the reference kmers for merging dangling tails
-            if ( seqForKmers.isRef ) refKmers[i + kmerSize - 1] = vertex;
        }
    }

    /**
-     * Attempt to attach vertex with out-degree == 0 to the graph by finding a unique matching kmer to the reference
+     * Class to keep track of the important dangling tail merging data
+     */
+    protected final class DanglingTailMergeResult {
+        final List<MultiDeBruijnVertex> danglingPath, referencePath;
+        final byte[] danglingPathString, referencePathString;
+        final Cigar cigar;
+
+        public DanglingTailMergeResult(final List<MultiDeBruijnVertex> danglingPath,
+                                       final List<MultiDeBruijnVertex> referencePath,
+                                       final byte[] danglingPathString,
+                                       final byte[] referencePathString,
+                                       final Cigar cigar) {
+            this.danglingPath = danglingPath;
+            this.referencePath = referencePath;
+            this.danglingPathString = danglingPathString;
+            this.referencePathString = referencePathString;
+            this.cigar = cigar;
+        }
+    }
+
+    /**
+     * Attempt to attach vertex with out-degree == 0 to the graph
+     *
     * @param vertex the vertex to recover
+     * @return 1 if we successfully recovered the vertex and 0 otherwise
     */
    protected int recoverDanglingChain(final MultiDeBruijnVertex vertex) {
        if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0");

-        final byte[] kmer = vertex.getSequence();
-        if ( ! nonUniqueKmers.contains(new Kmer(kmer)) ) {
-            // don't attempt to fix non-unique kmers!
-            final MultiDeBruijnVertex uniqueMergePoint = danglingTailMergePoint(kmer);
-            if ( uniqueMergePoint != null ) {
-                addEdge(vertex, uniqueMergePoint, new MultiSampleEdge(false, 1));
-                return 1;
-            }
-        }
+        // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths
+        final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex);

-        return 0;
+        // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path
+        if ( danglingTailMergeResult == null || ! cigarIsOkayToMerge(danglingTailMergeResult.cigar) )
+            return 0;
+
+        // merge
+        return mergeDanglingTail(danglingTailMergeResult);
    }

    /**
-     * Find a unique merge point for kmer in the reference sequence
-     * @param kmer the full kmer of the dangling tail
-     * @return a vertex appropriate to merge kmer into, or null if none could be found
+     * Determine whether the provided cigar is okay to merge into the reference path
+     *
+     * @param cigar    the cigar to analyze
+     * @return true if it's okay to merge, false otherwise
     */
-    private MultiDeBruijnVertex danglingTailMergePoint(final byte[] kmer) {
-        final PrimitivePair.Int endAndLength = GraphUtils.findLongestUniqueSuffixMatch(refSeq, kmer);
-        if ( endAndLength != null && endAndLength.second >= MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL && endAndLength.first + 1 < refKmers.length) {
-            final int len = endAndLength.second;
-            final MultiDeBruijnVertex mergePoint = refKmers[endAndLength.first + 1];
-//            logger.info("recoverDanglingChain of kmer " + new String(kmer) + " merged to " + mergePoint + " with match size " + len);
-            final Set<Kmer> nonUniquesAtLength = determineKmerSizeAndNonUniques(len, len).nonUniques;
-            final Kmer matchedKmer = new Kmer(kmer, kmer.length - len, len);
-            if ( nonUniquesAtLength.contains(matchedKmer) ) {
-//                logger.info("Rejecting merge " + new String(kmer) + " because match kmer " + matchedKmer + " isn't unique across all reads");
-                return null;
-            } else {
-                return mergePoint;
-            }
+    protected boolean cigarIsOkayToMerge(final Cigar cigar) {
+
+        final List<CigarElement> elements = cigar.getCigarElements();
+
+        // don't allow more than a couple of different ops
+        if ( elements.size() > 3 )
+            return false;
+
+        // the last element must be an M
+        if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.M )
+            return false;
+
+        // TODO -- do we want to check whether the Ms mismatch too much also?
+
+        return true;
+    }
+
+    /**
+     * Actually merge the dangling tail if possible
+     *
+     * @param danglingTailMergeResult   the result from generating a Cigar for the dangling tail against the reference
+     * @return 1 if merge was successful, 0 otherwise
+     */
+    protected int mergeDanglingTail(final DanglingTailMergeResult danglingTailMergeResult) {
+
+        final List<CigarElement> elements = danglingTailMergeResult.cigar.getCigarElements();
+        final CigarElement lastElement = elements.get(elements.size() - 1);
+        if ( lastElement.getOperator() != CigarOperator.M )
+            throw new IllegalArgumentException("The last Cigar element must be an M");
+
+        final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1;
+        final int matchingSuffix = Math.min(GraphUtils.longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength());
+        if ( matchingSuffix == 0 )
+            return 0;
+
+        final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0);
+        final int refIndexToMerge = lastRefIndex - matchingSuffix + 1;
+        addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), new MultiSampleEdge(false, 1));
+        return 1;
+    }
+
+    /**
+     * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
+     * provided vertex is the sink) and the reference path.
+     *
+     * @param vertex   the sink of the dangling tail
+     * @return a SmithWaterman object which can be null if no proper alignment could be generated
+     */
+    protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex) {
+
+        // find the lowest common ancestor path between vertex and the reference sink if available
+        final List<MultiDeBruijnVertex> altPath = findPathToLowestCommonAncestorOfReference(vertex);
+        if ( altPath == null )
+            return null;
+
+        // now get the reference path from the LCA
+        final List<MultiDeBruijnVertex> refPath = getReferencePath(altPath.get(0));
+
+        // create the Smith-Waterman strings to use
+        final byte[] refBases = getBasesForPath(refPath);
+        final byte[] altBases = getBasesForPath(altPath);
+
+        // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting)
+        final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWPairwiseAlignment.OVERHANG_STRATEGY.INDEL);
+        return new DanglingTailMergeResult(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar()));
+    }
+
+    /**
+     * Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex
+     *
+     * @param vertex   the original vertex
+     * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or
+     *  has an ancestor with multiple incoming edges before hitting the reference path
+     */
+    protected List<MultiDeBruijnVertex> findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex) {
+        final LinkedList<MultiDeBruijnVertex> path = new LinkedList<>();
+
+        MultiDeBruijnVertex v = vertex;
+        while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) {
+            path.addFirst(v);
+            v = getEdgeSource(incomingEdgeOf(v));
+        }
+        path.addFirst(v);
+
+        return isReferenceNode(v) ? path : null;
+    }
+
+    /**
+     * Finds the path downwards in the graph from this vertex to the reference sink, including this vertex
+     *
+     * @param start   the reference vertex to start from
+     * @return the path (non-null, non-empty)
+     */
+    protected List<MultiDeBruijnVertex> getReferencePath(final MultiDeBruijnVertex start) {
+        if ( ! isReferenceNode(start) ) throw new IllegalArgumentException("Cannot construct the reference path from a vertex that is not on that path");
+
+        final List<MultiDeBruijnVertex> path = new ArrayList<>();
+
+        MultiDeBruijnVertex v = start;
+        while ( v != null ) {
+            path.add(v);
+            v = getNextReferenceVertex(v);
        }

-        return null;
+        return path;
    }

    /**
@ -330,6 +432,16 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
        return new CycleDetector<>(this).detectCycles();
    }

+    /**
+     * Does the graph not have enough complexity?  We define low complexity as a situation where the number
+     * of non-unique kmers is more than 20% of the total number of kmers.
+     *
+     * @return true if the graph has low complexity, false otherwise
+     */
+    public boolean isLowComplexity() {
+        return nonUniqueKmers.size() * 4 > uniqueKmers.size();
+    }
+
    public void recoverDanglingTails() {
        if ( ! alreadyBuilt )  throw new IllegalStateException("recoverDanglingTails requires the graph be already built");

@ -341,7 +453,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
                nRecovered += recoverDanglingChain(v);
            }
        }
-        //logger.info("Recovered " + nRecovered + " of " + attempted + " dangling tails");
+
+        if ( debugGraphTransformations ) logger.info("Recovered " + nRecovered + " of " + attempted + " dangling tails");
    }

    /** structure that keeps track of the non-unique kmers for a given kmer size */
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java
@ -312,4 +312,19 @@ public class BaseGraphUnitTest extends BaseTest {

        Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
    }
+
+    @Test(enabled = true)
+    public void testGetBases() {
+
+        final int kmerSize = 4;
+        final String testString = "AATGGGGGCAATACTA";
+
+        final List<DeBruijnVertex> vertexes = new ArrayList<>();
+        for ( int i = 0; i <= testString.length() - kmerSize; i++ ) {
+            vertexes.add(new DeBruijnVertex(testString.substring(i, i + kmerSize)));
+        }
+
+        final String result = new String(new DeBruijnGraph().getBasesForPath(vertexes));
+        Assert.assertEquals(result, testString.substring(kmerSize - 1));
+    }
 }
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java
@ -83,7 +83,8 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest {
        }

        public SeqGraph assemble() {
-            assembler.removePathsNotConnectedToRef = false; // need to pass some of the tests
+            assembler.removePathsNotConnectedToRef = false; // needed to pass some of the tests
+            assembler.setRecoverDanglingTails(false); // needed to pass some of the tests
            assembler.setDebugGraphTransformations(true);
            final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.<Haplotype>emptyList()).get(0);
            if ( DEBUG ) graph.printGraph(new File("test.dot"), 0);
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java
@ -53,6 +53,7 @@ import org.broadinstitute.sting.utils.Utils;
 import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.testng.Assert;
+import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;

 import java.io.File;
@ -201,6 +202,81 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
        Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1);
    }

+    @DataProvider(name = "DanglingTails")
+    public Object[][] makeDanglingTailsData() {
+        List<Object[]> tests = new ArrayList<Object[]>();
+
+        // add 1M to the expected CIGAR because it includes the previous (common) base too
+        tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3});                  // incomplete haplotype
+        tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10});     // insertion
+        tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10});    // deletion
+        tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7});                // 1 snp
+        tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2});                // several snps
+        tests.add(new Object[]{"AAAAA", "C", "1M4D1M", true, -1});                     // funky SW alignment
+        tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", true, 1});                     // very little data
+        tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1});                 // ends in mismatch
+        tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0});       // alignment is too complex
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    @Test(dataProvider = "DanglingTails", enabled = !DEBUG)
+    public void testDanglingTails(final String refEnd,
+                                  final String altEnd,
+                                  final String cigar,
+                                  final boolean cigarIsGood,
+                                  final int mergePointDistanceFromSink) {
+
+        final int kmerSize = 15;
+
+        // construct the haplotypes
+        final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT";
+        final String ref = commonPrefix + refEnd;
+        final String alt = commonPrefix + altEnd;
+
+        // create the graph and populate it
+        final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize);
+        rtgraph.addSequence("ref", ref.getBytes(), null, true);
+        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M");
+        rtgraph.addRead(read);
+        rtgraph.buildGraphIfNecessary();
+
+        // confirm that we have just a single dangling tail
+        MultiDeBruijnVertex altSink = null;
+        for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) {
+            if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) {
+                Assert.assertTrue(altSink == null, "We found more than one non-reference sink");
+                altSink = v;
+            }
+        }
+
+        Assert.assertTrue(altSink != null, "We did not find a non-reference sink");
+
+        // confirm that the SW alignment agrees with our expectations
+        final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink);
+        Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString());
+
+        // confirm that the goodness of the cigar agrees with our expectations
+        Assert.assertEquals(rtgraph.cigarIsOkayToMerge(result.cigar), cigarIsGood);
+
+        // confirm that the tail merging works as expected
+        if ( cigarIsGood ) {
+            final int mergeResult = rtgraph.mergeDanglingTail(result);
+            Assert.assertTrue(mergeResult == 1 || mergePointDistanceFromSink == -1);
+
+            // confirm that we created the appropriate edge
+            if ( mergePointDistanceFromSink >= 0 ) {
+                MultiDeBruijnVertex v = altSink;
+                for ( int i = 0; i < mergePointDistanceFromSink; i++ ) {
+                    if ( rtgraph.inDegreeOf(v) != 1 )
+                        Assert.fail("Encountered vertex with multiple sources");
+                    v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v));
+                }
+                Assert.assertTrue(rtgraph.outDegreeOf(v) > 1);
+            }
+        }
+    }
+

 // TODO -- update to use determineKmerSizeAndNonUniques directly
 //    @DataProvider(name = "KmerSizeData")
--- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
@ -800,6 +800,23 @@ public final class AlignmentUtils {
        return new Cigar(elements);
    }

+    /**
+     * Removing a trailing deletion from the incoming cigar if present
+     *
+     * @param c the cigar we want to update
+     * @return a non-null Cigar
+     */
+    @Requires("c != null")
+    @Ensures("result != null")
+    public static Cigar removeTrailingDeletions(final Cigar c) {
+
+        final List<CigarElement> elements = c.getCigarElements();
+        if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.D )
+            return c;
+
+        return new Cigar(elements.subList(0, elements.size() - 1));
+    }
+
    /**
     * Move the indel in a given cigar string one base to the left
     *
--- a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java
+++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java
@ -118,6 +118,21 @@ public class SWPairwiseAlignment implements SmithWaterman {
        align(seq1,seq2);
    }

+    /**
+     * Create a new SW pairwise aligner
+     *
+     * After creating the object the two sequences are aligned with an internal call to align(seq1, seq2)
+     *
+     * @param seq1 the first sequence we want to align
+     * @param seq2 the second sequence we want to align
+     * @param strategy   the overhang strategy to use
+     */
+    public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final OVERHANG_STRATEGY strategy) {
+        this(SWParameterSet.ORIGINAL_DEFAULT.parameters);
+        overhang_strategy = strategy;
+        align(seq1, seq2);
+    }
+
    /**
     * Create a new SW pairwise aligner, without actually doing any alignment yet
     *
--- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java
@ -1033,5 +1033,12 @@ public class AlignmentUtilsUnitTest {
        Assert.assertEquals(AlignmentUtils.startsOrEndsWithInsertionOrDeletion(TextCigarCodec.getSingleton().decode(cigar)), expected);
    }

+    @Test(dataProvider = "StartsOrEndsWithInsertionOrDeletionData", enabled = true)
+    public void testRemoveTrailingDeletions(final String cigar, final boolean expected) {

+        final Cigar originalCigar = TextCigarCodec.getSingleton().decode(cigar);
+        final Cigar newCigar = AlignmentUtils.removeTrailingDeletions(originalCigar);
+
+        Assert.assertEquals(originalCigar.equals(newCigar), !cigar.endsWith("D"));
+    }
 }