From e4e7d39e2c8e9cb6a21f5152f46e20d334f81df0 Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Thu, 23 May 2013 12:02:19 -0400
Subject: [PATCH 1/6] Fix FN problem stemming from sequence graphs that contain
 cycles.

Problem:
The sequence graphs can get very complex and it's not enough just to test that any given read has non-unique kmers.
Reads with variants can have kmers that match unique regions of the reference, and this causes cycles in the final
sequence graph.  Ultimately the problem is that kmers of 10/25 may not be large enough for these complex regions.

Solution:
We continue to try kmers of 10/25 but detect whether cycles exist; if so, we do not use them.  If (and only if) we
can't get usable graphs from the 10/25 kmers, then we start iterating over larger kmers until we either can generate
a graph without cycles or attempt too many iterations.
---
 .../haplotypecaller/HaplotypeCaller.java      |   6 +-
 .../readthreading/ReadThreadingAssembler.java | 145 +++++++++++-------
 .../readthreading/ReadThreadingGraph.java     |  18 ++-
 .../ReadThreadingGraphUnitTest.java           |  35 +++++
 4 files changed, 147 insertions(+), 57 deletions(-)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index e55413649..a41b68e2c 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -266,6 +266,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
     @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false)
     protected List<Integer> kmerSizes = Arrays.asList(10, 25);
 
+    @Advanced
+    @Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false)
+    protected boolean dontIncreaseKmerSizesForCycles = false;
+
     /**
      * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes.  Each haplotype
      * considered requires N PairHMM evaluations if there are N reads across all samples.  In order to control the
@@ -520,7 +524,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
         final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
         assemblyEngine = useDebruijnAssembler
                 ? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
-                : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes);
+                : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles);
 
         assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
         assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
index 123b36640..0887929ab 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
@@ -49,6 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
 import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
+import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.haplotype.Haplotype;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 
@@ -63,11 +64,14 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
 
     private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128;
     private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
+    private final static int KMER_SIZE_ITERATION_INCREASE = 10;
+    private final static int MAX_KMER_ITERATIONS_TO_ATTEMPT = 6;
 
     /** The min and max kmer sizes to try when building the graph. */
     private final List<Integer> kmerSizes;
     private final int maxAllowedPathsForReadThreadingAssembler;
 
+    private final boolean dontIncreaseKmerSizesForCycles;
     private boolean requireReasonableNumberOfPaths = false;
     protected boolean removePathsNotConnectedToRef = true;
     private boolean justReturnRawGraph = false;
@@ -77,10 +81,15 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
         this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25));
     }
 
-    public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
+    public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes, final boolean dontIncreaseKmerSizesForCycles) {
         super(maxAllowedPathsForReadThreadingAssembler);
         this.kmerSizes = kmerSizes;
         this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler;
+        this.dontIncreaseKmerSizesForCycles = dontIncreaseKmerSizesForCycles;
+    }
+
+    public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
+        this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, false);
     }
 
     /** for testing purposes */
@@ -89,67 +98,99 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
     }
 
     @Override
-    public List<SeqGraph> assemble( final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
+    public List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes) {
         final List<SeqGraph> graphs = new LinkedList<>();
 
+        // first, try using the requested kmer sizes
         for ( final int kmerSize : kmerSizes ) {
-            final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);
+            final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes);
+            if ( graph != null )
+                graphs.add(graph);
+        }
 
-            // add the reference sequence to the graph
-            rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
-
-            // add the artificial GGA haplotypes to the graph
-            int hapCount = 0;
-            for( final Haplotype h : activeAlleleHaplotypes ) {
-                final int[] counts = new int[h.length()];
-                Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
-                rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
-            }
-
-            // Next pull kmers out of every read and throw them on the graph
-            for( final GATKSAMRecord read : reads ) {
-                rtgraph.addRead(read);
-            }
-
-            // actually build the read threading graph
-            rtgraph.buildGraphIfNecessary();
-            printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
-
-            // go through and prune all of the chains where all edges have <= pruneFactor.  This must occur
-            // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
-            // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
-            rtgraph.pruneLowWeightChains(pruneFactor);
-
-            // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
-            // we can recover them by merging some N bases from the chain back into the reference uniquely, for
-            // N < kmerSize
-            if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
-
-            // remove all heading and trailing paths
-            if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
-
-            printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
-
-            final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
-
-            // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
-            if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph);
-
-            if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
-            printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
-            initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
-
-            final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
-            if ( seqGraph != null ) {
-                if ( ! requireReasonableNumberOfPaths || reasonableNumberOfPaths(seqGraph) ) {
-                    graphs.add(seqGraph);
-                }
+        // if none of those worked, iterate over larger sizes if allowed to do so
+        if ( graphs.isEmpty() && !dontIncreaseKmerSizesForCycles ) {
+            int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE;
+            int numIterations = 1;
+            while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
+                final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes);
+                if ( graph != null )
+                    graphs.add(graph);
+                kmerSize += KMER_SIZE_ITERATION_INCREASE;
+                numIterations++;
             }
         }
 
         return graphs;
     }
 
+    /**
+     * Creates the sequence graph for the given kmerSize
+     *
+     * @param reads            reads to use
+     * @param refHaplotype     reference haplotype
+     * @param kmerSize         kmer size
+     * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph
+     * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths)
+     */
+    protected SeqGraph createGraph(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final int kmerSize, final List<Haplotype> activeAlleleHaplotypes) {
+        final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);
+
+        // add the reference sequence to the graph
+        rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
+
+        // add the artificial GGA haplotypes to the graph
+        int hapCount = 0;
+        for ( final Haplotype h : activeAlleleHaplotypes ) {
+            final int[] counts = new int[h.length()];
+            Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
+            rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
+        }
+
+        // Next pull kmers out of every read and throw them on the graph
+        for( final GATKSAMRecord read : reads ) {
+            rtgraph.addRead(read);
+        }
+
+        // actually build the read threading graph
+        rtgraph.buildGraphIfNecessary();
+
+        // sanity check: make sure there are no cycles in the graph
+        if ( rtgraph.hasCycles() ) {
+            if ( debug ) logger.info("Not using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler because it contains a cycle");
+            return null;
+        }
+
+        printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
+
+        // go through and prune all of the chains where all edges have <= pruneFactor.  This must occur
+        // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
+        // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
+        rtgraph.pruneLowWeightChains(pruneFactor);
+
+        // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
+        // we can recover them by merging some N bases from the chain back into the reference uniquely, for
+        // N < kmerSize
+        if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
+
+        // remove all heading and trailing paths
+        if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
+
+        printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
+
+        final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
+
+        // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
+        if ( justReturnRawGraph ) return initialSeqGraph;
+
+        if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
+        printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
+        initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
+
+        final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
+        return ( seqGraph != null && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(seqGraph) ) ? null : seqGraph;
+    }
+
     /**
      * Did we find a reasonable number of paths in this graph?
      * @param graph
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
index 8e879377f..bbc1618ac 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
@@ -54,6 +54,7 @@ import org.broadinstitute.sting.utils.collections.Pair;
 import org.broadinstitute.sting.utils.collections.PrimitivePair;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.jgrapht.EdgeFactory;
+import org.jgrapht.alg.CycleDetector;
 
 import java.io.File;
 import java.util.*;
@@ -297,7 +298,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
     public void buildGraphIfNecessary() {
         if ( alreadyBuilt ) return;
 
-        // determine the kmer size we'll uses, and capture the set of nonUniques for that kmer size
+        // determine the kmer size we'll use, and capture the set of nonUniques for that kmer size
         final NonUniqueResult result = determineKmerSizeAndNonUniques(kmerSize, kmerSize);
         nonUniqueKmers = result.nonUniques;
 
@@ -321,6 +322,13 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
         alreadyBuilt = true;
     }
 
+    /**
+     * @return true if the graph has cycles, false otherwise
+     */
+    public boolean hasCycles() {
+        return new CycleDetector<>(this).detectCycles();
+    }
+
     public void recoverDanglingTails() {
         if ( ! alreadyBuilt )  throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
 
@@ -409,7 +417,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
     private Collection<Kmer> determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) {
         // count up occurrences of kmers within each read
         final KMerCounter counter = new KMerCounter(kmerSize);
-        for ( int i = 0; i <= seqForKmers.stop - kmerSize; i++ ) {
+        final int stopPosition = seqForKmers.stop - kmerSize;
+        for ( int i = 0; i <= stopPosition; i++ ) {
             final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize);
             counter.addKmer(kmer, 1);
         }
@@ -578,7 +587,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
 
         // none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in
         final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize);
-        MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false);
+        final MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false);
 
         if ( isRef && uniqueMergeVertex != null )
             throw new IllegalStateException("Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex);
@@ -590,7 +599,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
     }
 
     /**
-     * Get the longest stretch of high quality bases in read and pass that sequence to the graph
+     * Add the given read to the sequence graph.  Ultimately the read will get sent through addSequence(), but first
+     * this method ensures we only use high quality bases and accounts for reduced reads, etc.
      *
      * @param read a non-null read
      */
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java
index 10c1cc00d..340777513 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java
@@ -49,6 +49,11 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
 import org.broadinstitute.sting.BaseTest;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
+import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.testng.Assert;
 import org.testng.annotations.Test;
 
@@ -145,6 +150,36 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
         }
     }
 
+    @Test(enabled = !DEBUG)
+    public void testCyclesInGraph() {
+
+        // b37 20:12655200-12655850
+        final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
+
+        // SNP at 20:12655528 creates a cycle for small kmers
+        final String alt = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
+
+        final List<GATKSAMRecord> reads = new ArrayList<>();
+        for ( int index = 0; index < alt.length() - 100; index += 20 )
+            reads.add(ArtificialSAMUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M"));
+
+        // test that there are cycles detected for small kmer
+        final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25);
+        rtgraph25.addSequence("ref", ref.getBytes(), null, true);
+        for ( final GATKSAMRecord read : reads )
+            rtgraph25.addRead(read);
+        rtgraph25.buildGraphIfNecessary();
+        Assert.assertTrue(rtgraph25.hasCycles());
+
+        // test that there are no cycles detected for large kmer
+        final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75);
+        rtgraph75.addSequence("ref", ref.getBytes(), null, true);
+        for ( final GATKSAMRecord read : reads )
+            rtgraph75.addRead(read);
+        rtgraph75.buildGraphIfNecessary();
+        Assert.assertFalse(rtgraph75.hasCycles());
+    }
+
     // TODO -- update to use determineKmerSizeAndNonUniques directly
 //    @DataProvider(name = "KmerSizeData")
 //    public Object[][] makeKmerSizeDataProvider() {

From 77868d034f4e006b8e33d2e9bf39447b88790ba7 Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Thu, 30 May 2013 14:00:43 -0400
Subject: [PATCH 2/6] Do not allow the use of Ns in reads for graph
 construction.

Ns are treated as wildcards in the PairHMM so creating haplotypes with Ns gives them artificial advantages over other ones.
This was the cause of at least one FN where there were Ns at a SNP position.
---
 .../readthreading/ReadThreadingGraph.java     | 15 +++++++++-
 .../LocalAssemblyEngineUnitTest.java          |  2 +-
 .../ReadThreadingGraphUnitTest.java           | 30 ++++++++++++++++---
 3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
index bbc1618ac..ab6b17c35 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
@@ -50,6 +50,7 @@ import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
+import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.collections.Pair;
 import org.broadinstitute.sting.utils.collections.PrimitivePair;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@@ -611,7 +612,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
 
         int lastGood = -1; // the index of the last good base we've seen
         for( int end = 0; end <= sequence.length; end++ ) {
-            if ( end == sequence.length || qualities[end] < minBaseQualityToUseInAssembly ) {
+            if ( end == sequence.length || ! baseIsUsableForAssembly(sequence[end], qualities[end]) ) {
                 // the first good base is at lastGood, can be -1 if last base was bad
                 final int start = lastGood;
                 // the stop base is end - 1 (if we're not at the end of the sequence)
@@ -631,6 +632,18 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
         }
     }
 
+    /**
+     * Determines whether a base can safely be used for assembly.
+     * Currently disallows Ns and/or those with low quality
+     *
+     * @param base  the base under consideration
+     * @param qual  the quality of that base
+     * @return true if the base can be used for assembly, false otherwise
+     */
+    protected boolean baseIsUsableForAssembly(final byte base, final byte qual) {
+        return base != BaseUtils.Base.N.base && qual >= minBaseQualityToUseInAssembly;
+    }
+
     /**
      * Get the set of non-unique kmers in this graph.  For debugging purposes
      * @return a non-null set of kmers
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java
index 74361de1b..a74ce1c75 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java
@@ -251,7 +251,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
             for ( int snpPos = 0; snpPos < windowSize; snpPos++) {
                 if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) {
                     final byte[] altBases = ref.getBytes();
-                    altBases[snpPos] = 'N';
+                    altBases[snpPos] = altBases[snpPos] == 'A' ? (byte)'C' : (byte)'A';
                     final String alt = new String(altBases);
                     tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt});
                 }
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java
index 340777513..67ee52734 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java
@@ -48,10 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
 
 import org.broadinstitute.sting.BaseTest;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
-import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
-import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
 import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.haplotype.Haplotype;
 import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.testng.Assert;
@@ -180,7 +178,31 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
         Assert.assertFalse(rtgraph75.hasCycles());
     }
 
-    // TODO -- update to use determineKmerSizeAndNonUniques directly
+    @Test(enabled = !DEBUG)
+    public void testNsInReadsAreNotUsedForGraph() {
+
+        final int length = 100;
+        final byte[] ref = Utils.dupBytes((byte)'A', length);
+
+        final ReadThreadingGraph rtgraph = new ReadThreadingGraph(25);
+        rtgraph.addSequence("ref", ref, null, true);
+
+        // add reads with Ns at any position
+        for ( int i = 0; i < length; i++ ) {
+            final byte[] bases = ref.clone();
+            bases[i] = 'N';
+            final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte) 30, length), length + "M");
+            rtgraph.addRead(read);
+        }
+        rtgraph.buildGraphIfNecessary();
+
+        final SeqGraph graph = rtgraph.convertToSequenceGraph();
+        final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<>(false);
+        Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1);
+    }
+
+
+// TODO -- update to use determineKmerSizeAndNonUniques directly
 //    @DataProvider(name = "KmerSizeData")
 //    public Object[][] makeKmerSizeDataProvider() {
 //        List<Object[]> tests = new ArrayList<Object[]>();

From c0e3874db095836e389d4ff7c277fb778a54c7b8 Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Mon, 3 Jun 2013 14:34:29 -0400
Subject: [PATCH 3/6] Change the HC's phredScaledGlobalReadMismappingRate from
 60 to 45, because Ryan and Mark told me to.

---
 .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index a41b68e2c..24fd5901f 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -336,7 +336,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
      */
     @Advanced
     @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false)
-    protected int phredScaledGlobalReadMismappingRate = 60;
+    protected int phredScaledGlobalReadMismappingRate = 45;
 
     @Advanced
     @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false)

From c0030f3f2dd36d29d70b1fa06daf2e399e99169a Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Tue, 4 Jun 2013 09:26:50 -0400
Subject: [PATCH 4/6] We no longer subset down to the best N haplotypes for the
 GL calculation.

I explain in comments within the code that this was causing problems with the marginalization over events.
---
 .../haplotypecaller/HaplotypeCaller.java      | 26 +++++--------------
 .../haplotypecaller/LocalAssemblyEngine.java  |  2 +-
 2 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 24fd5901f..3e411ae33 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -694,11 +694,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
         //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
         final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) );
 
-        // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
-        final List<Haplotype> bestHaplotypes = selectBestHaplotypesForGenotyping(assemblyResult.haplotypes, stratifiedReadMap);
+        // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there
+        //  was a bad interaction between that selection and the marginalization that happens over each event when computing
+        //  GLs.  In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the
+        //  haplotype containing C as reference (and vice versa).  Now this is fine if all possible haplotypes are included
+        //  in the genotyping, but we lose information if we select down to a few haplotypes.  [EB]
 
         final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine,
-                bestHaplotypes,
+                assemblyResult.haplotypes,
                 stratifiedReadMap,
                 perSampleFilteredReadList,
                 assemblyResult.fullReferenceWithPadding,
@@ -711,7 +714,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
         // TODO -- must disable if we are doing NCT, or set the output type of ! presorted
         if ( bamWriter != null ) {
             haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc,
-                    bestHaplotypes,
+                    assemblyResult.haplotypes,
                     calledHaplotypes.getCalledHaplotypes(),
                     stratifiedReadMap);
         }
@@ -863,21 +866,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
         return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
     }
 
-    /**
-     * Select the best N haplotypes according to their likelihoods, if appropriate
-     *
-     * @param haplotypes a list of haplotypes to consider
-     * @param stratifiedReadMap a map from samples -> read likelihoods
-     * @return the list of haplotypes to genotype
-     */
-    protected List<Haplotype> selectBestHaplotypesForGenotyping(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
-        if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
-            return haplotypes;
-        } else {
-            return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation);
-        }
-    }
-
     //---------------------------------------------------------------------------------------------------------------
     //
     // reduce
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
index 1a5f34bc3..2a74e9dd0 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
@@ -215,7 +215,7 @@ public abstract class LocalAssemblyEngine {
                     returnHaplotypes.add(h);
 
                     if ( debug )
-                        logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
+                        logger.info("Adding haplotype " + h.getCigar() + " from graph with kmer " + graph.getKmerSize());
                 }
             }
         }

From dadcfe296dff64dae226aad1da57da2c512c3870 Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Wed, 5 Jun 2013 14:26:23 -0400
Subject: [PATCH 5/6] Reworking of the dangling tails merging code.

We now run Smith-Waterman on the dangling tail against the corresponding reference tail.
If we can generate a reasonable, low entropy alignment then we trigger the merge to the
reference path; otherwise we abort.  Also, we put in a check for low-complexity of graphs
and don't let those pass through.

Added tests for this implementation that checks exact SW results and correct edges added.
---
 .../haplotypecaller/graphs/BaseGraph.java     |  18 ++
 .../haplotypecaller/graphs/GraphUtils.java    |  10 +-
 .../walkers/haplotypecaller/graphs/Path.java  |   3 +-
 .../readthreading/ReadThreadingAssembler.java |  28 ++-
 .../readthreading/ReadThreadingGraph.java     | 207 ++++++++++++++----
 .../graphs/BaseGraphUnitTest.java             |  15 ++
 .../ReadThreadingAssemblerUnitTest.java       |   3 +-
 .../ReadThreadingGraphUnitTest.java           |  76 +++++++
 .../sting/utils/sam/AlignmentUtils.java       |  17 ++
 .../smithwaterman/SWPairwiseAlignment.java    |  15 ++
 .../utils/sam/AlignmentUtilsUnitTest.java     |   7 +
 11 files changed, 339 insertions(+), 60 deletions(-)

diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
index c963fb6e5..70ef539f3 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
@@ -676,6 +676,24 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
                 '}';
     }
 
+    /**
+     * The base sequence for the given path.
+     * Note, this assumes that the path does not start with a source node.
+     *
+     * @param path the list of vertexes that make up the path
+     * @return  non-null sequence of bases corresponding to the given path
+     */
+    @Ensures({"result != null"})
+    public byte[] getBasesForPath(final List<? extends DeBruijnVertex> path) {
+        if ( path == null ) throw new IllegalArgumentException("Path cannot be null");
+
+        final StringBuffer sb = new StringBuffer();
+        for ( final DeBruijnVertex v : path )
+            sb.append((char)v.getSuffix());
+
+        return sb.toString().getBytes();
+    }
+
     /**
      * Get the set of vertices within distance edges of source, regardless of edge direction
      *
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java
index 4aa6047a9..73a1daa3e 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java
@@ -171,7 +171,15 @@ final public class GraphUtils {
         return foundDup ? null : new PrimitivePair.Int(longestPos, length);
     }
 
-    private static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) {
+    /**
+     * calculates the longest suffix match between a sequence and a smaller kmer
+     *
+     * @param seq         the (reference) sequence
+     * @param kmer        the smaller kmer sequence
+     * @param seqStart    the index (inclusive) on seq to start looking backwards from
+     * @return the longest matching suffix
+     */
+    public static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) {
         for ( int len = 1; len <= kmer.length; len++ ) {
             final int seqI = seqStart - len + 1;
             final int kmerI = kmer.length - len;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java
index a07b98bb6..2e84e1d22 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java
@@ -47,7 +47,6 @@
 package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
 
 import com.google.java.contract.Ensures;
-import com.google.java.contract.Requires;
 import net.sf.samtools.Cigar;
 import net.sf.samtools.CigarElement;
 import net.sf.samtools.CigarOperator;
@@ -92,7 +91,7 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
     /**
      * Create a new Path containing no edges and starting at initialVertex
      * @param initialVertex the starting vertex of the path
-     * @param graph the graph this path with follow through
+     * @param graph the graph this path will follow through
      */
     public Path(final T initialVertex, final BaseGraph<T, E> graph) {
         if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null");
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
index 0887929ab..f4290f2bb 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
@@ -55,7 +55,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 
 import java.io.File;
 import java.util.Arrays;
-import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;
 
@@ -89,7 +88,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
     }
 
     public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
-        this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, false);
+        this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, true);
     }
 
     /** for testing purposes */
@@ -103,7 +102,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
 
         // first, try using the requested kmer sizes
         for ( final int kmerSize : kmerSizes ) {
-            final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes);
+            final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles);
             if ( graph != null )
                 graphs.add(graph);
         }
@@ -113,7 +112,8 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
             int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE;
             int numIterations = 1;
             while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
-                final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes);
+                // on the last attempt we will allow low complexity graphs
+                final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT);
                 if ( graph != null )
                     graphs.add(graph);
                 kmerSize += KMER_SIZE_ITERATION_INCREASE;
@@ -131,9 +131,14 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
      * @param refHaplotype     reference haplotype
      * @param kmerSize         kmer size
      * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph
-     * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths)
+     * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
+     * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
      */
-    protected SeqGraph createGraph(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final int kmerSize, final List<Haplotype> activeAlleleHaplotypes) {
+    protected SeqGraph createGraph(final List<GATKSAMRecord> reads,
+                                   final Haplotype refHaplotype,
+                                   final int kmerSize,
+                                   final List<Haplotype> activeAlleleHaplotypes,
+                                   final boolean allowLowComplexityGraphs) {
         final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);
 
         // add the reference sequence to the graph
@@ -157,7 +162,13 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
 
         // sanity check: make sure there are no cycles in the graph
         if ( rtgraph.hasCycles() ) {
-            if ( debug ) logger.info("Not using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler because it contains a cycle");
+            if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle");
+            return null;
+        }
+
+        // sanity check: make sure the graph had enough complexity with the given kmer
+        if ( ! allowLowComplexityGraphs && rtgraph.isLowComplexity() ) {
+            if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity");
             return null;
         }
 
@@ -169,8 +180,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
         rtgraph.pruneLowWeightChains(pruneFactor);
 
         // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
-        // we can recover them by merging some N bases from the chain back into the reference uniquely, for
-        // N < kmerSize
+        // we can recover them by merging some N bases from the chain back into the reference
         if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
 
         // remove all heading and trailing paths
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
index ab6b17c35..8d8cb83f6 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
@@ -46,14 +46,19 @@
 
 package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
 
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
 import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
 import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.collections.Pair;
-import org.broadinstitute.sting.utils.collections.PrimitivePair;
+import org.broadinstitute.sting.utils.sam.AlignmentUtils;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
+import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman;
 import org.jgrapht.EdgeFactory;
 import org.jgrapht.alg.CycleDetector;
 
@@ -80,9 +85,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
     /** for debugging info printing */
     private static int counter = 0;
 
-    /** we require at least this many bases to be uniquely matching to merge a dangling tail */
-    private final static int MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL = 5;
-
     /**
      * Sequences added for read threading before we've actually built the graph
      */
@@ -96,7 +98,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
     /**
      * A map from kmers -> their corresponding vertex in the graph
      */
-    private Map<Kmer, MultiDeBruijnVertex> uniqueKmers = new LinkedHashMap<Kmer, MultiDeBruijnVertex>();
+    private Map<Kmer, MultiDeBruijnVertex> uniqueKmers = new LinkedHashMap<>();
 
     /**
      *
@@ -113,8 +115,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
     // --------------------------------------------------------------------------------
     private Kmer refSource;
     private boolean alreadyBuilt;
-    byte[] refSeq;
-    MultiDeBruijnVertex[] refKmers;
 
     public ReadThreadingGraph() {
         this(25, false, (byte)6);
@@ -148,8 +148,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
         uniqueKmers.clear();
         refSource = null;
         alreadyBuilt = false;
-        refSeq = null;
-        refKmers = null;
     }
 
     /**
@@ -226,13 +224,10 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
 
         if ( debugGraphTransformations ) startingVertex.addRead(seqForKmers.name);
 
-        // keep track of information about the reference kmers for merging dangling tails
+        // keep track of information about the reference source
         if ( seqForKmers.isRef ) {
-            if ( refSource != null ) throw new IllegalStateException("Found two refSources! prev " + refSource + " new is " + startingVertex);
+            if ( refSource != null ) throw new IllegalStateException("Found two refSources! prev: " + refSource + ", new: " + startingVertex);
             refSource = new Kmer(seqForKmers.sequence, seqForKmers.start, kmerSize);
-            refSeq = seqForKmers.sequence;
-            refKmers = new MultiDeBruijnVertex[refSeq.length];
-            for ( int i = 0; i < kmerSize; i++ ) refKmers[i] = null;
         }
 
         // loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate
@@ -242,54 +237,161 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
 
             vertex = extendChainByOne(vertex, seqForKmers.sequence, i, count, seqForKmers.isRef);
             if ( debugGraphTransformations ) vertex.addRead(seqForKmers.name);
-
-            // keep track of the reference kmers for merging dangling tails
-            if ( seqForKmers.isRef ) refKmers[i + kmerSize - 1] = vertex;
         }
     }
 
     /**
-     * Attempt to attach vertex with out-degree == 0 to the graph by finding a unique matching kmer to the reference
+     * Class to keep track of the important dangling tail merging data
+     */
+    protected final class DanglingTailMergeResult {
+        final List<MultiDeBruijnVertex> danglingPath, referencePath;
+        final byte[] danglingPathString, referencePathString;
+        final Cigar cigar;
+
+        public DanglingTailMergeResult(final List<MultiDeBruijnVertex> danglingPath,
+                                       final List<MultiDeBruijnVertex> referencePath,
+                                       final byte[] danglingPathString,
+                                       final byte[] referencePathString,
+                                       final Cigar cigar) {
+            this.danglingPath = danglingPath;
+            this.referencePath = referencePath;
+            this.danglingPathString = danglingPathString;
+            this.referencePathString = referencePathString;
+            this.cigar = cigar;
+        }
+    }
+
+    /**
+     * Attempt to attach vertex with out-degree == 0 to the graph
+     *
      * @param vertex the vertex to recover
+     * @return 1 if we successfully recovered the vertex and 0 otherwise
      */
     protected int recoverDanglingChain(final MultiDeBruijnVertex vertex) {
         if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0");
 
-        final byte[] kmer = vertex.getSequence();
-        if ( ! nonUniqueKmers.contains(new Kmer(kmer)) ) {
-            // don't attempt to fix non-unique kmers!
-            final MultiDeBruijnVertex uniqueMergePoint = danglingTailMergePoint(kmer);
-            if ( uniqueMergePoint != null ) {
-                addEdge(vertex, uniqueMergePoint, new MultiSampleEdge(false, 1));
-                return 1;
-            }
-        }
+        // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths
+        final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex);
 
-        return 0;
+        // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path
+        if ( danglingTailMergeResult == null || ! cigarIsOkayToMerge(danglingTailMergeResult.cigar) )
+            return 0;
+
+        // merge
+        return mergeDanglingTail(danglingTailMergeResult);
     }
 
     /**
-     * Find a unique merge point for kmer in the reference sequence
-     * @param kmer the full kmer of the dangling tail
-     * @return a vertex appropriate to merge kmer into, or null if none could be found
+     * Determine whether the provided cigar is okay to merge into the reference path
+     *
+     * @param cigar    the cigar to analyze
+     * @return true if it's okay to merge, false otherwise
      */
-    private MultiDeBruijnVertex danglingTailMergePoint(final byte[] kmer) {
-        final PrimitivePair.Int endAndLength = GraphUtils.findLongestUniqueSuffixMatch(refSeq, kmer);
-        if ( endAndLength != null && endAndLength.second >= MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL && endAndLength.first + 1 < refKmers.length) {
-            final int len = endAndLength.second;
-            final MultiDeBruijnVertex mergePoint = refKmers[endAndLength.first + 1];
-//            logger.info("recoverDanglingChain of kmer " + new String(kmer) + " merged to " + mergePoint + " with match size " + len);
-            final Set<Kmer> nonUniquesAtLength = determineKmerSizeAndNonUniques(len, len).nonUniques;
-            final Kmer matchedKmer = new Kmer(kmer, kmer.length - len, len);
-            if ( nonUniquesAtLength.contains(matchedKmer) ) {
-//                logger.info("Rejecting merge " + new String(kmer) + " because match kmer " + matchedKmer + " isn't unique across all reads");
-                return null;
-            } else {
-                return mergePoint;
-            }
+    protected boolean cigarIsOkayToMerge(final Cigar cigar) {
+
+        final List<CigarElement> elements = cigar.getCigarElements();
+
+        // don't allow more than a couple of different ops
+        if ( elements.size() > 3 )
+            return false;
+
+        // the last element must be an M
+        if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.M )
+            return false;
+
+        // TODO -- do we want to check whether the Ms mismatch too much also?
+
+        return true;
+    }
+
+    /**
+     * Actually merge the dangling tail if possible
+     *
+     * @param danglingTailMergeResult   the result from generating a Cigar for the dangling tail against the reference
+     * @return 1 if merge was successful, 0 otherwise
+     */
+    protected int mergeDanglingTail(final DanglingTailMergeResult danglingTailMergeResult) {
+
+        final List<CigarElement> elements = danglingTailMergeResult.cigar.getCigarElements();
+        final CigarElement lastElement = elements.get(elements.size() - 1);
+        if ( lastElement.getOperator() != CigarOperator.M )
+            throw new IllegalArgumentException("The last Cigar element must be an M");
+
+        final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1;
+        final int matchingSuffix = Math.min(GraphUtils.longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength());
+        if ( matchingSuffix == 0 )
+            return 0;
+
+        final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0);
+        final int refIndexToMerge = lastRefIndex - matchingSuffix + 1;
+        addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), new MultiSampleEdge(false, 1));
+        return 1;
+    }
+
+    /**
+     * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
+     * provided vertex is the sink) and the reference path.
+     *
+     * @param vertex   the sink of the dangling tail
+     * @return a SmithWaterman object which can be null if no proper alignment could be generated
+     */
+    protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex) {
+
+        // find the lowest common ancestor path between vertex and the reference sink if available
+        final List<MultiDeBruijnVertex> altPath = findPathToLowestCommonAncestorOfReference(vertex);
+        if ( altPath == null )
+            return null;
+
+        // now get the reference path from the LCA
+        final List<MultiDeBruijnVertex> refPath = getReferencePath(altPath.get(0));
+
+        // create the Smith-Waterman strings to use
+        final byte[] refBases = getBasesForPath(refPath);
+        final byte[] altBases = getBasesForPath(altPath);
+
+        // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting)
+        final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWPairwiseAlignment.OVERHANG_STRATEGY.INDEL);
+        return new DanglingTailMergeResult(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar()));
+    }
+
+    /**
+     * Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex
+     *
+     * @param vertex   the original vertex
+     * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or
+     *  has an ancestor with multiple incoming edges before hitting the reference path
+     */
+    protected List<MultiDeBruijnVertex> findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex) {
+        final LinkedList<MultiDeBruijnVertex> path = new LinkedList<>();
+
+        MultiDeBruijnVertex v = vertex;
+        while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) {
+            path.addFirst(v);
+            v = getEdgeSource(incomingEdgeOf(v));
+        }
+        path.addFirst(v);
+
+        return isReferenceNode(v) ? path : null;
+    }
+
+    /**
+     * Finds the path downwards in the graph from this vertex to the reference sink, including this vertex
+     *
+     * @param start   the reference vertex to start from
+     * @return the path (non-null, non-empty)
+     */
+    protected List<MultiDeBruijnVertex> getReferencePath(final MultiDeBruijnVertex start) {
+        if ( ! isReferenceNode(start) ) throw new IllegalArgumentException("Cannot construct the reference path from a vertex that is not on that path");
+
+        final List<MultiDeBruijnVertex> path = new ArrayList<>();
+
+        MultiDeBruijnVertex v = start;
+        while ( v != null ) {
+            path.add(v);
+            v = getNextReferenceVertex(v);
         }
 
-        return null;
+        return path;
     }
 
     /**
@@ -330,6 +432,16 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
         return new CycleDetector<>(this).detectCycles();
     }
 
+    /**
+     * Does the graph not have enough complexity?  We define low complexity as a situation where the number
+     * of non-unique kmers is more than 20% of the total number of kmers.
+     *
+     * @return true if the graph has low complexity, false otherwise
+     */
+    public boolean isLowComplexity() {
+        return nonUniqueKmers.size() * 4 > uniqueKmers.size();
+    }
+
     public void recoverDanglingTails() {
         if ( ! alreadyBuilt )  throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
 
@@ -341,7 +453,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
                 nRecovered += recoverDanglingChain(v);
             }
         }
-        //logger.info("Recovered " + nRecovered + " of " + attempted + " dangling tails");
+
+        if ( debugGraphTransformations ) logger.info("Recovered " + nRecovered + " of " + attempted + " dangling tails");
     }
 
     /** structure that keeps track of the non-unique kmers for a given kmer size */
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java
index e57f5d6e0..f9cbc6c73 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java
@@ -312,4 +312,19 @@ public class BaseGraphUnitTest extends BaseTest {
 
         Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
     }
+
+    @Test(enabled = true)
+    public void testGetBases() {
+
+        final int kmerSize = 4;
+        final String testString = "AATGGGGGCAATACTA";
+
+        final List<DeBruijnVertex> vertexes = new ArrayList<>();
+        for ( int i = 0; i <= testString.length() - kmerSize; i++ ) {
+            vertexes.add(new DeBruijnVertex(testString.substring(i, i + kmerSize)));
+        }
+
+        final String result = new String(new DeBruijnGraph().getBasesForPath(vertexes));
+        Assert.assertEquals(result, testString.substring(kmerSize - 1));
+    }
 }
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java
index 3f10fc72c..8269b9c20 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java
@@ -83,7 +83,8 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest {
         }
 
         public SeqGraph assemble() {
-            assembler.removePathsNotConnectedToRef = false; // need to pass some of the tests
+            assembler.removePathsNotConnectedToRef = false; // needed to pass some of the tests
+            assembler.setRecoverDanglingTails(false); // needed to pass some of the tests
             assembler.setDebugGraphTransformations(true);
             final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.<Haplotype>emptyList()).get(0);
             if ( DEBUG ) graph.printGraph(new File("test.dot"), 0);
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java
index 67ee52734..ed91cccb3 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java
@@ -53,6 +53,7 @@ import org.broadinstitute.sting.utils.Utils;
 import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.testng.Assert;
+import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
 
 import java.io.File;
@@ -201,6 +202,81 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
         Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1);
     }
 
+    @DataProvider(name = "DanglingTails")
+    public Object[][] makeDanglingTailsData() {
+        List<Object[]> tests = new ArrayList<Object[]>();
+
+        // add 1M to the expected CIGAR because it includes the previous (common) base too
+        tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3});                  // incomplete haplotype
+        tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10});     // insertion
+        tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10});    // deletion
+        tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7});                // 1 snp
+        tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2});                // several snps
+        tests.add(new Object[]{"AAAAA", "C", "1M4D1M", true, -1});                     // funky SW alignment
+        tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", true, 1});                     // very little data
+        tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1});                 // ends in mismatch
+        tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0});       // alignment is too complex
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    @Test(dataProvider = "DanglingTails", enabled = !DEBUG)
+    public void testDanglingTails(final String refEnd,
+                                  final String altEnd,
+                                  final String cigar,
+                                  final boolean cigarIsGood,
+                                  final int mergePointDistanceFromSink) {
+
+        final int kmerSize = 15;
+
+        // construct the haplotypes
+        final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT";
+        final String ref = commonPrefix + refEnd;
+        final String alt = commonPrefix + altEnd;
+
+        // create the graph and populate it
+        final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize);
+        rtgraph.addSequence("ref", ref.getBytes(), null, true);
+        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M");
+        rtgraph.addRead(read);
+        rtgraph.buildGraphIfNecessary();
+
+        // confirm that we have just a single dangling tail
+        MultiDeBruijnVertex altSink = null;
+        for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) {
+            if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) {
+                Assert.assertTrue(altSink == null, "We found more than one non-reference sink");
+                altSink = v;
+            }
+        }
+
+        Assert.assertTrue(altSink != null, "We did not find a non-reference sink");
+
+        // confirm that the SW alignment agrees with our expectations
+        final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink);
+        Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString());
+
+        // confirm that the goodness of the cigar agrees with our expectations
+        Assert.assertEquals(rtgraph.cigarIsOkayToMerge(result.cigar), cigarIsGood);
+
+        // confirm that the tail merging works as expected
+        if ( cigarIsGood ) {
+            final int mergeResult = rtgraph.mergeDanglingTail(result);
+            Assert.assertTrue(mergeResult == 1 || mergePointDistanceFromSink == -1);
+
+            // confirm that we created the appropriate edge
+            if ( mergePointDistanceFromSink >= 0 ) {
+                MultiDeBruijnVertex v = altSink;
+                for ( int i = 0; i < mergePointDistanceFromSink; i++ ) {
+                    if ( rtgraph.inDegreeOf(v) != 1 )
+                        Assert.fail("Encountered vertex with multiple sources");
+                    v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v));
+                }
+                Assert.assertTrue(rtgraph.outDegreeOf(v) > 1);
+            }
+        }
+    }
+
 
 // TODO -- update to use determineKmerSizeAndNonUniques directly
 //    @DataProvider(name = "KmerSizeData")
diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
index fa35e3f53..762ce4858 100644
--- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
@@ -800,6 +800,23 @@ public final class AlignmentUtils {
         return new Cigar(elements);
     }
 
+    /**
+     * Removing a trailing deletion from the incoming cigar if present
+     *
+     * @param c the cigar we want to update
+     * @return a non-null Cigar
+     */
+    @Requires("c != null")
+    @Ensures("result != null")
+    public static Cigar removeTrailingDeletions(final Cigar c) {
+
+        final List<CigarElement> elements = c.getCigarElements();
+        if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.D )
+            return c;
+
+        return new Cigar(elements.subList(0, elements.size() - 1));
+    }
+
     /**
      * Move the indel in a given cigar string one base to the left
      *
diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java
index 84c33d4a5..1abf9f836 100644
--- a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java
+++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java
@@ -118,6 +118,21 @@ public class SWPairwiseAlignment implements SmithWaterman {
         align(seq1,seq2);
     }
 
+    /**
+     * Create a new SW pairwise aligner
+     *
+     * After creating the object the two sequences are aligned with an internal call to align(seq1, seq2)
+     *
+     * @param seq1 the first sequence we want to align
+     * @param seq2 the second sequence we want to align
+     * @param strategy   the overhang strategy to use
+     */
+    public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final OVERHANG_STRATEGY strategy) {
+        this(SWParameterSet.ORIGINAL_DEFAULT.parameters);
+        overhang_strategy = strategy;
+        align(seq1, seq2);
+    }
+
     /**
      * Create a new SW pairwise aligner, without actually doing any alignment yet
      *
diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java
index e7d54c460..fbf0242a3 100644
--- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java
@@ -1033,5 +1033,12 @@ public class AlignmentUtilsUnitTest {
         Assert.assertEquals(AlignmentUtils.startsOrEndsWithInsertionOrDeletion(TextCigarCodec.getSingleton().decode(cigar)), expected);
     }
 
+    @Test(dataProvider = "StartsOrEndsWithInsertionOrDeletionData", enabled = true)
+    public void testRemoveTrailingDeletions(final String cigar, final boolean expected) {
 
+        final Cigar originalCigar = TextCigarCodec.getSingleton().decode(cigar);
+        final Cigar newCigar = AlignmentUtils.removeTrailingDeletions(originalCigar);
+
+        Assert.assertEquals(originalCigar.equals(newCigar), !cigar.endsWith("D"));
+    }
 }

From 2c3c680eb704348cb8e20572f5b9d2fb3a5a986c Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Wed, 5 Jun 2013 12:22:14 -0400
Subject: [PATCH 6/6] Misc changes and cleanup from all previous commits in
 this push.

1. By default, do not include the UG CEU callset for assessment.
2. Updated md5s that are different now with all the HC changes.
---
 ...lexAndSymbolicVariantsIntegrationTest.java |  6 +++---
 .../HaplotypeCallerIntegrationTest.java       | 20 +++++++++----------
 ...aplotypeCallerParallelIntegrationTest.java |  2 +-
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
index fba294c3d..073d54ec5 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
@@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
 
     @Test
     public void testHaplotypeCallerMultiSampleComplex1() {
-        HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "8d7728909b1b8eb3f30f2f1583f054a8");
+        HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d21f15a5809fe5259af41ae6774af6f1");
     }
 
     private void HCTestSymbolicVariants(String bam, String args, String md5) {
@@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
     @Test
     public void testHaplotypeCallerMultiSampleGGAComplex() {
         HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
-                "db71826dc798ff1cdf0c5d05b0ede976");
+                "d4a0797c2fd4c103bf9a137633376156");
     }
 
     @Test
     public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
         HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
-                "42831d5463552911b7da9de0b4a27289");
+                "a9872228d0275a30f5a1f7e070a9c9f4");
     }
 }
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
index 904f15728..dbdd0afcd 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
@@ -78,12 +78,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
 
     @Test
     public void testHaplotypeCallerMultiSample() {
-        HCTest(CEUTRIO_BAM, "", "1b15e4647013ab2c3ce7073c420d8640");
+        HCTest(CEUTRIO_BAM, "", "e9167a1bfc0fc276586788d1ce1be408");
     }
 
     @Test
     public void testHaplotypeCallerSingleSample() {
-        HCTest(NA12878_BAM, "", "423be27dc2cf7fd10baf465cf93e18e2");
+        HCTest(NA12878_BAM, "", "b1d46afb9659ac3b92a3d131b58924ef");
     }
 
     @Test(enabled = false) // can't annotate the rsID's yet
@@ -94,7 +94,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
     @Test
     public void testHaplotypeCallerMultiSampleGGA() {
         HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
-                "a28e6f14e28708283d61c1e423bbdcb1");
+                "d83856b8136776bd731a8037c16b71fa");
     }
 
     @Test
@@ -110,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
 
     @Test
     public void testHaplotypeCallerSingleSampleIndelQualityScores() {
-        HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "8344d86751b707c53b296c297eba4bfa");
+        HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "70c4476816f5d35c9978c378dbeac09b");
     }
 
     private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@@ -147,7 +147,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
 
     @Test
     public void testHaplotypeCallerNearbySmallIntervals() {
-        HCTestNearbySmallIntervals(NA12878_BAM, "", "dea98f257d39fa1447a12c36a6bbf4a3");
+        HCTestNearbySmallIntervals(NA12878_BAM, "", "947aae309ecab7cd3f17ff9810884924");
     }
 
     // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@@ -157,14 +157,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
     @Test
     public void HCTestProblematicReadsModifiedInActiveRegions() {
         final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
-        final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("7cd1c5e2642ae8ddf38932aba1f51d69"));
+        final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
         executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
     }
 
     @Test
     public void HCTestStructuralIndels() {
         final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
-        final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ee55ff4c6ec1bbef88e21cc0f45d4c47"));
+        final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320"));
         executeTest("HCTestStructuralIndels: ", spec);
     }
 
@@ -186,7 +186,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
     public void HCTestReducedBam() {
         WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                 "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
-                Arrays.asList("4886a98bf699f4e7f4491160749ada6a"));
+                Arrays.asList("0124c4923d96ec0f8222b596dd4ef534"));
         executeTest("HC calling on a ReducedRead BAM", spec);
     }
 
@@ -194,7 +194,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
     public void testReducedBamWithReadsNotFullySpanningDeletion() {
         WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                 "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
-                Arrays.asList("86bdd07a3ac4f6ce239c30efea8bf5ba"));
+                Arrays.asList("0e020dcfdf249225714f5cd86ed3869f"));
         executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
     }
 
@@ -208,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
     public void HCTestDBSNPAnnotationWGS() {
         WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                 "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
-                Arrays.asList("7b23a288a31cafca3946f14f2381e7cb"));
+                Arrays.asList("446a786bb539f3ec2084dd75167568aa"));
         executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
     }
 
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java
index ff5a501cc..62e685eab 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java
@@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
         List<Object[]> tests = new ArrayList<Object[]>();
 
         for ( final int nct : Arrays.asList(1, 2, 4) ) {
-            tests.add(new Object[]{nct, "c277fd65365d59b734260dd8423313bb"});
+            tests.add(new Object[]{nct, "ef42a438b82681d1c0f921c57e16ff12"});
         }
 
         return tests.toArray(new Object[][]{});