From c9f5b53efa8307add66b4f1fc1d689a0818db443 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Jun 2013 14:36:54 -0400 Subject: [PATCH] Bugfix for HC can fail to assemble the correct reference sequence in some cases -- Ultimately this was caused by overly aggressive merging of CommonSuffixMerger. In the case where you have this graph: ACT [ref source] -> C G -> ACT -> C we would merge into G -> ACT -> C which would linearlize into GACTC Causing us to add bases to the reference source node that couldn't be recovered. The solution was to ensure that CommonSuffixMerger only operates when all nodes to be merged aren't source nodes themselves. -- Added a convenient argument to the haplotype caller (captureAssemblyFailureBAM) that will write out the exact reads to a BAM file that went into a failed assembly run (going to a file called AssemblyFailure.BAM). This can be used to rerun the haplotype caller to produce the exact error, which can be hard in regions of deep coverage where the downsampler state determines the exact reads going into assembly and therefore makes running with a sub-interval not reproduce the error -- Did some misc. cleanup of code while debugging -- [delivers #50917729] --- .../haplotypecaller/HaplotypeCaller.java | 30 ++++++--- .../haplotypecaller/LocalAssemblyEngine.java | 38 +++++++----- .../haplotypecaller/graphs/BaseGraph.java | 61 +++++++++++++++++++ .../haplotypecaller/graphs/SeqGraph.java | 17 ++++-- .../graphs/SharedSequenceMerger.java | 8 ++- .../readthreading/ReadThreadingAssembler.java | 6 +- .../graphs/CommonSuffixMergerUnitTest.java | 16 +++++ 7 files changed, 147 insertions(+), 29 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index e0a755c7b..73367f8c3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -50,6 +50,7 @@ import com.google.java.contract.Ensures; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileWriter; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; @@ -387,6 +388,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) protected boolean dontUseSoftClippedBases = false; + @Hidden + @Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false) + protected boolean captureAssemblyFailureBAM = false; + @Hidden @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) protected boolean allowCyclesInKmerGraphToGeneratePaths = false; @@ -751,13 +756,24 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc); - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); - - if ( ! dontTrimActiveRegions ) { - return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc); - } else { - // we don't want to trim active regions, so go ahead and use the old one - return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true); + try { + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); + if ( ! dontTrimActiveRegions ) { + return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc); + } else { + // we don't want to trim active regions, so go ahead and use the old one + return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true); + } + } catch ( Exception e ) { + // Capture any exception that might be thrown, and write out the assembly failure BAM if requested + if ( captureAssemblyFailureBAM ) { + final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5); + for ( final GATKSAMRecord read : activeRegion.getReads() ) { + writer.addAlignment(read); + } + writer.close(); + } + throw e; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index 3a377409c..1a5f34bc3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -78,6 +78,10 @@ import java.util.*; public abstract class LocalAssemblyEngine { private final static Logger logger = Logger.getLogger(LocalAssemblyEngine.class); + /** + * If false, we will only write out a region around the reference source + */ + private final static boolean PRINT_FULL_GRAPH_FOR_DEBUGGING = true; public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8; private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30; @@ -252,20 +256,26 @@ public abstract class LocalAssemblyEngine { return false; } - protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) { - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); + /** + * Print graph to file if debugGraphTransformations is enabled + * @param graph the graph to print + * @param file the destination file + */ + protected void printDebugGraphTransform(final BaseGraph graph, final File file) { + if ( debugGraphTransformations ) { + if ( PRINT_FULL_GRAPH_FOR_DEBUGGING ) + graph.printGraph(file, pruneFactor); + else + graph.subsetToRefSource().printGraph(file, pruneFactor); + } + } + + protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) { + printDebugGraphTransform(seqGraph, new File("sequenceGraph.1.dot")); - // TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm - // TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect - // TODO -- to anything from one that's actually has good support along the chain but just happens - // TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately - // TODO -- the pruning algorithm really should be an error correction algorithm that knows more - // TODO -- about the structure of the data and can differentiate between an infrequent path but - // TODO -- without evidence against it (such as occurs when a region is hard to get any reads through) - // TODO -- from a error with lots of weight going along another similar path // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive seqGraph.zipLinearChains(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); + printDebugGraphTransform(seqGraph, new File("sequenceGraph.2.zipped.dot")); // now go through and prune the graph, removing vertices no longer connected to the reference chain // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight @@ -273,9 +283,9 @@ public abstract class LocalAssemblyEngine { seqGraph.pruneGraph(pruneFactor); seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor); + printDebugGraphTransform(seqGraph, new File("sequenceGraph.3.pruned.dot")); seqGraph.simplifyGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor); + printDebugGraphTransform(seqGraph, new File("sequenceGraph.4.merged.dot")); // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can // happen in cases where for example the reference somehow manages to acquire a cycle, or @@ -294,7 +304,7 @@ public abstract class LocalAssemblyEngine { seqGraph.addVertex(dummy); seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); } - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor); + printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot")); return seqGraph; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index 8938af7c2..c963fb6e5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -388,6 +388,17 @@ public class BaseGraph extends Default return s; } + /** + * Get the set of vertices connected to v by incoming or outgoing edges + * @param v a non-null vertex + * @return a set of vertices {X} connected X -> v or v -> Y + */ + public Set neighboringVerticesOf(final V v) { + final Set s = incomingVerticesOf(v); + s.addAll(outgoingVerticesOf(v)); + return s; + } + /** * Print out the graph in the dot language for visualization * @param destination File to write to @@ -664,4 +675,54 @@ public class BaseGraph extends Default "kmerSize=" + kmerSize + '}'; } + + /** + * Get the set of vertices within distance edges of source, regardless of edge direction + * + * @param source the source vertex to consider + * @param distance the distance + * @return a set of vertices within distance of source + */ + protected Set verticesWithinDistance(final V source, final int distance) { + if ( distance == 0 ) + return Collections.singleton(source); + + final Set found = new HashSet<>(); + found.add(source); + for ( final V v : neighboringVerticesOf(source) ) { + found.addAll(verticesWithinDistance(v, distance - 1)); + } + + return found; + } + + /** + * Get a graph containing only the vertices within distance edges of target + * @param target a vertex in graph + * @param distance the max distance + * @return a non-null graph + */ + public BaseGraph subsetToNeighbors(final V target, final int distance) { + if ( target == null ) throw new IllegalArgumentException("Target cannot be null"); + if ( ! containsVertex(target) ) throw new IllegalArgumentException("Graph doesn't contain vertex " + target); + if ( distance < 0 ) throw new IllegalArgumentException("Distance must be >= 0 but got " + distance); + + + final Set toKeep = verticesWithinDistance(target, distance); + final Set toRemove = new HashSet<>(vertexSet()); + toRemove.removeAll(toKeep); + + final BaseGraph result = (BaseGraph)clone(); + result.removeAllVertices(toRemove); + + return result; + } + + /** + * Get a subgraph of graph that contains only vertices within 10 edges of the ref source vertex + * @return a non-null subgraph of this graph + */ + public BaseGraph subsetToRefSource() { + return subsetToNeighbors(getReferenceSourceVertex(), 10); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index 06c127a84..36c515073 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -155,20 +155,29 @@ public final class SeqGraph extends BaseGraph { //logger.info("simplifyGraph iteration " + i); // iterate until we haven't don't anything useful boolean didSomeWork = false; - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".1.dot"), 0); + printGraphSimplification(new File("simplifyGraph." + iteration + ".1.dot")); didSomeWork |= new MergeDiamonds().transformUntilComplete(); didSomeWork |= new MergeTails().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"), 0); + printGraphSimplification(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot")); didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"), 0); + printGraphSimplification(new File("simplifyGraph." + iteration + ".3.split_suffix.dot")); didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0); + printGraphSimplification(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot")); didSomeWork |= zipLinearChains(); return didSomeWork; } + /** + * Print simplication step of this graph, if PRINT_SIMPLIFY_GRAPHS is enabled + * @param file the destination for the graph DOT file + */ + private void printGraphSimplification(final File file) { + if ( PRINT_SIMPLIFY_GRAPHS ) + subsetToNeighbors(getReferenceSourceVertex(), 5).printGraph(file, 0); + } + /** * Zip up all of the simple linear chains present in this graph. * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java index 0babd8d56..5d725b1dd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java @@ -81,7 +81,7 @@ public class SharedSequenceMerger { else { // graph.printGraph(new File("csm." + counter + "." + v.getSequenceString() + "_pre.dot"), 0); - final List edgesToRemove = new LinkedList(); + final List edgesToRemove = new LinkedList<>(); final byte[] prevSeq = prevs.iterator().next().getSequence(); final SeqVertex newV = new SeqVertex(ArrayUtils.addAll(prevSeq, v.getSequence())); graph.addVertex(newV); @@ -124,11 +124,17 @@ public class SharedSequenceMerger { final SeqVertex first = incomingVertices.iterator().next(); for ( final SeqVertex prev : incomingVertices) { if ( ! prev.seqEquals(first) ) + // cannot merge if our sequence isn't the same as the first sequence return false; final Collection prevOuts = graph.outgoingVerticesOf(prev); if ( prevOuts.size() != 1 ) + // prev -> v must be the only edge from prev return false; if ( prevOuts.iterator().next() != v ) + // don't allow cyles + return false; + if ( graph.inDegreeOf(prev) == 0 ) + // cannot merge when any of the incoming nodes are sources return false; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index bd24891bc..123b36640 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -113,7 +113,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { // actually build the read threading graph rtgraph.buildGraphIfNecessary(); - if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.0.raw_readthreading_graph.dot"), pruneFactor); + printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot")); // go through and prune all of the chains where all edges have <= pruneFactor. This must occur // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering @@ -128,7 +128,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { // remove all heading and trailing paths if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); - if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"), pruneFactor); + printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot")); final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); @@ -136,7 +136,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph); if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); - if ( debugGraphTransformations ) initialSeqGraph.printGraph(new File("sequenceGraph.0.2.initial_seqgraph.dot"), pruneFactor); + printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot")); initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java index cfed2f0b8..e1398e119 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java @@ -166,4 +166,20 @@ public class CommonSuffixMergerUnitTest extends BaseTest { splitter.merge(data.graph, data.v); assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original); } + + @Test + public void testDoesntMergeSourceNodes() { + final SeqGraph g = new SeqGraph(); + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("A"); + final SeqVertex v3 = new SeqVertex("A"); + final SeqVertex top = new SeqVertex("T"); + final SeqVertex b = new SeqVertex("C"); + g.addVertices(top, v1, v2, v3, top, b); + g.addEdges(top, v1, b); + g.addEdges(v2, b); // v2 doesn't have previous node, cannot be merged + g.addEdges(top, v3, b); + final SharedSequenceMerger merger = new SharedSequenceMerger(); + Assert.assertFalse(merger.merge(g, b), "Shouldn't be able to merge shared vertices, when one is a source"); + } }