diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index e0a755c7b..73367f8c3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -50,6 +50,7 @@ import com.google.java.contract.Ensures; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileWriter; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; @@ -387,6 +388,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) protected boolean dontUseSoftClippedBases = false; + @Hidden + @Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false) + protected boolean captureAssemblyFailureBAM = false; + @Hidden @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) protected boolean allowCyclesInKmerGraphToGeneratePaths = false; @@ -751,13 +756,24 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc); - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); - - if ( ! dontTrimActiveRegions ) { - return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc); - } else { - // we don't want to trim active regions, so go ahead and use the old one - return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true); + try { + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); + if ( ! dontTrimActiveRegions ) { + return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc); + } else { + // we don't want to trim active regions, so go ahead and use the old one + return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true); + } + } catch ( Exception e ) { + // Capture any exception that might be thrown, and write out the assembly failure BAM if requested + if ( captureAssemblyFailureBAM ) { + final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5); + for ( final GATKSAMRecord read : activeRegion.getReads() ) { + writer.addAlignment(read); + } + writer.close(); + } + throw e; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index 3a377409c..1a5f34bc3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -78,6 +78,10 @@ import java.util.*; public abstract class LocalAssemblyEngine { private final static Logger logger = Logger.getLogger(LocalAssemblyEngine.class); + /** + * If false, we will only write out a region around the reference source + */ + private final static boolean PRINT_FULL_GRAPH_FOR_DEBUGGING = true; public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8; private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30; @@ -252,20 +256,26 @@ public abstract class LocalAssemblyEngine { return false; } - protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) { - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); + /** + * Print graph to file if debugGraphTransformations is enabled + * @param graph the graph to print + * @param file the destination file + */ + protected void printDebugGraphTransform(final BaseGraph graph, final File file) { + if ( debugGraphTransformations ) { + if ( PRINT_FULL_GRAPH_FOR_DEBUGGING ) + graph.printGraph(file, pruneFactor); + else + graph.subsetToRefSource().printGraph(file, pruneFactor); + } + } + + protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) { + printDebugGraphTransform(seqGraph, new File("sequenceGraph.1.dot")); - // TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm - // TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect - // TODO -- to anything from one that's actually has good support along the chain but just happens - // TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately - // TODO -- the pruning algorithm really should be an error correction algorithm that knows more - // TODO -- about the structure of the data and can differentiate between an infrequent path but - // TODO -- without evidence against it (such as occurs when a region is hard to get any reads through) - // TODO -- from a error with lots of weight going along another similar path // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive seqGraph.zipLinearChains(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); + printDebugGraphTransform(seqGraph, new File("sequenceGraph.2.zipped.dot")); // now go through and prune the graph, removing vertices no longer connected to the reference chain // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight @@ -273,9 +283,9 @@ public abstract class LocalAssemblyEngine { seqGraph.pruneGraph(pruneFactor); seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor); + printDebugGraphTransform(seqGraph, new File("sequenceGraph.3.pruned.dot")); seqGraph.simplifyGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor); + printDebugGraphTransform(seqGraph, new File("sequenceGraph.4.merged.dot")); // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can // happen in cases where for example the reference somehow manages to acquire a cycle, or @@ -294,7 +304,7 @@ public abstract class LocalAssemblyEngine { seqGraph.addVertex(dummy); seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); } - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor); + printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot")); return seqGraph; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index 8938af7c2..c963fb6e5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -388,6 +388,17 @@ public class BaseGraph extends Default return s; } + /** + * Get the set of vertices connected to v by incoming or outgoing edges + * @param v a non-null vertex + * @return a set of vertices {X} connected X -> v or v -> Y + */ + public Set neighboringVerticesOf(final V v) { + final Set s = incomingVerticesOf(v); + s.addAll(outgoingVerticesOf(v)); + return s; + } + /** * Print out the graph in the dot language for visualization * @param destination File to write to @@ -664,4 +675,54 @@ public class BaseGraph extends Default "kmerSize=" + kmerSize + '}'; } + + /** + * Get the set of vertices within distance edges of source, regardless of edge direction + * + * @param source the source vertex to consider + * @param distance the distance + * @return a set of vertices within distance of source + */ + protected Set verticesWithinDistance(final V source, final int distance) { + if ( distance == 0 ) + return Collections.singleton(source); + + final Set found = new HashSet<>(); + found.add(source); + for ( final V v : neighboringVerticesOf(source) ) { + found.addAll(verticesWithinDistance(v, distance - 1)); + } + + return found; + } + + /** + * Get a graph containing only the vertices within distance edges of target + * @param target a vertex in graph + * @param distance the max distance + * @return a non-null graph + */ + public BaseGraph subsetToNeighbors(final V target, final int distance) { + if ( target == null ) throw new IllegalArgumentException("Target cannot be null"); + if ( ! containsVertex(target) ) throw new IllegalArgumentException("Graph doesn't contain vertex " + target); + if ( distance < 0 ) throw new IllegalArgumentException("Distance must be >= 0 but got " + distance); + + + final Set toKeep = verticesWithinDistance(target, distance); + final Set toRemove = new HashSet<>(vertexSet()); + toRemove.removeAll(toKeep); + + final BaseGraph result = (BaseGraph)clone(); + result.removeAllVertices(toRemove); + + return result; + } + + /** + * Get a subgraph of graph that contains only vertices within 10 edges of the ref source vertex + * @return a non-null subgraph of this graph + */ + public BaseGraph subsetToRefSource() { + return subsetToNeighbors(getReferenceSourceVertex(), 10); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index 06c127a84..36c515073 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -155,20 +155,29 @@ public final class SeqGraph extends BaseGraph { //logger.info("simplifyGraph iteration " + i); // iterate until we haven't don't anything useful boolean didSomeWork = false; - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".1.dot"), 0); + printGraphSimplification(new File("simplifyGraph." + iteration + ".1.dot")); didSomeWork |= new MergeDiamonds().transformUntilComplete(); didSomeWork |= new MergeTails().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"), 0); + printGraphSimplification(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot")); didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"), 0); + printGraphSimplification(new File("simplifyGraph." + iteration + ".3.split_suffix.dot")); didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0); + printGraphSimplification(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot")); didSomeWork |= zipLinearChains(); return didSomeWork; } + /** + * Print simplication step of this graph, if PRINT_SIMPLIFY_GRAPHS is enabled + * @param file the destination for the graph DOT file + */ + private void printGraphSimplification(final File file) { + if ( PRINT_SIMPLIFY_GRAPHS ) + subsetToNeighbors(getReferenceSourceVertex(), 5).printGraph(file, 0); + } + /** * Zip up all of the simple linear chains present in this graph. * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java index 0babd8d56..5d725b1dd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java @@ -81,7 +81,7 @@ public class SharedSequenceMerger { else { // graph.printGraph(new File("csm." + counter + "." + v.getSequenceString() + "_pre.dot"), 0); - final List edgesToRemove = new LinkedList(); + final List edgesToRemove = new LinkedList<>(); final byte[] prevSeq = prevs.iterator().next().getSequence(); final SeqVertex newV = new SeqVertex(ArrayUtils.addAll(prevSeq, v.getSequence())); graph.addVertex(newV); @@ -124,11 +124,17 @@ public class SharedSequenceMerger { final SeqVertex first = incomingVertices.iterator().next(); for ( final SeqVertex prev : incomingVertices) { if ( ! prev.seqEquals(first) ) + // cannot merge if our sequence isn't the same as the first sequence return false; final Collection prevOuts = graph.outgoingVerticesOf(prev); if ( prevOuts.size() != 1 ) + // prev -> v must be the only edge from prev return false; if ( prevOuts.iterator().next() != v ) + // don't allow cyles + return false; + if ( graph.inDegreeOf(prev) == 0 ) + // cannot merge when any of the incoming nodes are sources return false; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index bd24891bc..123b36640 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -113,7 +113,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { // actually build the read threading graph rtgraph.buildGraphIfNecessary(); - if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.0.raw_readthreading_graph.dot"), pruneFactor); + printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot")); // go through and prune all of the chains where all edges have <= pruneFactor. This must occur // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering @@ -128,7 +128,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { // remove all heading and trailing paths if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); - if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"), pruneFactor); + printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot")); final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); @@ -136,7 +136,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph); if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); - if ( debugGraphTransformations ) initialSeqGraph.printGraph(new File("sequenceGraph.0.2.initial_seqgraph.dot"), pruneFactor); + printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot")); initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java index cfed2f0b8..e1398e119 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java @@ -166,4 +166,20 @@ public class CommonSuffixMergerUnitTest extends BaseTest { splitter.merge(data.graph, data.v); assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original); } + + @Test + public void testDoesntMergeSourceNodes() { + final SeqGraph g = new SeqGraph(); + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("A"); + final SeqVertex v3 = new SeqVertex("A"); + final SeqVertex top = new SeqVertex("T"); + final SeqVertex b = new SeqVertex("C"); + g.addVertices(top, v1, v2, v3, top, b); + g.addEdges(top, v1, b); + g.addEdges(v2, b); // v2 doesn't have previous node, cannot be merged + g.addEdges(top, v3, b); + final SharedSequenceMerger merger = new SharedSequenceMerger(); + Assert.assertFalse(merger.merge(g, b), "Shouldn't be able to merge shared vertices, when one is a source"); + } }