From b115e5c582bb2ff9a32c1195071c09ca1ef401e9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 8 Apr 2013 14:00:51 -0400 Subject: [PATCH] Critical bugfix for CommonSuffixSplitter to avoid infinite loops -- The previous version would enter into an infinite loop in the case where we have a graph that looks like: X -> A -> B Y -> A -> B So that the incoming vertices of B all have the same sequence. This would cause us to remodel the graph endless by extracting the common sequence A and rebuilding exactly the same graph. Fixed and unit tested -- Additionally add a max to the number of simplification cycles that are run (100), which will throw an error and write out the graph for future debugging. So the GATK will always error out, rather than just go on forever -- After 5 rounds of simplification we start keeping a copy of the previous graph, and then check if the current graph is actually different from the previous graph. Equals here means that all vertices have equivalents in both graphs, as do all edges. If the two graphs are equal we stop simplifying. It can be a bit expensive but it only happens when we end up cycling due to the structure of the graph. -- Added a unittest that goes into an infinite loop (found empirically in running the CEU trio) and confirmed that the new approach aborts out correctly -- #resolves GSA-924 -- See https://jira.broadinstitute.org/browse/GSA-924 for more details -- Update MD5s due to change in assembly graph construction --- .../graphs/CommonSuffixSplitter.java | 33 ++++++-- .../haplotypecaller/graphs/SeqGraph.java | 79 +++++++++++++------ ...lexAndSymbolicVariantsIntegrationTest.java | 4 +- .../HaplotypeCallerIntegrationTest.java | 2 +- .../graphs/CommonSuffixSplitterUnitTest.java | 36 ++++++++- .../graphs/SeqGraphUnitTest.java | 18 +++++ 6 files changed, 134 insertions(+), 38 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java index 371d5b7e3..e37fbb281 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java @@ -90,21 +90,23 @@ public class CommonSuffixSplitter { if ( v == null ) throw new IllegalArgumentException("v cannot be null"); if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex v " + v); - final Collection toMerge = graph.incomingVerticesOf(v); - if ( toMerge.size() < 2 ) + final Collection toSplit = graph.incomingVerticesOf(v); + if ( toSplit.size() < 2 ) // Can only split at least 2 vertices return false; - else if ( ! safeToSplit(graph, v, toMerge) ) { + else if ( ! safeToSplit(graph, v, toSplit) ) { return false; } else { - final SeqVertex suffixVTemplate = commonSuffix(toMerge); + final SeqVertex suffixVTemplate = commonSuffix(toSplit); if ( suffixVTemplate.isEmpty() ) { return false; + } else if ( allVerticesAreTheCommonSuffix(suffixVTemplate, toSplit) ) { + return false; } else { final List edgesToRemove = new LinkedList(); // graph.printGraph(new File("split.pre_" + v.getSequenceString() + "." + counter + ".dot"), 0); - for ( final SeqVertex mid : toMerge ) { + for ( final SeqVertex mid : toSplit ) { // create my own copy of the suffix final SeqVertex suffixV = new SeqVertex(suffixVTemplate.getSequence()); graph.addVertex(suffixV); @@ -130,7 +132,7 @@ public class CommonSuffixSplitter { } } - graph.removeAllVertices(toMerge); + graph.removeAllVertices(toSplit); graph.removeAllEdges(edgesToRemove); // graph.printGraph(new File("split.post_" + v.getSequenceString() + "." + counter++ + ".dot"), 0); @@ -141,6 +143,25 @@ public class CommonSuffixSplitter { // private static int counter = 0; + /** + * Would all vertices that we'd split just result in the common suffix? + * + * That is, suppose we have prefix nodes ABC and ABC. After splitting all of the vertices would + * just be ABC again, and we'd enter into an infinite loop. + * + * @param commonSuffix the common suffix of all vertices in toSplits + * @param toSplits the collection of vertices we want to split + * @return true if all of the vertices are equal to the common suffix + */ + private boolean allVerticesAreTheCommonSuffix(final SeqVertex commonSuffix, final Collection toSplits) { + for ( final SeqVertex toSplit : toSplits ) { + if ( toSplit.length() != commonSuffix.length() ) + return false; + } + + return true; + } + /** * Can we safely split up the vertices in toMerge? * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index 8c78d8515..bb4b26257 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -72,6 +72,12 @@ public final class SeqGraph extends BaseGraph { */ protected final static int MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES = 10; + /** + * How many cycles of the graph simplifications algorithms will we run before + * thinking something has gone wrong and throw an exception? + */ + private final static int MAX_REASONABLE_SIMPLIFICATION_CYCLES = 100; + /** * Construct an empty SeqGraph */ @@ -101,29 +107,56 @@ public final class SeqGraph extends BaseGraph { } protected void simplifyGraph(final int maxCycles) { - boolean didSomeWork; - int i = 0; - // start off with one round of zipping of chains for performance reasons zipLinearChains(); - do { - //logger.info("simplifyGraph iteration " + i); - // iterate until we haven't don't anything useful - didSomeWork = false; - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".1.dot"), 0); - didSomeWork |= new MergeDiamonds().transformUntilComplete(); - didSomeWork |= new MergeTails().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".2.diamonds_and_tails.dot"), 0); - didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".3.split_suffix.dot"), 0); - didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".4.merge_suffix.dot"), 0); + SeqGraph prevGraph = null; + for( int i = 0; i < maxCycles; i++ ) { + if ( i > MAX_REASONABLE_SIMPLIFICATION_CYCLES ) { + logger.warn("Infinite loop detected in simpliciation routines. Writing current graph to debugMeMark.dot"); + printGraph(new File("debugMeMark.dot"), 0); + throw new IllegalStateException("Infinite loop detected in simplification routines for kmer graph " + getKmerSize()); + } - didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); - didSomeWork |= zipLinearChains(); - i++; - } while (didSomeWork && i < maxCycles); + final boolean didSomeWork = simplifyGraphOnce(i); + if ( ! didSomeWork ) + // no simplification algorithm could run, so stop + break; + + // we get five cycles before we start looking for changes in the graph + // by cloning ourselves and then checking for any changes + if ( i > 5 ) { + // the previous graph and this graph have the same structure, so the simplification + // algorithms are looping endless between states. Just break and consider ourselves done + if ( prevGraph != null && graphEquals(prevGraph, this) ) + break; + + prevGraph = (SeqGraph)clone(); + } + } + } + + /** + * Run one full cycle of the graph simplification algorithms + * @return true if any algorithms said they did some simplification + */ + private boolean simplifyGraphOnce(final int iteration) { + //logger.info("simplifyGraph iteration " + i); + // iterate until we haven't don't anything useful + boolean didSomeWork = false; + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".1.dot"), 0); + didSomeWork |= new MergeDiamonds().transformUntilComplete(); + didSomeWork |= new MergeTails().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"), 0); + + didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"), 0); + didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0); + + didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); + didSomeWork |= zipLinearChains(); + return didSomeWork; } /** @@ -431,15 +464,13 @@ public final class SeqGraph extends BaseGraph { * * Performs the transformation: * - * { x + S_i + y -> Z } + * { x + S_i -> y -> Z } * * goes to: * - * { x -> S_i -> y -> Z } + * { x -> S_i -> y + Z } * * for all nodes that match this configuration. - * - * Differs from the diamond transform in that no top node is required */ protected class MergeCommonSuffices extends VertexBasedTransformer { @Override @@ -449,8 +480,6 @@ public final class SeqGraph extends BaseGraph { } /** - * Merge headless configurations: - * * Performs the transformation: * * { x + S_i + y -> Z } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index d6fb3b70a..6d85421c4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,8 +63,8 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa } @Test - public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6b673efb6f12b5deebb3e63fe94c48ed"); + public void testHaplotypeCallerMultiSampleComplex1() { + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "490ecf6619740c01c81a463392ef23cf"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 7b7f4d9cc..573cc83fd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("e8466846ca420bcbcd52b97f7a661aa3")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6fd839641ee038048626fbd1154f173")); executeTest("HCTestStructuralIndels: ", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java index 8006cb18d..1ed20e5f4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java @@ -102,8 +102,8 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { public void testSplitNoCycles() { final SeqGraph original = new SeqGraph(); final SeqVertex v1 = new SeqVertex("A"); - final SeqVertex v2 = new SeqVertex("C"); - final SeqVertex v3 = new SeqVertex("C"); + final SeqVertex v2 = new SeqVertex("AC"); + final SeqVertex v3 = new SeqVertex("TC"); final SeqVertex v4 = new SeqVertex("G"); original.addVertices(v1, v2, v3, v4); @@ -116,7 +116,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { Assert.assertFalse(new CommonSuffixSplitter().split(original, v4), "Cannot split graph with a cycle of the bottom list"); } - @Test(timeOut = 10000) + @Test(timeOut = 10000, enabled = !DEBUG) public void testSplitComplexCycle() { final SeqGraph original = new SeqGraph(); final SeqVertex r1 = new SeqVertex("ACTG"); @@ -130,7 +130,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { original.addEdges(r1, cat1, c1, cat2, c1); original.addEdges(r2, c2, cat2); - original.printGraph(new File("testSplitComplexCycle.dot"), 0); + //original.printGraph(new File("testSplitComplexCycle.dot"), 0); for ( final SeqVertex v : Arrays.asList(cat2) ) { // original.vertexSet() ) { final SeqGraph graph = (SeqGraph)original.clone(); @@ -139,4 +139,32 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { Assert.assertFalse(success, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v); } } + + @Test(timeOut = 10000) + public void testSplitInfiniteCycleFailure() { + final SeqGraph original = new SeqGraph(); + final SeqVertex v1 = new SeqVertex("GC"); + final SeqVertex v2 = new SeqVertex("X"); + final SeqVertex v3 = new SeqVertex("N"); + final SeqVertex v4 = new SeqVertex("C"); + + original.addVertices(v1, v2, v3, v4); + original.addEdge(v1, v2, new BaseEdge(false, 12)); + original.addEdge(v2, v3, new BaseEdge(false, 23)); + original.addEdge(v3, v4, new BaseEdge(false, 34)); + original.addEdge(v4, v2, new BaseEdge(false, 42)); + + original.printGraph(new File("testSplitInfiniteCycleFailure.dot"), 0); + + final SeqGraph graph = (SeqGraph)original.clone(); + final boolean success = new CommonSuffixSplitter().split(graph, v2); + Assert.assertTrue(success); + + for ( final SeqVertex v : graph.vertexSet() ) { + graph.printGraph(new File("testSplitInfiniteCycleFailure.first_split.dot"), 0); + final boolean success2 = new CommonSuffixSplitter().split((SeqGraph)graph.clone(), v); + if ( success2 ) graph.printGraph(new File("testSplitInfiniteCycleFailure.fail.dot"), 0); + Assert.assertFalse(success2, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v); + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index 42137e4e4..bd2e3cc2c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -521,4 +521,22 @@ public class SeqGraphUnitTest extends BaseTest { throw e; } } + + @Test(timeOut = 10000) + public void testInfiniteCycleFromEmpiricalRuns() { + final SeqVertex v1 = new SeqVertex("CCCT"); + final SeqVertex v2 = new SeqVertex("CATCCTCCCTTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCC"); + final SeqVertex v3 = new SeqVertex("CTAGACTTCTCCTCCTCCTCC"); + final SeqVertex v4 = new SeqVertex("ACCATC"); + final SeqVertex v5 = new SeqVertex("CCTCCACCATCCTCCCCTCTAGGCTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATC"); + final SeqVertex v6 = new SeqVertex("CTCCCCT"); + + final SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3, v4, v5, v6); + graph.addEdges(v1, v3, v4, v6, v3); + graph.addEdges(v2, v4); + graph.addEdges(v5, v6); + + graph.simplifyGraph(); + } }