diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java index 371d5b7e3..e37fbb281 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java @@ -90,21 +90,23 @@ public class CommonSuffixSplitter { if ( v == null ) throw new IllegalArgumentException("v cannot be null"); if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex v " + v); - final Collection toMerge = graph.incomingVerticesOf(v); - if ( toMerge.size() < 2 ) + final Collection toSplit = graph.incomingVerticesOf(v); + if ( toSplit.size() < 2 ) // Can only split at least 2 vertices return false; - else if ( ! safeToSplit(graph, v, toMerge) ) { + else if ( ! safeToSplit(graph, v, toSplit) ) { return false; } else { - final SeqVertex suffixVTemplate = commonSuffix(toMerge); + final SeqVertex suffixVTemplate = commonSuffix(toSplit); if ( suffixVTemplate.isEmpty() ) { return false; + } else if ( allVerticesAreTheCommonSuffix(suffixVTemplate, toSplit) ) { + return false; } else { final List edgesToRemove = new LinkedList(); // graph.printGraph(new File("split.pre_" + v.getSequenceString() + "." + counter + ".dot"), 0); - for ( final SeqVertex mid : toMerge ) { + for ( final SeqVertex mid : toSplit ) { // create my own copy of the suffix final SeqVertex suffixV = new SeqVertex(suffixVTemplate.getSequence()); graph.addVertex(suffixV); @@ -130,7 +132,7 @@ public class CommonSuffixSplitter { } } - graph.removeAllVertices(toMerge); + graph.removeAllVertices(toSplit); graph.removeAllEdges(edgesToRemove); // graph.printGraph(new File("split.post_" + v.getSequenceString() + "." + counter++ + ".dot"), 0); @@ -141,6 +143,25 @@ public class CommonSuffixSplitter { // private static int counter = 0; + /** + * Would all vertices that we'd split just result in the common suffix? + * + * That is, suppose we have prefix nodes ABC and ABC. After splitting all of the vertices would + * just be ABC again, and we'd enter into an infinite loop. + * + * @param commonSuffix the common suffix of all vertices in toSplits + * @param toSplits the collection of vertices we want to split + * @return true if all of the vertices are equal to the common suffix + */ + private boolean allVerticesAreTheCommonSuffix(final SeqVertex commonSuffix, final Collection toSplits) { + for ( final SeqVertex toSplit : toSplits ) { + if ( toSplit.length() != commonSuffix.length() ) + return false; + } + + return true; + } + /** * Can we safely split up the vertices in toMerge? * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index 8c78d8515..bb4b26257 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -72,6 +72,12 @@ public final class SeqGraph extends BaseGraph { */ protected final static int MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES = 10; + /** + * How many cycles of the graph simplifications algorithms will we run before + * thinking something has gone wrong and throw an exception? + */ + private final static int MAX_REASONABLE_SIMPLIFICATION_CYCLES = 100; + /** * Construct an empty SeqGraph */ @@ -101,29 +107,56 @@ public final class SeqGraph extends BaseGraph { } protected void simplifyGraph(final int maxCycles) { - boolean didSomeWork; - int i = 0; - // start off with one round of zipping of chains for performance reasons zipLinearChains(); - do { - //logger.info("simplifyGraph iteration " + i); - // iterate until we haven't don't anything useful - didSomeWork = false; - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".1.dot"), 0); - didSomeWork |= new MergeDiamonds().transformUntilComplete(); - didSomeWork |= new MergeTails().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".2.diamonds_and_tails.dot"), 0); - didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".3.split_suffix.dot"), 0); - didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".4.merge_suffix.dot"), 0); + SeqGraph prevGraph = null; + for( int i = 0; i < maxCycles; i++ ) { + if ( i > MAX_REASONABLE_SIMPLIFICATION_CYCLES ) { + logger.warn("Infinite loop detected in simpliciation routines. Writing current graph to debugMeMark.dot"); + printGraph(new File("debugMeMark.dot"), 0); + throw new IllegalStateException("Infinite loop detected in simplification routines for kmer graph " + getKmerSize()); + } - didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); - didSomeWork |= zipLinearChains(); - i++; - } while (didSomeWork && i < maxCycles); + final boolean didSomeWork = simplifyGraphOnce(i); + if ( ! didSomeWork ) + // no simplification algorithm could run, so stop + break; + + // we get five cycles before we start looking for changes in the graph + // by cloning ourselves and then checking for any changes + if ( i > 5 ) { + // the previous graph and this graph have the same structure, so the simplification + // algorithms are looping endless between states. Just break and consider ourselves done + if ( prevGraph != null && graphEquals(prevGraph, this) ) + break; + + prevGraph = (SeqGraph)clone(); + } + } + } + + /** + * Run one full cycle of the graph simplification algorithms + * @return true if any algorithms said they did some simplification + */ + private boolean simplifyGraphOnce(final int iteration) { + //logger.info("simplifyGraph iteration " + i); + // iterate until we haven't don't anything useful + boolean didSomeWork = false; + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".1.dot"), 0); + didSomeWork |= new MergeDiamonds().transformUntilComplete(); + didSomeWork |= new MergeTails().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"), 0); + + didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"), 0); + didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0); + + didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); + didSomeWork |= zipLinearChains(); + return didSomeWork; } /** @@ -431,15 +464,13 @@ public final class SeqGraph extends BaseGraph { * * Performs the transformation: * - * { x + S_i + y -> Z } + * { x + S_i -> y -> Z } * * goes to: * - * { x -> S_i -> y -> Z } + * { x -> S_i -> y + Z } * * for all nodes that match this configuration. - * - * Differs from the diamond transform in that no top node is required */ protected class MergeCommonSuffices extends VertexBasedTransformer { @Override @@ -449,8 +480,6 @@ public final class SeqGraph extends BaseGraph { } /** - * Merge headless configurations: - * * Performs the transformation: * * { x + S_i + y -> Z } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index d6fb3b70a..6d85421c4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,8 +63,8 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa } @Test - public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6b673efb6f12b5deebb3e63fe94c48ed"); + public void testHaplotypeCallerMultiSampleComplex1() { + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "490ecf6619740c01c81a463392ef23cf"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 7b7f4d9cc..573cc83fd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("e8466846ca420bcbcd52b97f7a661aa3")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6fd839641ee038048626fbd1154f173")); executeTest("HCTestStructuralIndels: ", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java index 8006cb18d..1ed20e5f4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java @@ -102,8 +102,8 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { public void testSplitNoCycles() { final SeqGraph original = new SeqGraph(); final SeqVertex v1 = new SeqVertex("A"); - final SeqVertex v2 = new SeqVertex("C"); - final SeqVertex v3 = new SeqVertex("C"); + final SeqVertex v2 = new SeqVertex("AC"); + final SeqVertex v3 = new SeqVertex("TC"); final SeqVertex v4 = new SeqVertex("G"); original.addVertices(v1, v2, v3, v4); @@ -116,7 +116,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { Assert.assertFalse(new CommonSuffixSplitter().split(original, v4), "Cannot split graph with a cycle of the bottom list"); } - @Test(timeOut = 10000) + @Test(timeOut = 10000, enabled = !DEBUG) public void testSplitComplexCycle() { final SeqGraph original = new SeqGraph(); final SeqVertex r1 = new SeqVertex("ACTG"); @@ -130,7 +130,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { original.addEdges(r1, cat1, c1, cat2, c1); original.addEdges(r2, c2, cat2); - original.printGraph(new File("testSplitComplexCycle.dot"), 0); + //original.printGraph(new File("testSplitComplexCycle.dot"), 0); for ( final SeqVertex v : Arrays.asList(cat2) ) { // original.vertexSet() ) { final SeqGraph graph = (SeqGraph)original.clone(); @@ -139,4 +139,32 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { Assert.assertFalse(success, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v); } } + + @Test(timeOut = 10000) + public void testSplitInfiniteCycleFailure() { + final SeqGraph original = new SeqGraph(); + final SeqVertex v1 = new SeqVertex("GC"); + final SeqVertex v2 = new SeqVertex("X"); + final SeqVertex v3 = new SeqVertex("N"); + final SeqVertex v4 = new SeqVertex("C"); + + original.addVertices(v1, v2, v3, v4); + original.addEdge(v1, v2, new BaseEdge(false, 12)); + original.addEdge(v2, v3, new BaseEdge(false, 23)); + original.addEdge(v3, v4, new BaseEdge(false, 34)); + original.addEdge(v4, v2, new BaseEdge(false, 42)); + + original.printGraph(new File("testSplitInfiniteCycleFailure.dot"), 0); + + final SeqGraph graph = (SeqGraph)original.clone(); + final boolean success = new CommonSuffixSplitter().split(graph, v2); + Assert.assertTrue(success); + + for ( final SeqVertex v : graph.vertexSet() ) { + graph.printGraph(new File("testSplitInfiniteCycleFailure.first_split.dot"), 0); + final boolean success2 = new CommonSuffixSplitter().split((SeqGraph)graph.clone(), v); + if ( success2 ) graph.printGraph(new File("testSplitInfiniteCycleFailure.fail.dot"), 0); + Assert.assertFalse(success2, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v); + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index 42137e4e4..bd2e3cc2c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -521,4 +521,22 @@ public class SeqGraphUnitTest extends BaseTest { throw e; } } + + @Test(timeOut = 10000) + public void testInfiniteCycleFromEmpiricalRuns() { + final SeqVertex v1 = new SeqVertex("CCCT"); + final SeqVertex v2 = new SeqVertex("CATCCTCCCTTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCC"); + final SeqVertex v3 = new SeqVertex("CTAGACTTCTCCTCCTCCTCC"); + final SeqVertex v4 = new SeqVertex("ACCATC"); + final SeqVertex v5 = new SeqVertex("CCTCCACCATCCTCCCCTCTAGGCTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATC"); + final SeqVertex v6 = new SeqVertex("CTCCCCT"); + + final SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3, v4, v5, v6); + graph.addEdges(v1, v3, v4, v6, v3); + graph.addEdges(v2, v4); + graph.addEdges(v5, v6); + + graph.simplifyGraph(); + } }