Merge pull request #152 from broadinstitute/md_commonsuffix_infinite_loop_bugfix

Critical bugfix for CommonSuffixSplitter to avoid infinite loops
This commit is contained in:
Eric Banks 2013-04-09 17:43:11 -07:00
commit 2fb8b61dd0
6 changed files with 134 additions and 38 deletions

View File

@ -90,21 +90,23 @@ public class CommonSuffixSplitter {
if ( v == null ) throw new IllegalArgumentException("v cannot be null");
if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex v " + v);
final Collection<SeqVertex> toMerge = graph.incomingVerticesOf(v);
if ( toMerge.size() < 2 )
final Collection<SeqVertex> toSplit = graph.incomingVerticesOf(v);
if ( toSplit.size() < 2 )
// Can only split at least 2 vertices
return false;
else if ( ! safeToSplit(graph, v, toMerge) ) {
else if ( ! safeToSplit(graph, v, toSplit) ) {
return false;
} else {
final SeqVertex suffixVTemplate = commonSuffix(toMerge);
final SeqVertex suffixVTemplate = commonSuffix(toSplit);
if ( suffixVTemplate.isEmpty() ) {
return false;
} else if ( allVerticesAreTheCommonSuffix(suffixVTemplate, toSplit) ) {
return false;
} else {
final List<BaseEdge> edgesToRemove = new LinkedList<BaseEdge>();
// graph.printGraph(new File("split.pre_" + v.getSequenceString() + "." + counter + ".dot"), 0);
for ( final SeqVertex mid : toMerge ) {
for ( final SeqVertex mid : toSplit ) {
// create my own copy of the suffix
final SeqVertex suffixV = new SeqVertex(suffixVTemplate.getSequence());
graph.addVertex(suffixV);
@ -130,7 +132,7 @@ public class CommonSuffixSplitter {
}
}
graph.removeAllVertices(toMerge);
graph.removeAllVertices(toSplit);
graph.removeAllEdges(edgesToRemove);
// graph.printGraph(new File("split.post_" + v.getSequenceString() + "." + counter++ + ".dot"), 0);
@ -141,6 +143,25 @@ public class CommonSuffixSplitter {
// private static int counter = 0;
/**
* Would all vertices that we'd split just result in the common suffix?
*
* That is, suppose we have prefix nodes ABC and ABC. After splitting all of the vertices would
* just be ABC again, and we'd enter into an infinite loop.
*
* @param commonSuffix the common suffix of all vertices in toSplits
* @param toSplits the collection of vertices we want to split
* @return true if all of the vertices are equal to the common suffix
*/
private boolean allVerticesAreTheCommonSuffix(final SeqVertex commonSuffix, final Collection<SeqVertex> toSplits) {
for ( final SeqVertex toSplit : toSplits ) {
if ( toSplit.length() != commonSuffix.length() )
return false;
}
return true;
}
/**
* Can we safely split up the vertices in toMerge?
*

View File

@ -72,6 +72,12 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
*/
protected final static int MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES = 10;
/**
* How many cycles of the graph simplifications algorithms will we run before
* thinking something has gone wrong and throw an exception?
*/
private final static int MAX_REASONABLE_SIMPLIFICATION_CYCLES = 100;
/**
* Construct an empty SeqGraph
*/
@ -101,29 +107,56 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
}
protected void simplifyGraph(final int maxCycles) {
boolean didSomeWork;
int i = 0;
// start off with one round of zipping of chains for performance reasons
zipLinearChains();
do {
//logger.info("simplifyGraph iteration " + i);
// iterate until we haven't don't anything useful
didSomeWork = false;
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".1.dot"), 0);
didSomeWork |= new MergeDiamonds().transformUntilComplete();
didSomeWork |= new MergeTails().transformUntilComplete();
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".2.diamonds_and_tails.dot"), 0);
didSomeWork |= new SplitCommonSuffices().transformUntilComplete();
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".3.split_suffix.dot"), 0);
didSomeWork |= new MergeCommonSuffices().transformUntilComplete();
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".4.merge_suffix.dot"), 0);
SeqGraph prevGraph = null;
for( int i = 0; i < maxCycles; i++ ) {
if ( i > MAX_REASONABLE_SIMPLIFICATION_CYCLES ) {
logger.warn("Infinite loop detected in simpliciation routines. Writing current graph to debugMeMark.dot");
printGraph(new File("debugMeMark.dot"), 0);
throw new IllegalStateException("Infinite loop detected in simplification routines for kmer graph " + getKmerSize());
}
didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete();
didSomeWork |= zipLinearChains();
i++;
} while (didSomeWork && i < maxCycles);
final boolean didSomeWork = simplifyGraphOnce(i);
if ( ! didSomeWork )
// no simplification algorithm could run, so stop
break;
// we get five cycles before we start looking for changes in the graph
// by cloning ourselves and then checking for any changes
if ( i > 5 ) {
// the previous graph and this graph have the same structure, so the simplification
// algorithms are looping endless between states. Just break and consider ourselves done
if ( prevGraph != null && graphEquals(prevGraph, this) )
break;
prevGraph = (SeqGraph)clone();
}
}
}
/**
* Run one full cycle of the graph simplification algorithms
* @return true if any algorithms said they did some simplification
*/
private boolean simplifyGraphOnce(final int iteration) {
//logger.info("simplifyGraph iteration " + i);
// iterate until we haven't don't anything useful
boolean didSomeWork = false;
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".1.dot"), 0);
didSomeWork |= new MergeDiamonds().transformUntilComplete();
didSomeWork |= new MergeTails().transformUntilComplete();
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"), 0);
didSomeWork |= new SplitCommonSuffices().transformUntilComplete();
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"), 0);
didSomeWork |= new MergeCommonSuffices().transformUntilComplete();
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0);
didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete();
didSomeWork |= zipLinearChains();
return didSomeWork;
}
/**
@ -431,15 +464,13 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
*
* Performs the transformation:
*
* { x + S_i + y -> Z }
* { x + S_i -> y -> Z }
*
* goes to:
*
* { x -> S_i -> y -> Z }
* { x -> S_i -> y + Z }
*
* for all nodes that match this configuration.
*
* Differs from the diamond transform in that no top node is required
*/
protected class MergeCommonSuffices extends VertexBasedTransformer {
@Override
@ -449,8 +480,6 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
}
/**
* Merge headless configurations:
*
* Performs the transformation:
*
* { x + S_i + y -> Z }

View File

@ -63,8 +63,8 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
}
@Test
public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6b673efb6f12b5deebb3e63fe94c48ed");
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "490ecf6619740c01c81a463392ef23cf");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {

View File

@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("e8466846ca420bcbcd52b97f7a661aa3"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6fd839641ee038048626fbd1154f173"));
executeTest("HCTestStructuralIndels: ", spec);
}

View File

@ -102,8 +102,8 @@ public class CommonSuffixSplitterUnitTest extends BaseTest {
public void testSplitNoCycles() {
final SeqGraph original = new SeqGraph();
final SeqVertex v1 = new SeqVertex("A");
final SeqVertex v2 = new SeqVertex("C");
final SeqVertex v3 = new SeqVertex("C");
final SeqVertex v2 = new SeqVertex("AC");
final SeqVertex v3 = new SeqVertex("TC");
final SeqVertex v4 = new SeqVertex("G");
original.addVertices(v1, v2, v3, v4);
@ -116,7 +116,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest {
Assert.assertFalse(new CommonSuffixSplitter().split(original, v4), "Cannot split graph with a cycle of the bottom list");
}
@Test(timeOut = 10000)
@Test(timeOut = 10000, enabled = !DEBUG)
public void testSplitComplexCycle() {
final SeqGraph original = new SeqGraph();
final SeqVertex r1 = new SeqVertex("ACTG");
@ -130,7 +130,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest {
original.addEdges(r1, cat1, c1, cat2, c1);
original.addEdges(r2, c2, cat2);
original.printGraph(new File("testSplitComplexCycle.dot"), 0);
//original.printGraph(new File("testSplitComplexCycle.dot"), 0);
for ( final SeqVertex v : Arrays.asList(cat2) ) { // original.vertexSet() ) {
final SeqGraph graph = (SeqGraph)original.clone();
@ -139,4 +139,32 @@ public class CommonSuffixSplitterUnitTest extends BaseTest {
Assert.assertFalse(success, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v);
}
}
@Test(timeOut = 10000)
public void testSplitInfiniteCycleFailure() {
final SeqGraph original = new SeqGraph();
final SeqVertex v1 = new SeqVertex("GC");
final SeqVertex v2 = new SeqVertex("X");
final SeqVertex v3 = new SeqVertex("N");
final SeqVertex v4 = new SeqVertex("C");
original.addVertices(v1, v2, v3, v4);
original.addEdge(v1, v2, new BaseEdge(false, 12));
original.addEdge(v2, v3, new BaseEdge(false, 23));
original.addEdge(v3, v4, new BaseEdge(false, 34));
original.addEdge(v4, v2, new BaseEdge(false, 42));
original.printGraph(new File("testSplitInfiniteCycleFailure.dot"), 0);
final SeqGraph graph = (SeqGraph)original.clone();
final boolean success = new CommonSuffixSplitter().split(graph, v2);
Assert.assertTrue(success);
for ( final SeqVertex v : graph.vertexSet() ) {
graph.printGraph(new File("testSplitInfiniteCycleFailure.first_split.dot"), 0);
final boolean success2 = new CommonSuffixSplitter().split((SeqGraph)graph.clone(), v);
if ( success2 ) graph.printGraph(new File("testSplitInfiniteCycleFailure.fail.dot"), 0);
Assert.assertFalse(success2, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v);
}
}
}

View File

@ -521,4 +521,22 @@ public class SeqGraphUnitTest extends BaseTest {
throw e;
}
}
@Test(timeOut = 10000)
public void testInfiniteCycleFromEmpiricalRuns() {
final SeqVertex v1 = new SeqVertex("CCCT");
final SeqVertex v2 = new SeqVertex("CATCCTCCCTTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCC");
final SeqVertex v3 = new SeqVertex("CTAGACTTCTCCTCCTCCTCC");
final SeqVertex v4 = new SeqVertex("ACCATC");
final SeqVertex v5 = new SeqVertex("CCTCCACCATCCTCCCCTCTAGGCTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATC");
final SeqVertex v6 = new SeqVertex("CTCCCCT");
final SeqGraph graph = new SeqGraph();
graph.addVertices(v1, v2, v3, v4, v5, v6);
graph.addEdges(v1, v3, v4, v6, v3);
graph.addEdges(v2, v4);
graph.addEdges(v5, v6);
graph.simplifyGraph();
}
}