Bugfix for head and tail merging code in SeqGraph
-- The previous version of the head merging (and tail merging to a lesser degree) would inappropriately merge source and sinks without sufficient evidence to do so. This would introduce large deletion events at the start / end of the assemblies. Refcatored code to require 20 bp of overlap in the head or tail nodes, as well as unit tested functions to support this.
This commit is contained in:
parent
2aac9e2782
commit
e916998784
|
|
@ -63,7 +63,14 @@ import java.util.Set;
|
|||
*/
|
||||
public final class SeqGraph extends BaseGraph<SeqVertex> {
|
||||
private final static boolean PRINT_SIMPLIFY_GRAPHS = false;
|
||||
private final static int MIN_SUFFIX_TO_MERGE_TAILS = 5;
|
||||
|
||||
/**
|
||||
* The minimum number of common bp from the prefix (head merging) or suffix (tail merging)
|
||||
* required before we'll merge in such configurations. A large value here is critical to avoid
|
||||
* merging inappropriate head or tail nodes, which introduces large insertion / deletion events
|
||||
* as the merge operation creates a link among the non-linked sink / source vertices
|
||||
*/
|
||||
private final static int MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES = 10;
|
||||
|
||||
/**
|
||||
* Construct an empty SeqGraph
|
||||
|
|
@ -103,15 +110,15 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
//logger.info("simplifyGraph iteration " + i);
|
||||
// iterate until we haven't don't anything useful
|
||||
didSomeWork = false;
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".dot"), 0);
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".1.dot"), 0);
|
||||
didSomeWork |= new MergeDiamonds().transformUntilComplete();
|
||||
didSomeWork |= new MergeTails().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".diamonds_and_tails.dot"), 0);
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".2.diamonds_and_tails.dot"), 0);
|
||||
|
||||
didSomeWork |= new SplitCommonSuffices().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".split_suffix.dot"), 0);
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".3.split_suffix.dot"), 0);
|
||||
didSomeWork |= new MergeCommonSuffices().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".merge_suffix.dot"), 0);
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".4.merge_suffix.dot"), 0);
|
||||
|
||||
didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete();
|
||||
didSomeWork |= zipLinearChains();
|
||||
|
|
@ -375,7 +382,10 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
|
||||
// actually do the merging, returning true if at least 1 base was successfully split
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, middles);
|
||||
return splitter.splitAndUpdate(top, bottom, 1);
|
||||
if (splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(1))
|
||||
return splitter.splitAndUpdate(top, bottom);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -408,7 +418,11 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
if ( dontModifyGraphEvenIfPossible() ) return true;
|
||||
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, tails);
|
||||
return splitter.splitAndUpdate(top, null, MIN_SUFFIX_TO_MERGE_TAILS);
|
||||
|
||||
if (splitter.meetsMinMergableSequenceForSuffix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES))
|
||||
return splitter.splitAndUpdate(top, null);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -492,7 +506,10 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
if ( dontModifyGraphEvenIfPossible() ) return true;
|
||||
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, incoming);
|
||||
return splitter.splitAndUpdate(null, bottom, 1);
|
||||
if (splitter.meetsMinMergableSequenceForPrefix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES))
|
||||
return splitter.splitAndUpdate(null, bottom);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -133,6 +133,14 @@ public class SharedVertexSequenceSplitter {
|
|||
suffixV = prefixAndSuffix.getSecond();
|
||||
}
|
||||
|
||||
/**
|
||||
* Given sequencing that are all equal, does this splitter make those into prefix or suffix nodes?
|
||||
* @return true if we merge equal nodes into prefix nodes or suffix nodes
|
||||
*/
|
||||
protected static boolean prefersPrefixMerging() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple single-function interface to split and then update a graph
|
||||
*
|
||||
|
|
@ -140,20 +148,41 @@ public class SharedVertexSequenceSplitter {
|
|||
*
|
||||
* @param top the top vertex, may be null
|
||||
* @param bottom the bottom vertex, may be null
|
||||
* @param minCommonSequence the minimum prefix or suffix size necessary among the vertices to split up
|
||||
* before we'll go ahead and actually do the splitting. Allows one to determine
|
||||
* whether there's actually any useful splitting to do, as well as protect
|
||||
* yourself against spurious splitting of nodes based on trivial amounts of overall
|
||||
* @return true if some useful splitting was done, false otherwise
|
||||
*/
|
||||
public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom, final int minCommonSequence) {
|
||||
if ( prefixV.length() < minCommonSequence && suffixV.length() < minCommonSequence )
|
||||
return false;
|
||||
public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom) {
|
||||
split();
|
||||
updateGraph(top, bottom);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does either the common suffix or prefix have at least minCommonSequence bases in it?
|
||||
* @param minCommonSequence a minimum length of the common sequence, must be >= 0
|
||||
* @return true if either suffix or prefix length >= minCommonSequence
|
||||
*/
|
||||
public boolean meetsMinMergableSequenceForEitherPrefixOrSuffix(final int minCommonSequence) {
|
||||
return meetsMinMergableSequenceForPrefix(minCommonSequence) || meetsMinMergableSequenceForSuffix(minCommonSequence);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the common prefix have at least minCommonSequence bases in it?
|
||||
* @param minCommonSequence a minimum length of the common sequence, must be >= 0
|
||||
* @return true if prefix length >= minCommonSequence
|
||||
*/
|
||||
public boolean meetsMinMergableSequenceForPrefix(final int minCommonSequence) {
|
||||
return prefixV.length() >= minCommonSequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the common suffix have at least minCommonSequence bases in it?
|
||||
* @param minCommonSequence a minimum length of the common sequence, must be >= 0
|
||||
* @return true if suffix length >= minCommonSequence
|
||||
*/
|
||||
public boolean meetsMinMergableSequenceForSuffix(final int minCommonSequence) {
|
||||
return suffixV.length() >= minCommonSequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Actually do the splitting up of the vertices
|
||||
*
|
||||
|
|
|
|||
|
|
@ -250,4 +250,45 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "MeetsMinSequenceData")
|
||||
public Object[][] makeMeetsMinSequenceData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final boolean prefixBiased = SharedVertexSequenceSplitter.prefersPrefixMerging();
|
||||
tests.add(new Object[]{Arrays.asList("AC", "AC"), 0, true, true});
|
||||
tests.add(new Object[]{Arrays.asList("AC", "AC"), 1, prefixBiased, ! prefixBiased});
|
||||
tests.add(new Object[]{Arrays.asList("AC", "AC"), 2, prefixBiased, ! prefixBiased});
|
||||
tests.add(new Object[]{Arrays.asList("AC", "AC"), 3, false, false});
|
||||
tests.add(new Object[]{Arrays.asList("A", "AC"), 1, true, false});
|
||||
tests.add(new Object[]{Arrays.asList("A", "AC"), 2, false, false});
|
||||
tests.add(new Object[]{Arrays.asList("AT", "AC"), 1, true, false});
|
||||
tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 1, true, false});
|
||||
tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 2, true, false});
|
||||
tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 3, false, false});
|
||||
tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 1, true, true});
|
||||
tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 2, true, true});
|
||||
tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 3, false, true});
|
||||
tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 4, false, false});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "MeetsMinSequenceData")
|
||||
public void testSplitterCompleteCycle(final List<String> mids, final int minSeqLength, final boolean prefixMeets, final boolean suffixMeets) {
|
||||
final SeqGraph graph = new SeqGraph();
|
||||
|
||||
final SeqVertex top = new SeqVertex("AAAAAAAA");
|
||||
final SeqVertex bot = new SeqVertex("GGGGGGGG");
|
||||
final List<SeqVertex> v = new ArrayList<SeqVertex>();
|
||||
for ( final String s : mids ) { v.add(new SeqVertex(s)); }
|
||||
graph.addVertices(v.toArray(new SeqVertex[]{}));
|
||||
graph.addVertices(top, bot);
|
||||
for ( final SeqVertex vi : v ) { graph.addEdge(top, vi); graph.addEdge(vi, bot); }
|
||||
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v);
|
||||
Assert.assertEquals(splitter.meetsMinMergableSequenceForPrefix(minSeqLength), prefixMeets, "Prefix failed");
|
||||
Assert.assertEquals(splitter.meetsMinMergableSequenceForSuffix(minSeqLength), suffixMeets, "Suffix failed");
|
||||
Assert.assertEquals(splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(minSeqLength), suffixMeets || prefixMeets, "Either prefix or suffix failed");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue