From e9169987843d69fbca6b986f0d342fdb654f43a1 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 3 Apr 2013 10:39:45 -0400 Subject: [PATCH] Bugfix for head and tail merging code in SeqGraph -- The previous version of the head merging (and tail merging to a lesser degree) would inappropriately merge source and sinks without sufficient evidence to do so. This would introduce large deletion events at the start / end of the assemblies. Refcatored code to require 20 bp of overlap in the head or tail nodes, as well as unit tested functions to support this. --- .../haplotypecaller/graphs/SeqGraph.java | 33 ++++++++++---- .../graphs/SharedVertexSequenceSplitter.java | 43 ++++++++++++++++--- .../SharedVertexSequenceSplitterUnitTest.java | 41 ++++++++++++++++++ 3 files changed, 102 insertions(+), 15 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index d08c2f211..4cc7aae2a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -63,7 +63,14 @@ import java.util.Set; */ public final class SeqGraph extends BaseGraph { private final static boolean PRINT_SIMPLIFY_GRAPHS = false; - private final static int MIN_SUFFIX_TO_MERGE_TAILS = 5; + + /** + * The minimum number of common bp from the prefix (head merging) or suffix (tail merging) + * required before we'll merge in such configurations. A large value here is critical to avoid + * merging inappropriate head or tail nodes, which introduces large insertion / deletion events + * as the merge operation creates a link among the non-linked sink / source vertices + */ + private final static int MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES = 10; /** * Construct an empty SeqGraph @@ -103,15 +110,15 @@ public final class SeqGraph extends BaseGraph { //logger.info("simplifyGraph iteration " + i); // iterate until we haven't don't anything useful didSomeWork = false; - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".dot"), 0); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".1.dot"), 0); didSomeWork |= new MergeDiamonds().transformUntilComplete(); didSomeWork |= new MergeTails().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".diamonds_and_tails.dot"), 0); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".2.diamonds_and_tails.dot"), 0); didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".split_suffix.dot"), 0); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".3.split_suffix.dot"), 0); didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".merge_suffix.dot"), 0); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".4.merge_suffix.dot"), 0); didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); didSomeWork |= zipLinearChains(); @@ -375,7 +382,10 @@ public final class SeqGraph extends BaseGraph { // actually do the merging, returning true if at least 1 base was successfully split final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, middles); - return splitter.splitAndUpdate(top, bottom, 1); + if (splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(1)) + return splitter.splitAndUpdate(top, bottom); + else + return false; } } @@ -408,7 +418,11 @@ public final class SeqGraph extends BaseGraph { if ( dontModifyGraphEvenIfPossible() ) return true; final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, tails); - return splitter.splitAndUpdate(top, null, MIN_SUFFIX_TO_MERGE_TAILS); + + if (splitter.meetsMinMergableSequenceForSuffix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES)) + return splitter.splitAndUpdate(top, null); + else + return false; } } @@ -492,7 +506,10 @@ public final class SeqGraph extends BaseGraph { if ( dontModifyGraphEvenIfPossible() ) return true; final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, incoming); - return splitter.splitAndUpdate(null, bottom, 1); + if (splitter.meetsMinMergableSequenceForPrefix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES)) + return splitter.splitAndUpdate(null, bottom); + else + return false; } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java index 9834653a6..ca7faa444 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java @@ -133,6 +133,14 @@ public class SharedVertexSequenceSplitter { suffixV = prefixAndSuffix.getSecond(); } + /** + * Given sequencing that are all equal, does this splitter make those into prefix or suffix nodes? + * @return true if we merge equal nodes into prefix nodes or suffix nodes + */ + protected static boolean prefersPrefixMerging() { + return true; + } + /** * Simple single-function interface to split and then update a graph * @@ -140,20 +148,41 @@ public class SharedVertexSequenceSplitter { * * @param top the top vertex, may be null * @param bottom the bottom vertex, may be null - * @param minCommonSequence the minimum prefix or suffix size necessary among the vertices to split up - * before we'll go ahead and actually do the splitting. Allows one to determine - * whether there's actually any useful splitting to do, as well as protect - * yourself against spurious splitting of nodes based on trivial amounts of overall * @return true if some useful splitting was done, false otherwise */ - public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom, final int minCommonSequence) { - if ( prefixV.length() < minCommonSequence && suffixV.length() < minCommonSequence ) - return false; + public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom) { split(); updateGraph(top, bottom); return true; } + /** + * Does either the common suffix or prefix have at least minCommonSequence bases in it? + * @param minCommonSequence a minimum length of the common sequence, must be >= 0 + * @return true if either suffix or prefix length >= minCommonSequence + */ + public boolean meetsMinMergableSequenceForEitherPrefixOrSuffix(final int minCommonSequence) { + return meetsMinMergableSequenceForPrefix(minCommonSequence) || meetsMinMergableSequenceForSuffix(minCommonSequence); + } + + /** + * Does the common prefix have at least minCommonSequence bases in it? + * @param minCommonSequence a minimum length of the common sequence, must be >= 0 + * @return true if prefix length >= minCommonSequence + */ + public boolean meetsMinMergableSequenceForPrefix(final int minCommonSequence) { + return prefixV.length() >= minCommonSequence; + } + + /** + * Does the common suffix have at least minCommonSequence bases in it? + * @param minCommonSequence a minimum length of the common sequence, must be >= 0 + * @return true if suffix length >= minCommonSequence + */ + public boolean meetsMinMergableSequenceForSuffix(final int minCommonSequence) { + return suffixV.length() >= minCommonSequence; + } + /** * Actually do the splitting up of the vertices * diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java index 77857c367..0930d497f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java @@ -250,4 +250,45 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { } } } + + @DataProvider(name = "MeetsMinSequenceData") + public Object[][] makeMeetsMinSequenceData() { + List tests = new ArrayList(); + + final boolean prefixBiased = SharedVertexSequenceSplitter.prefersPrefixMerging(); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 0, true, true}); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 1, prefixBiased, ! prefixBiased}); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 2, prefixBiased, ! prefixBiased}); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 3, false, false}); + tests.add(new Object[]{Arrays.asList("A", "AC"), 1, true, false}); + tests.add(new Object[]{Arrays.asList("A", "AC"), 2, false, false}); + tests.add(new Object[]{Arrays.asList("AT", "AC"), 1, true, false}); + tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 1, true, false}); + tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 2, true, false}); + tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 3, false, false}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 1, true, true}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 2, true, true}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 3, false, true}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 4, false, false}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MeetsMinSequenceData") + public void testSplitterCompleteCycle(final List mids, final int minSeqLength, final boolean prefixMeets, final boolean suffixMeets) { + final SeqGraph graph = new SeqGraph(); + + final SeqVertex top = new SeqVertex("AAAAAAAA"); + final SeqVertex bot = new SeqVertex("GGGGGGGG"); + final List v = new ArrayList(); + for ( final String s : mids ) { v.add(new SeqVertex(s)); } + graph.addVertices(v.toArray(new SeqVertex[]{})); + graph.addVertices(top, bot); + for ( final SeqVertex vi : v ) { graph.addEdge(top, vi); graph.addEdge(vi, bot); } + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); + Assert.assertEquals(splitter.meetsMinMergableSequenceForPrefix(minSeqLength), prefixMeets, "Prefix failed"); + Assert.assertEquals(splitter.meetsMinMergableSequenceForSuffix(minSeqLength), suffixMeets, "Suffix failed"); + Assert.assertEquals(splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(minSeqLength), suffixMeets || prefixMeets, "Either prefix or suffix failed"); + } }