From 4d389a823467e355d502ee77056ed4434a04e6e3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 31 Mar 2013 16:57:36 -0400 Subject: [PATCH] Optimizations for HC infrastructure -- outgoingVerticesOf and incomingVerticesOf return a list not a set now, as the corresponding values must be unique since our super directed graph doesn't allow multiple edges between vertices -- Make DeBruijnGraph, SeqGraph, SeqVertex, and DeBruijnVertex all final -- Cache HashCode calculation in BaseVertex -- Better docs before the pruneGraph call --- .../gatk/walkers/haplotypecaller/DeBruijnAssembler.java | 8 ++++++++ .../gatk/walkers/haplotypecaller/graphs/BaseVertex.java | 7 +++++-- .../walkers/haplotypecaller/graphs/DeBruijnGraph.java | 2 +- .../walkers/haplotypecaller/graphs/DeBruijnVertex.java | 2 +- .../gatk/walkers/haplotypecaller/graphs/SeqVertex.java | 2 +- .../haplotypecaller/graphs/SharedSequenceMerger.java | 2 +- .../walkers/haplotypecaller/graphs/BaseGraphUnitTest.java | 6 ++++-- .../walkers/haplotypecaller/graphs/SeqGraphUnitTest.java | 2 +- 8 files changed, 22 insertions(+), 9 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 1fd2b9c00..5d8113212 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -185,6 +185,14 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); + // TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm + // TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect + // TODO -- to anything from one that's actuall has good support along the chain but just happens + // TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately + // TODO -- the pruning algorithm really should be an error correction algorithm that knows more + // TODO -- about the structure of the data and can differeniate between an infrequent path but + // TODO -- without evidence against it (such as occurs when a region is hard to get any reads through) + // TODO -- from a error with lots of weight going along another similar path // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive seqGraph.zipLinearChains(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java index f50b4a155..65643a2cc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java @@ -58,6 +58,7 @@ import java.util.Arrays; */ public class BaseVertex { final byte[] sequence; + int cachedHashCode = -1; /** * Create a new sequence vertex with sequence @@ -128,8 +129,10 @@ public class BaseVertex { */ @Override public int hashCode() { - // TODO -- optimization, could compute upfront once and cached in debruijn graph - return Arrays.hashCode(sequence); + if ( cachedHashCode == -1 ) { + cachedHashCode = Arrays.hashCode(sequence); + } + return cachedHashCode; } @Override diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java index 109598029..66085fcad 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java @@ -59,7 +59,7 @@ import java.util.Map; * User: rpoplin * Date: 2/6/13 */ -public class DeBruijnGraph extends BaseGraph { +public final class DeBruijnGraph extends BaseGraph { /** * Create an empty DeBruijnGraph with default kmer size */ diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java index 4d9441efe..c240949d9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java @@ -54,7 +54,7 @@ import com.google.java.contract.Ensures; * User: ebanks, mdepristo * Date: Mar 23, 2011 */ -public class DeBruijnVertex extends BaseVertex { +public final class DeBruijnVertex extends BaseVertex { private final static byte[][] sufficesAsByteArray = new byte[256][]; static { for ( int i = 0; i < sufficesAsByteArray.length; i++ ) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java index cfc2abfdc..f192b54aa 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java @@ -70,7 +70,7 @@ import java.util.Arrays; * @author: depristo * @since 03/2013 */ -public class SeqVertex extends BaseVertex { +public final class SeqVertex extends BaseVertex { private static int idCounter = 0; public final int id; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java index 1c53f2332..28734e505 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java @@ -75,7 +75,7 @@ public class SharedSequenceMerger { if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex " + v); - final Set prevs = graph.incomingVerticesOf(v); + final List prevs = graph.incomingVerticesOf(v); if ( ! canMerge(graph, v, prevs) ) return false; else { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java index 9737f72f5..c829488ba 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java @@ -241,9 +241,11 @@ public class BaseGraphUnitTest extends BaseTest { graph.printGraph(tmp, 10); } - private void assertVertexSetEquals(final Set actual, final SeqVertex ... expected) { + private void assertVertexSetEquals(final Collection actual, final SeqVertex ... expected) { + final Set actualSet = new HashSet(actual); + Assert.assertEquals(actualSet.size(), actual.size(), "Duplicate elements found in vertex list"); final Set expectedSet = expected == null ? Collections.emptySet() : new HashSet(Arrays.asList(expected)); - Assert.assertEquals(actual, expectedSet); + Assert.assertEquals(actualSet, expectedSet); } @Test(enabled = true) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index 698b83199..ca43ced69 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -58,7 +58,7 @@ import java.util.LinkedList; import java.util.List; public class SeqGraphUnitTest extends BaseTest { - private final static boolean DEBUG = true; + private final static boolean DEBUG = false; private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { public byte[] sequence;