diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java index d49b63672..07a6629d7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import java.io.Serializable; +import java.util.Collection; import java.util.Comparator; /** @@ -151,10 +152,39 @@ public class BaseEdge { * multiplicity is the sum * * @param edge the edge to add + * @return this */ - public void add(final BaseEdge edge) { + public BaseEdge add(final BaseEdge edge) { if ( edge == null ) throw new IllegalArgumentException("edge cannot be null"); this.multiplicity += edge.getMultiplicity(); this.isRef = this.isRef || edge.isRef(); + return this; + } + + /** + * Create a new BaseEdge with multiplicity and isRef that's an or of all edges + * + * @param edges a collection of edges to or their isRef values + * @param multiplicity our desired multiplicity + * @return a newly allocated BaseEdge + */ + public static BaseEdge orRef(final Collection edges, final int multiplicity) { + for ( final BaseEdge e : edges ) + if ( e.isRef() ) + return new BaseEdge(true, multiplicity); + return new BaseEdge(false, multiplicity); + } + + /** + * Return a new edge that the max of this and edge + * + * isRef is simply the or of this and edge + * multiplicity is the max + * + * @param edge the edge to max + */ + public BaseEdge max(final BaseEdge edge) { + if ( edge == null ) throw new IllegalArgumentException("edge cannot be null"); + return new BaseEdge(isRef() || edge.isRef(), Math.max(getMultiplicity(), edge.getMultiplicity())); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index c77ec4222..c3f371ec7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -48,10 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.jgrapht.EdgeFactory; import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.traverse.DepthFirstIterator; import java.io.File; import java.io.FileNotFoundException; @@ -219,6 +221,15 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph vertices) { + for ( final T v : vertices ) + addVertex(v); + } + /** * Convenience function to add multiple edges to the graph * @param start the first vertex to connect @@ -426,6 +446,23 @@ public class BaseGraph extends DefaultDirectedGraph verticesToRemove = new HashSet(vertexSet()); + final DepthFirstIterator dfi = new DepthFirstIterator(this, getReferenceSourceVertex()); + while ( dfi.hasNext() ) { + final T accessibleFromRefSource = dfi.next(); + // we also want to prune all sinks that aren't the reference sink + if ( ! isNonRefSink(accessibleFromRefSource) ) + verticesToRemove.remove(accessibleFromRefSource); + } + + removeAllVertices(verticesToRemove); + } + protected void pruneGraph( final int pruneFactor ) { final List edgesToRemove = new ArrayList(); for( final BaseEdge e : edgeSet() ) { @@ -525,7 +562,52 @@ public class BaseGraph extends DefaultDirectedGraph edges = getAllEdges(source, target); + return getSingletonEdge(getAllEdges(source, target)); + } + + /** + * Get the incoming edge of v. Requires that there be only one such edge or throws an error + * @param v our vertex + * @return the single incoming edge to v, or null if none exists + */ + public BaseEdge incomingEdgeOf(final T v) { + return getSingletonEdge(incomingEdgesOf(v)); + } + + /** + * Get the outgoing edge of v. Requires that there be only one such edge or throws an error + * @param v our vertex + * @return the single outgoing edge from v, or null if none exists + */ + public BaseEdge outgoingEdgeOf(final T v) { + return getSingletonEdge(outgoingEdgesOf(v)); + } + + /** + * Helper function that gets the a single edge from edges, null if edges is empty, or + * throws an error is edges has more than 1 element + * @param edges a set of edges + * @return a edge + */ + @Requires("edges != null") + private BaseEdge getSingletonEdge(final Collection edges) { + if ( edges.size() > 1 ) throw new IllegalArgumentException("Cannot get a single incoming edge for a vertex with multiple incoming edges " + edges); return edges.isEmpty() ? null : edges.iterator().next(); } + + /** + * Add edge between source -> target if none exists, or add e to an already existing one if present + * + * @param source source vertex + * @param target vertex + * @param e edge to add + */ + public void addOrUpdateEdge(final T source, final T target, final BaseEdge e) { + final BaseEdge prev = getEdge(source, target); + if ( prev != null ) { + prev.add(e); + } else { + addEdge(source, target, e); + } + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java index 93bd4f5c5..a6436f0a3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java @@ -69,10 +69,21 @@ public class BaseVertex { */ public BaseVertex(final byte[] sequence) { if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); - if ( sequence.length == 0 ) throw new IllegalArgumentException("Sequence cannot be empty"); this.sequence = sequence; } + /** + * Does this vertex have an empty sequence? + * + * That is, is it a dummy node that's only present for structural reasons but doesn't actually + * contribute to the sequence of the graph? + * + * @return true if sequence is empty, false otherwise + */ + public boolean isEmpty() { + return length() == 0; + } + /** * Get the length of this sequence * @return a positive integer >= 1 diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 7cf4cc8d3..6aec9c7a5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -173,7 +173,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR); graph = graph.errorCorrect(); if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), PRUNE_FACTOR); - graph.cleanNonRefPaths(); final SeqGraph seqGraph = toSeqGraph(graph); @@ -199,6 +198,14 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); seqGraph.simplifyGraph(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), PRUNE_FACTOR); + + // if we've assembled just to the reference, just leave now otherwise removePathsNotConnectedToRef + // might blow up because there's no reference source node + if ( seqGraph.vertexSet().size() == 1 ) + return seqGraph; + seqGraph.removePathsNotConnectedToRef(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), PRUNE_FACTOR); + return seqGraph; } @@ -274,6 +281,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } + graph.cleanNonRefPaths(); return graph; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java index 7546155a6..4adfe6612 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java @@ -391,4 +391,24 @@ class Path { cigar = initialCigar; } } + + /** + * Tests that this and other have the same score and vertices in the same order with the same seq + * @param other the other path to consider. Cannot be null + * @return true if this and path are equal, false otherwise + */ + public boolean equalScoreAndSequence(final Path other) { + if ( other == null ) throw new IllegalArgumentException("other cannot be null"); + + if ( getScore() != other.getScore() ) + return false; + final List mine = getVertices(); + final List yours = other.getVertices(); + if ( mine.size() == yours.size() ) { // hehehe + for ( int i = 0; i < mine.size(); i++ ) + if ( ! mine.get(i).seqEquals(yours.get(i)) ) + return false; + } + return true; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java index b855390c6..da24a06a4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java @@ -46,13 +46,9 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; -import org.apache.commons.lang.StringUtils; -import java.io.File; -import java.util.*; +import java.util.Set; /** * A graph that contains base sequence at each node @@ -61,6 +57,8 @@ import java.util.*; * @since 03/2013 */ public class SeqGraph extends BaseGraph { + private final static int MIN_SUFFIX_TO_MERGE_TAILS = 5; + /** * Construct an empty SeqGraph */ @@ -86,18 +84,38 @@ public class SeqGraph extends BaseGraph { * in any way the sequences implied by a complex enumeration of all paths through the graph. */ public void simplifyGraph() { + simplifyGraph(Integer.MAX_VALUE); + } + + protected void simplifyGraph(final int maxCycles) { + boolean didSomeWork; + int i = 0; + + // start off with one round of zipping of chains for performance reasons zipLinearChains(); - mergeBranchingNodes(); - zipLinearChains(); + do { + //logger.info("simplifyGraph iteration " + i); + // iterate until we haven't don't anything useful + didSomeWork = false; + //printGraph(new File("simplifyGraph." + i + ".dot"), 0); + didSomeWork |= new MergeDiamonds().transformUntilComplete(); + didSomeWork |= new MergeTails().transformUntilComplete(); + didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); + didSomeWork |= zipLinearChains(); + i++; + } while (didSomeWork && i < maxCycles); } /** * Zip up all of the simple linear chains present in this graph. */ - protected void zipLinearChains() { + protected boolean zipLinearChains() { + boolean foundOne = false; while( zipOneLinearChain() ) { // just keep going until zipOneLinearChain says its done + foundOne = true; } + return foundOne; } /** @@ -147,193 +165,168 @@ public class SeqGraph extends BaseGraph { } /** - * Perform as many branch simplifications and merging operations as possible on this graph, - * modifying it in place. + * Base class for transformation operations that need to iterate over proposed vertices, where + * each proposed vertex is a seed vertex for a potential transformation. + * + * transformUntilComplete will iteratively apply the tryToTransform function on each vertex in the graph + * until no vertex can be found that can be transformed. + * + * Note that in order to eventually terminate tryToTransform must transform the graph such that eventually + * no vertices are candidates for further transformations. */ - protected void mergeBranchingNodes() { - boolean foundNodesToMerge = true; - while( foundNodesToMerge ) { - foundNodesToMerge = false; + private abstract class VertexBasedTransformer { + /** + * For testing purposes we sometimes want to test that can be transformed capabilities are working + * without actually modifying the graph */ + private boolean dontModifyGraphEvenIfPossible = false; - for( final SeqVertex v : vertexSet() ) { - foundNodesToMerge = simplifyDiamondIfPossible(v); - if ( foundNodesToMerge ) - break; - } - } - } + public boolean dontModifyGraphEvenIfPossible() { return dontModifyGraphEvenIfPossible; } + public void setDontModifyGraphEvenIfPossible() { this.dontModifyGraphEvenIfPossible = true; } - /** - * A simple structure that looks like: - * - * v - * / | \ \ - * m1 m2 m3 ... mn - * \ | / / - * b - * - * Only returns true if all outgoing edges of v go to vertices that all only connect to - * a single bottom node, and that all middle nodes have only the single edge - * - * @param v the vertex to test if its the top of a diamond pattern - * @return true if v is the root of a diamond - */ - protected boolean isRootOfDiamond(final SeqVertex v) { - final Set ve = outgoingEdgesOf(v); - if ( ve.size() <= 1 ) - return false; + /** + * Merge until the graph has no vertices that are candidates for merging + */ + public boolean transformUntilComplete() { + boolean didAtLeastOneTranform = false; + boolean foundNodesToMerge = true; + while( foundNodesToMerge ) { + foundNodesToMerge = false; - SeqVertex bottom = null; - for ( final BaseEdge e : ve ) { - final SeqVertex mi = getEdgeTarget(e); - - // all nodes must have at least 1 connection - if ( outDegreeOf(mi) < 1 ) - return false; - - // can only have 1 incoming node, the root vertex - if ( inDegreeOf(mi) != 1 ) - return false; - - // make sure that all outgoing vertices of mi go only to the bottom node - for ( final SeqVertex mt : outgoingVerticesOf(mi) ) { - if ( bottom == null ) - bottom = mt; - else if ( ! bottom.equals(mt) ) - return false; - } - } - - // bottom has some connections coming in from other nodes, don't allow - if ( inDegreeOf(bottom) != ve.size() ) - return false; - - return true; - } - - /** - * Return the longest suffix of bases shared among all provided vertices - * - * For example, if the vertices have sequences AC, CC, and ATC, this would return - * a single C. However, for ACC and TCC this would return CC. And for AC and TG this - * would return null; - * - * @param middleVertices a non-empty set of vertices - * @return - */ - @Requires("!middleVertices.isEmpty()") - private byte[] commonSuffixOfEdgeTargets(final Set middleVertices) { - final String[] kmers = new String[middleVertices.size()]; - - int i = 0; - for ( final SeqVertex v : middleVertices ) { - kmers[i++] = (StringUtils.reverse(v.getSequenceString())); - } - - final String commonPrefix = StringUtils.getCommonPrefix(kmers); - return commonPrefix.equals("") ? null : StringUtils.reverse(commonPrefix).getBytes(); - } - - /** - * Get the node that is the bottom of a diamond configuration in the graph starting at top - * - * @param top - * @return - */ - @Requires("top != null") - @Ensures({"result != null"}) - private SeqVertex getDiamondBottom(final SeqVertex top) { - final BaseEdge topEdge = outgoingEdgesOf(top).iterator().next(); - final SeqVertex middle = getEdgeTarget(topEdge); - final BaseEdge middleEdge = outgoingEdgesOf(middle).iterator().next(); - return getEdgeTarget(middleEdge); - } - - /** - * Get the set of vertices that are in the middle of a diamond starting at top - * @param top - * @return - */ - @Requires("top != null") - @Ensures({"result != null", "!result.isEmpty()"}) - final Set getMiddleVertices(final SeqVertex top) { - final Set middles = new HashSet(); - for ( final BaseEdge topToMiddle : outgoingEdgesOf(top) ) { - middles.add(getEdgeTarget(topToMiddle)); - } - return middles; - } - - /** - * Simply a diamond configuration in the current graph starting at top, if possible - * - * If top is actually the top of a diamond that can be simplified (i.e., doesn't have any - * random edges or other structure that would cause problems with the transformation), then this code - * performs the following transformation on this graph (modifying it): - * - * A -> M1 -> B, A -> M2 -> B, A -> Mn -> B - * - * becomes - * - * A -> M1' -> B', A -> M2' -> B', A -> Mn' -> B' - * - * where B' is composed of the longest common suffix of all Mi nodes + B, and Mi' are each - * middle vertex without their shared suffix. - * - * @param top a proposed vertex in this graph that might start a diamond (but doesn't have to) - * @return true top actually starts a diamond and it could be simplified - */ - private boolean simplifyDiamondIfPossible(final SeqVertex top) { - if ( ! isRootOfDiamond(top) ) - return false; - - final SeqVertex diamondBottom = getDiamondBottom(top); - final Set middleVertices = getMiddleVertices(top); - final List verticesToRemove = new LinkedList(); - final List edgesToRemove = new LinkedList(); - - // all of the edges point to the same sink, so it's time to merge - final byte[] commonSuffix = commonSuffixOfEdgeTargets(middleVertices); - if ( commonSuffix != null ) { - final BaseEdge botToNewBottom = new BaseEdge(false, 0); - final BaseEdge elimMiddleNodeEdge = new BaseEdge(false, 0); - final SeqVertex newBottomV = new SeqVertex(commonSuffix); - addVertex(newBottomV); - - for ( final SeqVertex middle : middleVertices ) { - final SeqVertex withoutSuffix = middle.withoutSuffix(commonSuffix); - final BaseEdge topToMiddleEdge = getEdge(top, middle); - final BaseEdge middleToBottomE = getEdge(middle, diamondBottom); - - // clip out the two edges, since we'll be replacing them later - edgesToRemove.add(topToMiddleEdge); - edgesToRemove.add(middleToBottomE); - - if ( withoutSuffix != null ) { // this node is a deletion - addVertex(withoutSuffix); - // update edge from top -> middle to be top -> without suffix - addEdge(top, withoutSuffix, new BaseEdge(topToMiddleEdge)); - addEdge(withoutSuffix, newBottomV, new BaseEdge(middleToBottomE)); - } else { - // this middle node is == the common suffix, wo we're removing the edge - elimMiddleNodeEdge.add(topToMiddleEdge); + for( final SeqVertex v : vertexSet() ) { + foundNodesToMerge = tryToTransform(v); + if ( foundNodesToMerge ) { + didAtLeastOneTranform = true; + break; + } } - // include the ref and multi of mid -> bot in our edge from new bot -> bot - botToNewBottom.add(middleToBottomE); - verticesToRemove.add(middle); } - // add an edge from top to new bottom, because some middle nodes were removed - if ( elimMiddleNodeEdge.getMultiplicity() > 0 ) - addEdge(top, newBottomV, elimMiddleNodeEdge); + return didAtLeastOneTranform; + } - addEdge(newBottomV, diamondBottom, botToNewBottom); + /** + * Merge, if possible, seeded on the vertex v + * @param v the proposed seed vertex to merge + * @return true if some useful merging happened, false otherwise + */ + abstract boolean tryToTransform(final SeqVertex v); + } - removeAllEdges(edgesToRemove); - removeAllVertices(verticesToRemove); - return true; - } else { - return false; + /** + * Merge diamond configurations: + * + * Performance the transformation: + * + * { A -> x + S_i + y -> Z } + * + * goes to: + * + * { A -> x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. + */ + protected class MergeDiamonds extends VertexBasedTransformer { + @Override + protected boolean tryToTransform(final SeqVertex top) { + final Set middles = outgoingVerticesOf(top); + if ( middles.size() <= 1 ) + // we can only merge if there's at least two middle nodes + return false; + + SeqVertex bottom = null; + for ( final SeqVertex mi : middles ) { + // all nodes must have at least 1 connection + if ( outDegreeOf(mi) < 1 ) + return false; + + // can only have 1 incoming node, the root vertex + if ( inDegreeOf(mi) != 1 ) + return false; + + // make sure that all outgoing vertices of mi go only to the bottom node + for ( final SeqVertex mt : outgoingVerticesOf(mi) ) { + if ( bottom == null ) + bottom = mt; + else if ( ! bottom.equals(mt) ) + return false; + } + } + + // bottom has some connections coming in from other nodes, don't allow + if ( inDegreeOf(bottom) != middles.size() ) + return false; + + if ( dontModifyGraphEvenIfPossible() ) return true; + + // actually do the merging, returning true if at least 1 base was successfully split + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, middles); + return splitter.splitAndUpdate(top, bottom, 1); + } + } + + /** + * Merge tail configurations: + * + * Performs the transformation: + * + * { A -> x + S_i + y } + * + * goes to: + * + * { A -> x -> S_i -> y } + * + * for all nodes that match this configuration. + * + * Differs from the diamond transform in that no bottom node is required + */ + protected class MergeTails extends VertexBasedTransformer { + @Override + protected boolean tryToTransform(final SeqVertex top) { + final Set tails = outgoingVerticesOf(top); + if ( tails.size() <= 1 ) + return false; + + for ( final SeqVertex t : tails ) + if ( ! isSink(t) || inDegreeOf(t) > 1 ) + return false; + + if ( dontModifyGraphEvenIfPossible() ) return true; + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, tails); + return splitter.splitAndUpdate(top, null, MIN_SUFFIX_TO_MERGE_TAILS); + } + } + + /** + * Merge headless configurations: + * + * Performs the transformation: + * + * { x + S_i + y -> Z } + * + * goes to: + * + * { x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. + * + * Differs from the diamond transform in that no top node is required + */ + protected class MergeHeadlessIncomingSources extends VertexBasedTransformer { + @Override + boolean tryToTransform(final SeqVertex bottom) { + final Set incoming = incomingVerticesOf(bottom); + if ( incoming.size() <= 1 ) + return false; + + for ( final SeqVertex inc : incoming ) + if ( ! isSource(inc) || outDegreeOf(inc) > 1 ) + return false; + + if ( dontModifyGraphEvenIfPossible() ) return true; + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, incoming); + return splitter.splitAndUpdate(null, bottom, 1); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java index b45ac0c34..523312dcf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java @@ -150,4 +150,19 @@ public class SeqVertex extends BaseVertex { final int prefixSize = sequence.length - suffix.length; return prefixSize > 0 ? new SeqVertex(Arrays.copyOf(sequence, prefixSize)) : null; } + + /** + * Return a new SeqVertex derived from this one but not including prefix or suffix bases + * + * @param prefix the previx bases to remove + * @param suffix the suffix bases to remove from this vertex + * @return a newly allocated SeqVertex + */ + @Requires("Utils.endsWith(sequence, suffix)") + public SeqVertex withoutPrefixAndSuffix(final byte[] prefix, final byte[] suffix) { + final int start = prefix.length; + final int length = sequence.length - suffix.length - prefix.length; + final int stop = start + length; + return length > 0 ? new SeqVertex(Arrays.copyOfRange(sequence, start, stop)) : null; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java new file mode 100644 index 000000000..e0501da52 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java @@ -0,0 +1,341 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.*; + +/** + * Split a collection of middle nodes in a graph into their shared prefix and suffix values + * + * This code performs the following transformation. Suppose I have a set of vertices V, such + * that each vertex is composed of sequence such that + * + * Vi = prefix + seq_i + suffix + * + * where prefix and suffix are shared sequences across all vertices V + * + * This algorithm creates a new SeqGraph with the following configuration + * + * prefix -> has outgoing edges to all seq_i + * suffix -> has incoming edges for all seq_i + * + * There are a few special cases that must be handled. First, Vi could be simply + * == to the prefix or the suffix. These generate direct connections between + * the prefix and suffix nodes, and they are handled internally by the algorithm. + * + * Note that for convenience, we will always create newTop and newBottom nodes, but + * these may be empty node (i.e., they contain no sequence). That allows them to be + * trivially merged, if desired, when the graph is incorporated into an overall + * graph. + * + * The product of this operation is a SeqGraph that contains the split. There's a + * function to merge reconnect this graph into the graph that contains the middle nodes + * + * The process guarentees a few things about the output: + * + * -- Preserves the paths and weights among all vertices + * + * It produces a graph that has some unusual properties + * + * -- May add nodes with no sequence (isEmpty() == true) to preserve connectivity among the graph + * -- May introduce edges with no multiplicity to preserve paths through the graph + * + * The overall workflow of using this class is simple: + * + * find vertices V in graph that you want to split out + * s = new SharedVertexSequenceSplitter(graph, V) + * s.updateGraph(graph) + * + * to update the graph with the modifications created by this splitter + * + * User: depristo + * Date: 3/22/13 + * Time: 8:31 AM + */ +public class SharedVertexSequenceSplitter { + final private SeqGraph outer; + final protected SeqVertex prefixV, suffixV; + final protected Collection toSplits; + + // updated in split routine + protected SeqGraph splitGraph = null; + protected Collection newMiddles = null; + protected List edgesToRemove = null; + + /** + * Create a new graph that contains the vertices in toSplitsArg with their shared suffix and prefix + * sequences extracted out. + * + * @param graph the graph containing the vertices in toSplitsArg + * @param toSplitsArg a collection of vertices to split. Must be contained within graph, and have only connections + * from a single shared top and/or bottom node + */ + public SharedVertexSequenceSplitter(final SeqGraph graph, final Collection toSplitsArg) { + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( toSplitsArg == null ) throw new IllegalArgumentException("toSplitsArg cannot be null"); + if ( toSplitsArg.size() < 2 ) throw new IllegalArgumentException("Can only split at least 2 vertices but only got " + toSplitsArg); + if ( ! graph.vertexSet().containsAll(toSplitsArg) ) throw new IllegalArgumentException("graph doesn't contain all of the vertices to split"); + + this.outer = graph; + this.toSplits = toSplitsArg; + + // all of the edges point to the same sink, so it's time to merge + final Pair prefixAndSuffix = commonPrefixAndSuffixOfVertices(toSplits); + prefixV = prefixAndSuffix.getFirst(); + suffixV = prefixAndSuffix.getSecond(); + } + + /** + * Simple single-function interface to split and then update a graph + * + * @see #updateGraph(SeqVertex, SeqVertex) for a full description of top and bottom + * + * @param top the top vertex, may be null + * @param bottom the bottom vertex, may be null + * @param minCommonSequence the minimum prefix or suffix size necessary among the vertices to split up + * before we'll go ahead and actually do the splitting. Allows one to determine + * whether there's actually any useful splitting to do, as well as protect + * yourself against spurious splitting of nodes based on trivial amounts of overall + * @return true if some useful splitting was done, false otherwise + */ + public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom, final int minCommonSequence) { + if ( prefixV.length() < minCommonSequence && suffixV.length() < minCommonSequence ) + return false; + split(); + updateGraph(top, bottom); + return true; + } + + /** + * Actually do the splitting up of the vertices + * + * Must be called before calling updateGraph + */ + public void split() { + splitGraph = new SeqGraph(); + newMiddles = new LinkedList(); + edgesToRemove = new LinkedList(); + + splitGraph.addVertices(prefixV, suffixV); + + for ( final SeqVertex mid : toSplits ) { + final BaseEdge toMid = processEdgeToRemove(mid, outer.incomingEdgeOf(mid)); + final BaseEdge fromMid = processEdgeToRemove(mid, outer.outgoingEdgeOf(mid)); + + final SeqVertex remaining = mid.withoutPrefixAndSuffix(prefixV.getSequence(), suffixV.getSequence()); + if ( remaining != null ) { + // there's some sequence prefix + seq + suffix, so add the node and make edges + splitGraph.addVertex(remaining); + newMiddles.add(remaining); + // update edge from top -> middle to be top -> without suffix + splitGraph.addEdge(prefixV, remaining, toMid); + splitGraph.addEdge(remaining, suffixV, fromMid); + } else { + // prefix + suffix completely explain this node + splitGraph.addOrUpdateEdge(prefixV, suffixV, new BaseEdge(toMid).add(fromMid)); + } + } + } + + /** + * Update graph outer, replacing the previous middle vertices that were split out with the new + * graph structure of the split, linking this subgraph into the graph at top and bot (the + * vertex connecting the middle nodes and the vertex outgoing of all middle node) + * + * @param top an optional top node that must have outgoing edges to all split vertices. If null, this subgraph + * will be added without any incoming edges + * @param bot an optional bottom node that must have incoming edges to all split vertices. If null, this subgraph + * will be added without any outgoing edges to the rest of the graph + */ + public void updateGraph(final SeqVertex top, final SeqVertex bot) { + if ( ! outer.vertexSet().containsAll(toSplits) ) throw new IllegalArgumentException("graph doesn't contain all of the original vertices to split"); + if ( top == null && bot == null ) throw new IllegalArgumentException("Cannot update graph without at least one top or bot vertex, but both were null"); + if ( top != null && ! outer.containsVertex(top) ) throw new IllegalArgumentException("top " + top + " not found in graph " + outer); + if ( bot != null && ! outer.containsVertex(bot) ) throw new IllegalArgumentException("bot " + bot + " not found in graph " + outer); + if ( splitGraph == null ) throw new IllegalStateException("Cannot call updateGraph until split() has been called"); + + outer.removeAllVertices(toSplits); + outer.removeAllEdges(edgesToRemove); + + outer.addVertices(newMiddles); + + final boolean hasPrefixSuffixEdge = splitGraph.getEdge(prefixV, suffixV) != null; + final boolean hasOnlyPrefixSuffixEdges = hasPrefixSuffixEdge && splitGraph.outDegreeOf(prefixV) == 1; + final boolean needPrefixNode = ! prefixV.isEmpty() || (top == null && ! hasOnlyPrefixSuffixEdges); + final boolean needSuffixNode = ! suffixV.isEmpty() || (bot == null && ! hasOnlyPrefixSuffixEdges); + + // if prefix / suffix are needed, keep them + final SeqVertex topForConnect = needPrefixNode ? prefixV : top; + final SeqVertex botForConnect = needSuffixNode ? suffixV : bot; + + if ( needPrefixNode ) { + outer.addVertex(prefixV); + if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 0)); + } + + if ( needSuffixNode ) { + outer.addVertex(suffixV); + if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 0)); + } + + if ( topForConnect != null ) { + for ( final BaseEdge e : splitGraph.outgoingEdgesOf(prefixV) ) { + final SeqVertex target = splitGraph.getEdgeTarget(e); + + if ( target == suffixV ) { // going straight from prefix -> suffix + if ( botForConnect != null ) + outer.addEdge(topForConnect, botForConnect, e); + } else { + outer.addEdge(topForConnect, target, e); + } + } + } + + if ( botForConnect != null ) { + for ( final BaseEdge e : splitGraph.incomingEdgesOf(suffixV) ) { + outer.addEdge(splitGraph.getEdgeSource(e), botForConnect, e); + } + } + } + + /** + * Return the longest suffix of bases shared among all provided vertices + * + * For example, if the vertices have sequences AC, CC, and ATC, this would return + * a single C. However, for ACC and TCC this would return CC. And for AC and TG this + * would return null; + * + * @param middleVertices a non-empty set of vertices + * @return + */ + @Requires("!middleVertices.isEmpty()") + protected static Pair commonPrefixAndSuffixOfVertices(final Collection middleVertices) { + final List kmers = new ArrayList(middleVertices.size()); + + int min = Integer.MAX_VALUE; + for ( final SeqVertex v : middleVertices ) { + kmers.add(v.getSequence()); + min = Math.min(min, v.getSequence().length); + } + + final int prefixLen = compPrefixLen(kmers, min); + final int suffixLen = compSuffixLen(kmers, min - prefixLen); + + final byte[] kmer = kmers.get(0); + final byte[] prefix = Arrays.copyOfRange(kmer, 0, prefixLen); + final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length); + return new Pair(new SeqVertex(prefix), new SeqVertex(suffix)); + } + + /** + * Compute the maximum shared prefix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. length among all byte[] in listOfBytes + * @return the number of shared bytes common at the start of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compPrefixLen(final List listOfBytes, final int minLength) { + for ( int i = 0; i < minLength; i++ ) { + final byte b = listOfBytes.get(0)[i]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[i] ) + return i; + } + } + + return minLength; + } + + /** + * Compute the maximum shared suffix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. length among all byte[] in listOfBytes + * @return the number of shared bytes common at the end of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compSuffixLen(final List listOfBytes, final int minLength) { + for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) { + final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] ) + return suffixLen; + } + } + return minLength; + } + + /** + * Helper function that returns an edge that we should use for splitting + * + * If e is null, creates a new 0 multiplicity edge, set to ref is any edges to V are ref + * If e is not null, returns a new copy of e, and schedules e for removal + * + * @param e a non-null edge + * @return a non-null edge + */ + @Requires("v != null") + @Ensures("result != null") + private BaseEdge processEdgeToRemove(final SeqVertex v, final BaseEdge e) { + if ( e == null ) { + // there's no edge, so we return a newly allocated one and don't schedule e for removal + // the weight must be 0 to preserve sum through the diamond + return new BaseEdge(outer.isReferenceNode(v), 0); + } else { + // schedule edge for removal, and return a freshly allocated one for our graph to use + edgesToRemove.add(e); + return new BaseEdge(e); + } + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java index cd27c7183..8f682d474 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java @@ -68,9 +68,10 @@ public class BaseVertexUnitTest extends BaseTest { new BaseVertex((byte[])null); } - @Test(expectedExceptions = IllegalArgumentException.class) + @Test() public void testCreationEmptySeq() { - new BaseVertex(new byte[0]); + final BaseVertex v = new BaseVertex(new byte[0]); + Assert.assertTrue(v.isEmpty(), "Version with length == 0 should be empty"); } @Test diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java index 83a4f4c50..6b6826e45 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java @@ -51,7 +51,6 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -181,7 +180,9 @@ public class SeqGraphUnitTest extends BaseTest { @Test(dataProvider = "IsDiamondData", enabled = true) public void testIsDiamond(final SeqGraph graph, final SeqVertex v, final boolean isRootOfDiamond) { - Assert.assertEquals(graph.isRootOfDiamond(v), isRootOfDiamond); + final SeqGraph.MergeDiamonds merger = graph.new MergeDiamonds(); + merger.setDontModifyGraphEvenIfPossible(); + Assert.assertEquals(merger.tryToTransform(v), isRootOfDiamond); } @DataProvider(name = "MergingData") @@ -267,7 +268,7 @@ public class SeqGraphUnitTest extends BaseTest { tests.add(new Object[]{graph.clone(), expected.clone()}); } - { + { // all the nodes -> lots of merging and motion of nodes final SeqGraph all = new SeqGraph(); all.addVertices(pre1, pre2, top, middle1, middle2, bottom, tail1, tail2); all.addEdges(pre1, top, middle1, bottom, tail1); @@ -277,9 +278,13 @@ public class SeqGraphUnitTest extends BaseTest { final SeqVertex newMiddle1 = new SeqVertex("G"); final SeqVertex newMiddle2 = new SeqVertex("T"); final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString()); - expected.addVertices(pre1, pre2, top, newMiddle1, newMiddle2, newBottom, tail1, tail2); - expected.addEdges(pre1, top, newMiddle1, newBottom, tail1); - expected.addEdges(pre2, top, newMiddle2, newBottom, tail2); + final SeqVertex newTop = new SeqVertex("A"); + final SeqVertex newTopDown1 = new SeqVertex("G"); + final SeqVertex newTopDown2 = new SeqVertex("C"); + final SeqVertex newTopBottomMerged = new SeqVertex("TA"); + expected.addVertices(newTop, newTopDown1, newTopDown2, newTopBottomMerged, newMiddle1, newMiddle2, newBottom, tail1, tail2); + expected.addEdges(newTop, newTopDown1, newTopBottomMerged, newMiddle1, newBottom, tail1); + expected.addEdges(newTop, newTopDown2, newTopBottomMerged, newMiddle2, newBottom, tail2); tests.add(new Object[]{all.clone(), expected.clone()}); } @@ -309,7 +314,12 @@ public class SeqGraphUnitTest extends BaseTest { @Test(dataProvider = "MergingData", enabled = true) public void testMerging(final SeqGraph graph, final SeqGraph expected) { final SeqGraph merged = (SeqGraph)graph.clone(); - merged.simplifyGraph(); + merged.simplifyGraph(1); +// if ( ! SeqGraph.graphEquals(merged, expected) ) { +// graph.printGraph(new File("graph.dot"), 0); +// merged.printGraph(new File("merged.dot"), 0); +// expected.printGraph(new File("expected.dot"), 0); +// } Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); } @@ -332,13 +342,9 @@ public class SeqGraphUnitTest extends BaseTest { graph.addEdge(mid1, bot, new BaseEdge(true, 1)); final SeqGraph expected = new SeqGraph(); - expected.addVertices(top, mid1, bot); - expected.addEdge(top, mid1, new BaseEdge(true, 2)); - expected.addEdge(mid1, bot, new BaseEdge(true, 2)); - + expected.addVertex(new SeqVertex("AACTC")); final SeqGraph actual = ((SeqGraph)graph.clone()); - actual.mergeBranchingNodes(); - - Assert.assertTrue(BaseGraph.graphEquals(actual, expected)); + actual.simplifyGraph(); + Assert.assertTrue(BaseGraph.graphEquals(actual, expected), "Wrong merging result after complete merging"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java new file mode 100644 index 000000000..52ab36064 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java @@ -0,0 +1,253 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class SharedVertexSequenceSplitterUnitTest extends BaseTest { + private final static boolean PRINT_GRAPHS = false; + + @DataProvider(name = "PrefixSuffixData") + public Object[][] makePrefixSuffixData() { + List tests = new ArrayList(); + + tests.add(new Object[]{Arrays.asList("A", "C"), 0, 0}); + tests.add(new Object[]{Arrays.asList("C", "C"), 1, 0}); + tests.add(new Object[]{Arrays.asList("ACT", "AGT"), 1, 1}); + tests.add(new Object[]{Arrays.asList("ACCT", "AGT"), 1, 1}); + tests.add(new Object[]{Arrays.asList("ACT", "ACT"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACT"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACTG"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACTGA"), 3, 1}); + tests.add(new Object[]{Arrays.asList("GCTGA", "ACTGA"), 0, 4}); + + tests.add(new Object[]{Arrays.asList("A", "C", "A"), 0, 0}); + tests.add(new Object[]{Arrays.asList("A", "A", "A"), 1, 0}); + tests.add(new Object[]{Arrays.asList("A", "AA", "A"), 1, 0}); + tests.add(new Object[]{Arrays.asList("A", "ACA", "A"), 1, 0}); + tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACT"), 2, 1}); + tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACGT"), 2, 1}); + tests.add(new Object[]{Arrays.asList("AAAT", "AAA", "CAAA"), 0, 0}); + tests.add(new Object[]{Arrays.asList("AACTTT", "AAGTTT", "AAGCTTT"), 2, 3}); + tests.add(new Object[]{Arrays.asList("AAA", "AAA", "CAAA"), 0, 3}); + tests.add(new Object[]{Arrays.asList("AAA", "AAA", "AAA"), 3, 0}); + + tests.add(new Object[]{Arrays.asList("AC", "ACA", "AC"), 2, 0}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PrefixSuffixData") + public void testPrefixSuffix(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final List bytes = new ArrayList(); + int min = Integer.MAX_VALUE; + for ( final String s : strings ) { + bytes.add(s.getBytes()); + min = Math.min(min, s.length()); + } + + final int actualPrefixLen = SharedVertexSequenceSplitter.compPrefixLen(bytes, min); + Assert.assertEquals(actualPrefixLen, expectedPrefixLen, "Failed prefix test"); + + final int actualSuffixLen = SharedVertexSequenceSplitter.compSuffixLen(bytes, min - actualPrefixLen); + Assert.assertEquals(actualSuffixLen, expectedSuffixLen, "Failed suffix test"); + } + + @Test(dataProvider = "PrefixSuffixData") + public void testPrefixSuffixVertices(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final List v = new ArrayList(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + + final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen); + final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen); + + final Pair result = SharedVertexSequenceSplitter.commonPrefixAndSuffixOfVertices(v); + Assert.assertEquals(result.getFirst().getSequenceString(), expectedPrefix, "Failed suffix test"); + Assert.assertEquals(result.getSecond().getSequenceString(), expectedSuffix, "Failed suffix test"); + + Assert.assertEquals(result.getFirst().isEmpty(), expectedPrefix.isEmpty()); + Assert.assertEquals(result.getSecond().isEmpty(), expectedSuffix.isEmpty()); + } + + @Test(dataProvider = "PrefixSuffixData") + public void testSplitter(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final SeqGraph graph = new SeqGraph(); + + final List v = new ArrayList(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + + graph.addVertices(v.toArray(new SeqVertex[]{})); + + final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen); + final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen); + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); + splitter.split(); +// splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".dot"), 0); + + Assert.assertEquals(splitter.prefixV.getSequenceString(), expectedPrefix); + Assert.assertEquals(splitter.suffixV.getSequenceString(), expectedSuffix); + + Assert.assertTrue(splitter.splitGraph.outDegreeOf(splitter.prefixV) <= strings.size()); + Assert.assertEquals(splitter.splitGraph.inDegreeOf(splitter.prefixV), 0); + + Assert.assertTrue(splitter.splitGraph.inDegreeOf(splitter.suffixV) <= strings.size()); + Assert.assertEquals(splitter.splitGraph.outDegreeOf(splitter.suffixV), 0); + + for ( final SeqVertex mid : splitter.newMiddles ) { + Assert.assertNotNull(splitter.splitGraph.getEdge(splitter.prefixV, mid)); + Assert.assertNotNull(splitter.splitGraph.getEdge(mid, splitter.suffixV)); + } + } + + @DataProvider(name = "CompleteCycleData") + public Object[][] makeCompleteCycleData() { + List tests = new ArrayList(); + + for ( final boolean hasTop : Arrays.asList(true, false) ) { + for ( final boolean hasBot : Arrays.asList(true, false) ) { + if ( ! hasTop && ! hasBot ) continue; + tests.add(new Object[]{Arrays.asList("A", "A"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "AC"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "CA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AC", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AT", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATAA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATAACA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAA", "AAA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "AAA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "CCCAAA"), hasTop, hasBot}); + + tests.add(new Object[]{Arrays.asList("A", "A", "A"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "A", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "C", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AC", "C", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CA", "C", "C"), hasTop, hasBot}); + // all merged + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGA"), hasTop, hasBot}); + // prefix and suffix + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "ACA"), hasTop, hasBot}); + // 2 -> prefix, leave C + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGAC"), hasTop, hasBot}); + // 2 -> prefix, leave CCC + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGACCC"), hasTop, hasBot}); + // 2 -> suffix, leave A/T + tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AAGA"), hasTop, hasBot}); + // 2 -> suffix, leave T, delete 1 + tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AGA"), hasTop, hasBot}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CompleteCycleData") + public void testSplitterCompleteCycle(final List strings, final boolean hasTop, final boolean hasBot) { + final SeqGraph graph = new SeqGraph(); + + int edgeWeight = 1; + final SeqVertex top = hasTop ? new SeqVertex("AAAAAAAA") : null; + final SeqVertex bot = hasBot ? new SeqVertex("GGGGGGGG") : null; + final List v = new ArrayList(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + graph.addVertices(v.toArray(new SeqVertex[]{})); + final SeqVertex first = v.get(0); + + if ( hasTop ) { + graph.addVertex(top); + for ( final SeqVertex vi : v ) + graph.addEdge(top, vi, new BaseEdge(vi == first, edgeWeight++)); + } + + if ( hasBot ) { + graph.addVertex(bot); + for ( final SeqVertex vi : v ) + graph.addEdge(vi, bot, new BaseEdge(vi == first, edgeWeight++)); + } + + final Set haplotypes = new HashSet(); + final List> originalPaths = new KBestPaths().getKBestPaths((SeqGraph)graph.clone()); + for ( final Path path : originalPaths ) + haplotypes.add(new String(path.getBases())); + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); + splitter.split(); + if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".original.dot"), 0); + if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".split.dot"), 0); + splitter.updateGraph(top, bot); + if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0); + + final List> splitPaths = new KBestPaths().getKBestPaths(graph); + for ( final Path path : splitPaths ) { + final String h = new String(path.getBases()); + Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); + } + + if ( splitPaths.size() == originalPaths.size() ) { + for ( int i = 0; i < originalPaths.size(); i++ ) { + Assert.assertTrue(splitPaths.get(i).equalScoreAndSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i)); + } + } + } +}