HaplotypeCaller infrastructure cleanup and unit testing
-- UnitTest for isRootOfDiamond along with key bugfix detected while testing -- Fix up the equals methods in BaseEdge. Now called hasSameSourceAndTarget and seqEquals. A much more meaningful naming -- Generalize graphEquals to use seqEquals, so it works equally well with Debruijn and SeqGraphs -- Add BaseVertex method called seqEquals that returns true if two BaseVertex objects have the same sequence -- Reorganize SeqGraph mergeNodes into a single master function that does zipping, branch merging, and zipping again, rather than having this done in the DeBruijnAssembler itself -- Massive expansion of the SeqGraph unit tests. We now really test out the zipping and branch merging code. -- Near final cleanup of the current codebase -- DeBruijnVertex cleanup and optimizations. Since kmer graphs don't allow sequences longer than the kmer size, the suffix is always a byte, not a byte[]. Optimize the code to make use of this constraint
This commit is contained in:
parent
2e36f15861
commit
5226b24a11
|
|
@ -116,14 +116,21 @@ public class BaseEdge {
|
|||
this.isRef = isRef;
|
||||
}
|
||||
|
||||
// For use when comparing edges pulled from the same graph
|
||||
public <T extends BaseVertex> boolean equals( final BaseGraph<T> graph, final BaseEdge edge ) {
|
||||
/**
|
||||
* Does this and edge have the same source and target vertices in graph?
|
||||
*
|
||||
* @param graph the graph containing both this and edge
|
||||
* @param edge our comparator edge
|
||||
* @param <T>
|
||||
* @return true if we have the same source and target vertices
|
||||
*/
|
||||
public <T extends BaseVertex> boolean hasSameSourceAndTarget(final BaseGraph<T> graph, final BaseEdge edge) {
|
||||
return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge)));
|
||||
}
|
||||
|
||||
// For use when comparing edges across graphs!
|
||||
public <T extends BaseVertex> boolean equals( final BaseGraph<T> graph, final BaseEdge edge, final BaseGraph<T> graph2 ) {
|
||||
return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge)));
|
||||
public <T extends BaseVertex> boolean seqEquals( final BaseGraph<T> graph, final BaseEdge edge, final BaseGraph<T> graph2 ) {
|
||||
return (graph.getEdgeSource(this).seqEquals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).seqEquals(graph2.getEdgeTarget(edge)));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -310,6 +310,19 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
|
|||
addVertex(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function to add multiple edges to the graph
|
||||
* @param start the first vertex to connect
|
||||
* @param remaining all additional vertices to connect
|
||||
*/
|
||||
public void addEdges(final T start, final T ... remaining) {
|
||||
T prev = start;
|
||||
for ( final T next : remaining ) {
|
||||
addEdge(prev, next);
|
||||
prev = next;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of vertices connected by outgoing edges of V
|
||||
* @param v a non-null vertex
|
||||
|
|
@ -451,28 +464,50 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
|
|||
}
|
||||
}
|
||||
|
||||
// for ( final T remove : toRemove )
|
||||
// logger.info("Cleaning up nodes not attached to any reference node: " + remove.toString());
|
||||
|
||||
removeAllVertices(toRemove);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Semi-lenient comparison of two graphs, returning true if g1 and g2 have similar structure
|
||||
*
|
||||
* By similar this means that both graphs have the same number of vertices, where each vertex can find
|
||||
* a vertex in the other graph that's seqEqual to it. A similar constraint applies to the edges,
|
||||
* where all edges in g1 must have a corresponding edge in g2 where both source and target vertices are
|
||||
* seqEqual
|
||||
*
|
||||
* @param g1 the first graph to compare
|
||||
* @param g2 the second graph to compare
|
||||
* @param <T> the type of the nodes in those graphs
|
||||
* @return true if g1 and g2 are equal
|
||||
*/
|
||||
public static <T extends BaseVertex> boolean graphEquals(final BaseGraph<T> g1, BaseGraph<T> g2) {
|
||||
if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) {
|
||||
final Set<T> vertices1 = g1.vertexSet();
|
||||
final Set<T> vertices2 = g2.vertexSet();
|
||||
final Set<BaseEdge> edges1 = g1.edgeSet();
|
||||
final Set<BaseEdge> edges2 = g2.edgeSet();
|
||||
|
||||
if ( vertices1.size() != vertices2.size() || edges1.size() != edges2.size() )
|
||||
return false;
|
||||
|
||||
for ( final T v1 : vertices1 ) {
|
||||
boolean found = false;
|
||||
for ( final T v2 : vertices2 )
|
||||
found = found || v1.getSequenceString().equals(v2.getSequenceString());
|
||||
if ( ! found ) return false;
|
||||
}
|
||||
for( BaseEdge e1 : g1.edgeSet() ) {
|
||||
|
||||
for( final BaseEdge e1 : g1.edgeSet() ) {
|
||||
boolean found = false;
|
||||
for( BaseEdge e2 : g2.edgeSet() ) {
|
||||
if( e1.equals(g1, e2, g2) ) { found = true; break; }
|
||||
if( e1.seqEquals(g1, e2, g2) ) { found = true; break; }
|
||||
}
|
||||
if( !found ) { return false; }
|
||||
}
|
||||
for( BaseEdge e2 : g2.edgeSet() ) {
|
||||
for( final BaseEdge e2 : g2.edgeSet() ) {
|
||||
boolean found = false;
|
||||
for( BaseEdge e1 : g1.edgeSet() ) {
|
||||
if( e2.equals(g2, e1, g1) ) { found = true; break; }
|
||||
if( e2.seqEquals(g2, e1, g1) ) { found = true; break; }
|
||||
}
|
||||
if( !found ) { return false; }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -99,6 +99,16 @@ public class BaseVertex {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are b and this equal according to their base sequences?
|
||||
*
|
||||
* @param b the vertex to compare ourselves to
|
||||
* @return true if b and this have the same sequence, regardless of other attributes that might differentiate them
|
||||
*/
|
||||
public boolean seqEquals(final BaseVertex b) {
|
||||
return Arrays.equals(this.getSequence(), b.getSequence());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect
|
||||
return Arrays.hashCode(sequence);
|
||||
|
|
|
|||
|
|
@ -194,15 +194,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), PRUNE_FACTOR);
|
||||
seqGraph.pruneGraph(PRUNE_FACTOR);
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR);
|
||||
seqGraph.mergeNodes();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.preclean.dot"), PRUNE_FACTOR);
|
||||
seqGraph.removeVerticesNotConnectedToRef();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), PRUNE_FACTOR);
|
||||
seqGraph.mergeBranchingNodes();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.simplified.dot"), PRUNE_FACTOR);
|
||||
seqGraph.mergeNodes();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.6.simplified.merged.dot"), PRUNE_FACTOR);
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR);
|
||||
seqGraph.simplifyGraph();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), PRUNE_FACTOR);
|
||||
return seqGraph;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -47,17 +47,20 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Invariant;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* simple node class for storing kmer sequences
|
||||
*
|
||||
* User: ebanks
|
||||
* User: ebanks, mdepristo
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
public class DeBruijnVertex extends BaseVertex {
|
||||
private final static byte[][] sufficesAsByteArray = new byte[256][];
|
||||
static {
|
||||
for ( int i = 0; i < sufficesAsByteArray.length; i++ )
|
||||
sufficesAsByteArray[i] = new byte[]{(byte)(i & 0xFF)};
|
||||
}
|
||||
|
||||
public DeBruijnVertex( final byte[] sequence ) {
|
||||
super(sequence);
|
||||
}
|
||||
|
|
@ -85,17 +88,38 @@ public class DeBruijnVertex extends BaseVertex {
|
|||
*/
|
||||
@Ensures({"result != null", "result.length() >= 1"})
|
||||
public String getSuffixString() {
|
||||
return new String(getSuffix());
|
||||
return new String(getSuffixAsArray());
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
// TODO this could be replaced with byte as the suffix is guarenteed to be exactly 1 base
|
||||
public byte[] getSuffix() {
|
||||
return Arrays.copyOfRange( sequence, getKmer() - 1, sequence.length );
|
||||
/**
|
||||
* Get the suffix byte of this DeBruijnVertex
|
||||
*
|
||||
* The suffix byte is simply the last byte of the kmer sequence, so if this is holding sequence ACT
|
||||
* getSuffix would return T
|
||||
*
|
||||
* @return a byte
|
||||
*/
|
||||
public byte getSuffix() {
|
||||
return sequence[getKmer() - 1];
|
||||
}
|
||||
|
||||
/**
|
||||
* Optimized version that returns a byte[] for the single byte suffix of this vertex without allocating memory.
|
||||
*
|
||||
* Should not be modified
|
||||
*
|
||||
* @return a byte[] that contains 1 byte == getSuffix()
|
||||
*/
|
||||
@Ensures({"result != null", "result.length == 1", "result[0] == getSuffix()"})
|
||||
private byte[] getSuffixAsArray() {
|
||||
return sufficesAsByteArray[getSuffix()];
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public byte[] getAdditionalSequence(boolean source) {
|
||||
return source ? super.getAdditionalSequence(source) : getSuffix();
|
||||
return source ? super.getAdditionalSequence(source) : getSuffixAsArray();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -254,7 +254,7 @@ class Path<T extends BaseVertex> {
|
|||
final BubbleStateMachine<T> bsm = new BubbleStateMachine<T>(cigar);
|
||||
|
||||
for( final BaseEdge e : getEdges() ) {
|
||||
if( e.equals(graph, edgesInOrder.getFirst()) ) {
|
||||
if ( e.hasSameSourceAndTarget(graph, edgesInOrder.getFirst()) ) {
|
||||
advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null );
|
||||
}
|
||||
advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e );
|
||||
|
|
|
|||
|
|
@ -46,6 +46,8 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
|
@ -77,67 +79,83 @@ public class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
super(kmer);
|
||||
}
|
||||
|
||||
protected void mergeNodes() {
|
||||
/**
|
||||
* Simplify this graph, merging vertices together and restructuring the graph in an
|
||||
* effort to minimize the number of overall vertices in the graph without changing
|
||||
* in any way the sequences implied by a complex enumeration of all paths through the graph.
|
||||
*/
|
||||
public void simplifyGraph() {
|
||||
zipLinearChains();
|
||||
mergeBranchingNodes();
|
||||
zipLinearChains();
|
||||
}
|
||||
|
||||
/**
|
||||
* Zip up all of the simple linear chains present in this graph.
|
||||
*/
|
||||
protected void zipLinearChains() {
|
||||
boolean foundNodesToMerge = true;
|
||||
while( foundNodesToMerge ) {
|
||||
foundNodesToMerge = false;
|
||||
|
||||
for( final BaseEdge e : edgeSet() ) {
|
||||
final SeqVertex outgoingVertex = getEdgeTarget(e);
|
||||
final SeqVertex incomingVertex = getEdgeSource(e);
|
||||
if( !outgoingVertex.equals(incomingVertex)
|
||||
&& outDegreeOf(incomingVertex) == 1 && inDegreeOf(outgoingVertex) == 1
|
||||
&& isReferenceNode(incomingVertex) == isReferenceNode(outgoingVertex) ) {
|
||||
|
||||
final Set<BaseEdge> outEdges = outgoingEdgesOf(outgoingVertex);
|
||||
final Set<BaseEdge> inEdges = incomingEdgesOf(incomingVertex);
|
||||
if( inEdges.size() == 1 && outEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
} else if( inEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
} else if( outEdges.size() == 1 ) {
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
}
|
||||
|
||||
final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) );
|
||||
addVertex(addedVertex);
|
||||
for( final BaseEdge edge : outEdges ) {
|
||||
addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge.isRef(), edge.getMultiplicity()));
|
||||
}
|
||||
for( final BaseEdge edge : inEdges ) {
|
||||
addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge.isRef(), edge.getMultiplicity()));
|
||||
}
|
||||
|
||||
removeVertex(incomingVertex);
|
||||
removeVertex(outgoingVertex);
|
||||
foundNodesToMerge = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
while( zipOneLinearChain() ) {
|
||||
// just keep going until zipOneLinearChain says it's done
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// X -> ABC -> Y
|
||||
// -> aBC -> Y
|
||||
//
|
||||
// becomes
|
||||
//
|
||||
// X -> A -> BCY
|
||||
// -> a -> BCY
|
||||
//
|
||||
public void mergeBranchingNodes() {
|
||||
/**
|
||||
* Merge together two vertices in the graph v1 -> v2 into a single vertex v' containing v1 + v2 sequence
|
||||
*
|
||||
* Only works on vertices where v1's only outgoing edge is to v2 and v2's only incoming edge is from v1.
|
||||
*
|
||||
* If such a pair of vertices is found, they are merged and the graph is updated. Otherwise nothing is changed.
|
||||
*
|
||||
* @return true if any such pair of vertices could be found, false otherwise
|
||||
*/
|
||||
protected boolean zipOneLinearChain() {
|
||||
for( final BaseEdge e : edgeSet() ) {
|
||||
final SeqVertex outgoingVertex = getEdgeTarget(e);
|
||||
final SeqVertex incomingVertex = getEdgeSource(e);
|
||||
if( !outgoingVertex.equals(incomingVertex)
|
||||
&& outDegreeOf(incomingVertex) == 1 && inDegreeOf(outgoingVertex) == 1
|
||||
&& isReferenceNode(incomingVertex) == isReferenceNode(outgoingVertex) ) {
|
||||
|
||||
final Set<BaseEdge> outEdges = outgoingEdgesOf(outgoingVertex);
|
||||
final Set<BaseEdge> inEdges = incomingEdgesOf(incomingVertex);
|
||||
if( inEdges.size() == 1 && outEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
} else if( inEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
} else if( outEdges.size() == 1 ) {
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
}
|
||||
|
||||
final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) );
|
||||
addVertex(addedVertex);
|
||||
for( final BaseEdge edge : outEdges ) {
|
||||
addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge.isRef(), edge.getMultiplicity()));
|
||||
}
|
||||
for( final BaseEdge edge : inEdges ) {
|
||||
addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge.isRef(), edge.getMultiplicity()));
|
||||
}
|
||||
|
||||
removeVertex(incomingVertex);
|
||||
removeVertex(outgoingVertex);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform as many branch simplifications and merging operations as possible on this graph,
|
||||
* modifying it in place.
|
||||
*/
|
||||
private void mergeBranchingNodes() {
|
||||
boolean foundNodesToMerge = true;
|
||||
while( foundNodesToMerge ) {
|
||||
foundNodesToMerge = false;
|
||||
|
||||
for( final SeqVertex v : vertexSet() ) {
|
||||
foundNodesToMerge = simplifyDiamond(v);
|
||||
foundNodesToMerge = simplifyDiamondIfPossible(v);
|
||||
if ( foundNodesToMerge )
|
||||
break;
|
||||
}
|
||||
|
|
@ -153,8 +171,11 @@ public class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
* \ | / /
|
||||
* b
|
||||
*
|
||||
* @param v
|
||||
* @return
|
||||
* Only returns true if all outgoing edges of v go to vertices that all only connect to
|
||||
* a single bottom node, and that all middle nodes have only the single edge
|
||||
*
|
||||
* @param v the vertex to test whether it is the top of a diamond pattern
|
||||
* @return true if v is the root of a diamond
|
||||
*/
|
||||
protected boolean isRootOfDiamond(final SeqVertex v) {
|
||||
final Set<BaseEdge> ve = outgoingEdgesOf(v);
|
||||
|
|
@ -173,6 +194,7 @@ public class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
if ( inDegreeOf(mi) != 1 )
|
||||
return false;
|
||||
|
||||
// make sure that all outgoing vertices of mi go only to the bottom node
|
||||
for ( final SeqVertex mt : outgoingVerticesOf(mi) ) {
|
||||
if ( bottom == null )
|
||||
bottom = mt;
|
||||
|
|
@ -181,9 +203,24 @@ public class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
}
|
||||
}
|
||||
|
||||
// bottom has some connections coming in from other nodes, don't allow
|
||||
if ( inDegreeOf(bottom) != ve.size() )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the longest suffix of bases shared among all provided vertices
|
||||
*
|
||||
* For example, if the vertices have sequences AC, CC, and ATC, this would return
|
||||
* a single C. However, for ACC and TCC this would return CC. And for AC and TG this
|
||||
* would return null;
|
||||
*
|
||||
* @param middleVertices a non-empty set of vertices
|
||||
* @return
|
||||
*/
|
||||
@Requires("!middleVertices.isEmpty()")
|
||||
private byte[] commonSuffixOfEdgeTargets(final Set<SeqVertex> middleVertices) {
|
||||
final String[] kmers = new String[middleVertices.size()];
|
||||
|
||||
|
|
@ -196,6 +233,14 @@ public class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
return commonPrefix.equals("") ? null : StringUtils.reverse(commonPrefix).getBytes();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the node that is the bottom of a diamond configuration in the graph starting at top
|
||||
*
|
||||
* @param top
|
||||
* @return
|
||||
*/
|
||||
@Requires("top != null")
|
||||
@Ensures({"result != null"})
|
||||
private SeqVertex getDiamondBottom(final SeqVertex top) {
|
||||
final BaseEdge topEdge = outgoingEdgesOf(top).iterator().next();
|
||||
final SeqVertex middle = getEdgeTarget(topEdge);
|
||||
|
|
@ -203,6 +248,13 @@ public class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
return getEdgeTarget(middleEdge);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of vertices that are in the middle of a diamond starting at top
|
||||
* @param top
|
||||
* @return
|
||||
*/
|
||||
@Requires("top != null")
|
||||
@Ensures({"result != null", "!result.isEmpty()"})
|
||||
final Set<SeqVertex> getMiddleVertices(final SeqVertex top) {
|
||||
final Set<SeqVertex> middles = new HashSet<SeqVertex>();
|
||||
for ( final BaseEdge topToMiddle : outgoingEdgesOf(top) ) {
|
||||
|
|
@ -211,7 +263,26 @@ public class SeqGraph extends BaseGraph<SeqVertex> {
|
|||
return middles;
|
||||
}
|
||||
|
||||
private boolean simplifyDiamond(final SeqVertex top) {
|
||||
/**
|
||||
* Simplify a diamond configuration in the current graph starting at top, if possible
|
||||
*
|
||||
* If top is actually the top of a diamond that can be simplified (i.e., doesn't have any
|
||||
* random edges or other structure that would cause problems with the transformation), then this code
|
||||
* performs the following transformation on this graph (modifying it):
|
||||
*
|
||||
* A -> M1 -> B, A -> M2 -> B, A -> Mn -> B
|
||||
*
|
||||
* becomes
|
||||
*
|
||||
* A -> M1' -> B', A -> M2' -> B', A -> Mn' -> B'
|
||||
*
|
||||
* where B' is composed of the longest common suffix of all Mi nodes + B, and Mi' are each
|
||||
* middle vertex without their shared suffix.
|
||||
*
|
||||
* @param top a proposed vertex in this graph that might start a diamond (but doesn't have to)
|
||||
* @return true if top actually starts a diamond and it could be simplified
|
||||
*/
|
||||
private boolean simplifyDiamondIfPossible(final SeqVertex top) {
|
||||
if ( ! isRootOfDiamond(top) )
|
||||
return false;
|
||||
|
||||
|
|
|
|||
|
|
@ -58,8 +58,7 @@ public class DeBruijnVertexUnitTest extends BaseTest {
|
|||
Assert.assertEquals(v.getSequence(), bases);
|
||||
Assert.assertEquals(v.getSequenceString(), new String(bases));
|
||||
Assert.assertEquals(v.length(), bases.length);
|
||||
Assert.assertEquals(v.getSuffix().length, 1);
|
||||
Assert.assertEquals(v.getSuffix()[0], (byte)'T');
|
||||
Assert.assertEquals(v.getSuffix(), (byte)'T');
|
||||
Assert.assertEquals(v.getSuffixString(), "T");
|
||||
|
||||
Assert.assertEquals(v.getAdditionalSequence(true), bases);
|
||||
|
|
|
|||
|
|
@ -51,6 +51,10 @@ import org.testng.Assert;
|
|||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class SeqGraphUnitTest extends BaseTest {
|
||||
private class MergeNodesWithNoVariationTestProvider extends TestDataProvider {
|
||||
public byte[] sequence;
|
||||
|
|
@ -75,7 +79,7 @@ public class SeqGraphUnitTest extends BaseTest {
|
|||
deBruijnGraph.addKmersToGraph(kmer1, kmer2, false, 1);
|
||||
}
|
||||
final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph();
|
||||
seqGraph.mergeNodes();
|
||||
seqGraph.simplifyGraph();
|
||||
return seqGraph;
|
||||
}
|
||||
}
|
||||
|
|
@ -103,4 +107,208 @@ public class SeqGraphUnitTest extends BaseTest {
|
|||
final SeqVertex actualV = actual.vertexSet().iterator().next();
|
||||
Assert.assertEquals(actualV.getSequence(), cfg.sequence);
|
||||
}
|
||||
|
||||
@DataProvider(name = "IsDiamondData")
|
||||
public Object[][] makeIsDiamondData() throws Exception {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
SeqGraph graph;
|
||||
SeqVertex pre1, pre2, top, middle1, middle2, middle3, bottom, tail1, tail2;
|
||||
|
||||
graph = new SeqGraph();
|
||||
|
||||
pre1 = new SeqVertex("ACT");
|
||||
pre2 = new SeqVertex("AGT");
|
||||
top = new SeqVertex("A");
|
||||
middle1 = new SeqVertex("CT");
|
||||
middle2 = new SeqVertex("CG");
|
||||
middle3 = new SeqVertex("CA");
|
||||
bottom = new SeqVertex("AA");
|
||||
tail1 = new SeqVertex("GC");
|
||||
tail2 = new SeqVertex("GC");
|
||||
|
||||
graph.addVertices(pre1, pre2, top, middle1, middle2, middle3, bottom, tail1, tail2);
|
||||
graph.addEdges(pre1, top, middle1, bottom, tail1);
|
||||
graph.addEdges(pre2, top, middle2, bottom, tail1);
|
||||
graph.addEdges(top, middle3, bottom);
|
||||
graph.addEdges(bottom, tail2);
|
||||
|
||||
for ( final SeqVertex no : Arrays.asList(pre1, pre2, middle1, middle2, middle3, bottom, tail1, tail2)) {
|
||||
tests.add(new Object[]{graph, no, false});
|
||||
}
|
||||
tests.add(new Object[]{graph, top, true});
|
||||
|
||||
final SeqGraph danglingMiddleGraph = (SeqGraph)graph.clone();
|
||||
final SeqVertex danglingMiddle = new SeqVertex("A");
|
||||
danglingMiddleGraph.addVertex(danglingMiddle);
|
||||
danglingMiddleGraph.addEdge(top, danglingMiddle);
|
||||
tests.add(new Object[]{danglingMiddleGraph, top, false});
|
||||
|
||||
final SeqGraph strangerToBottom = (SeqGraph)graph.clone();
|
||||
final SeqVertex notAttachedToTop = new SeqVertex("A");
|
||||
strangerToBottom.addVertex(notAttachedToTop);
|
||||
strangerToBottom.addEdge(notAttachedToTop, bottom);
|
||||
tests.add(new Object[]{strangerToBottom, top, false});
|
||||
|
||||
final SeqGraph strangerToMiddle = (SeqGraph)graph.clone();
|
||||
final SeqVertex attachedToMiddle = new SeqVertex("A");
|
||||
strangerToMiddle.addVertex(attachedToMiddle);
|
||||
strangerToMiddle.addEdge(attachedToMiddle, middle1);
|
||||
tests.add(new Object[]{strangerToMiddle, top, false});
|
||||
|
||||
// middle1 has outgoing edge to non-bottom
|
||||
final SeqGraph middleExtraOut = (SeqGraph)graph.clone();
|
||||
final SeqVertex fromMiddle = new SeqVertex("A");
|
||||
middleExtraOut.addVertex(fromMiddle);
|
||||
middleExtraOut.addEdge(middle1, fromMiddle);
|
||||
tests.add(new Object[]{middleExtraOut, top, false});
|
||||
|
||||
// top connects to bottom directly as well
|
||||
{
|
||||
final SeqGraph topConnectsToBottomToo = new SeqGraph();
|
||||
final SeqVertex top2 = new SeqVertex("A");
|
||||
final SeqVertex middle4 = new SeqVertex("C");
|
||||
final SeqVertex bottom2 = new SeqVertex("G");
|
||||
topConnectsToBottomToo.addVertices(top2, middle4, bottom2);
|
||||
topConnectsToBottomToo.addEdges(top2, middle4, bottom2);
|
||||
topConnectsToBottomToo.addEdges(top2, bottom2);
|
||||
tests.add(new Object[]{topConnectsToBottomToo, top2, false});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "IsDiamondData", enabled = true)
|
||||
public void testIsDiamond(final SeqGraph graph, final SeqVertex v, final boolean isRootOfDiamond) {
|
||||
Assert.assertEquals(graph.isRootOfDiamond(v), isRootOfDiamond);
|
||||
}
|
||||
|
||||
@DataProvider(name = "MergingData")
|
||||
public Object[][] makeMergingData() throws Exception {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final SeqGraph graph = new SeqGraph();
|
||||
|
||||
SeqVertex pre1 = new SeqVertex("ACT");
|
||||
SeqVertex pre2 = new SeqVertex("AGT");
|
||||
SeqVertex top = new SeqVertex("A");
|
||||
SeqVertex middle1 = new SeqVertex("GC");
|
||||
SeqVertex middle2 = new SeqVertex("TC");
|
||||
SeqVertex middle3 = new SeqVertex("AC");
|
||||
SeqVertex middle4 = new SeqVertex("GCAC");
|
||||
SeqVertex bottom = new SeqVertex("AA");
|
||||
SeqVertex tail1 = new SeqVertex("GC");
|
||||
SeqVertex tail2 = new SeqVertex("GC");
|
||||
|
||||
// just a single vertex
|
||||
graph.addVertices(pre1);
|
||||
tests.add(new Object[]{graph.clone(), graph.clone()});
|
||||
|
||||
// pre1 -> top = pre1 + top
|
||||
{
|
||||
graph.addVertices(top);
|
||||
graph.addEdges(pre1, top);
|
||||
final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString());
|
||||
final SeqGraph expected = new SeqGraph();
|
||||
expected.addVertex(pre1_top);
|
||||
tests.add(new Object[]{graph.clone(), expected.clone()});
|
||||
}
|
||||
|
||||
// pre1 -> top -> middle1 = pre1 + top + middle1
|
||||
{
|
||||
graph.addVertices(middle1);
|
||||
graph.addEdges(top, middle1);
|
||||
final SeqGraph expected = new SeqGraph();
|
||||
final SeqVertex pre1_top_middle1 = new SeqVertex(pre1.getSequenceString() + top.getSequenceString() + middle1.getSequenceString());
|
||||
expected.addVertex(pre1_top_middle1);
|
||||
tests.add(new Object[]{graph.clone(), expected});
|
||||
}
|
||||
|
||||
// pre1 -> top -> middle1 & top -> middle2 = pre1 + top -> middle1 & -> middle2
|
||||
{
|
||||
graph.addVertices(middle2);
|
||||
graph.addEdges(top, middle2);
|
||||
final SeqGraph expected = new SeqGraph();
|
||||
final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString());
|
||||
expected.addVertices(pre1_top, middle1, middle2);
|
||||
expected.addEdges(pre1_top, middle1);
|
||||
expected.addEdges(pre1_top, middle2);
|
||||
tests.add(new Object[]{graph.clone(), expected});
|
||||
}
|
||||
|
||||
// An actual diamond event to merge!
|
||||
{
|
||||
graph.addVertices(bottom);
|
||||
graph.addEdges(middle1, bottom);
|
||||
graph.addEdges(middle2, bottom);
|
||||
final SeqGraph expected = new SeqGraph();
|
||||
final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString());
|
||||
final SeqVertex newMiddle1 = new SeqVertex("G");
|
||||
final SeqVertex newMiddle2 = new SeqVertex("T");
|
||||
final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString());
|
||||
expected.addVertices(pre1_top, newMiddle1, newMiddle2, newBottom);
|
||||
expected.addEdges(pre1_top, newMiddle1, newBottom);
|
||||
expected.addEdges(pre1_top, newMiddle2, newBottom);
|
||||
tests.add(new Object[]{graph.clone(), expected.clone()});
|
||||
|
||||
graph.addVertices(middle3);
|
||||
graph.addEdges(top, middle3, bottom);
|
||||
final SeqVertex newMiddle3 = new SeqVertex("A");
|
||||
expected.addVertices(newMiddle3);
|
||||
expected.addEdges(pre1_top, newMiddle3, newBottom);
|
||||
tests.add(new Object[]{graph.clone(), expected.clone()});
|
||||
|
||||
graph.addVertices(middle4);
|
||||
graph.addEdges(top, middle4, bottom);
|
||||
final SeqVertex newMiddle4 = new SeqVertex("GCA");
|
||||
expected.addVertices(newMiddle4);
|
||||
expected.addEdges(pre1_top, newMiddle4, newBottom);
|
||||
tests.add(new Object[]{graph.clone(), expected.clone()});
|
||||
}
|
||||
|
||||
{
|
||||
final SeqGraph all = new SeqGraph();
|
||||
all.addVertices(pre1, pre2, top, middle1, middle2, bottom, tail1, tail2);
|
||||
all.addEdges(pre1, top, middle1, bottom, tail1);
|
||||
all.addEdges(pre2, top, middle2, bottom, tail2);
|
||||
|
||||
final SeqGraph expected = new SeqGraph();
|
||||
final SeqVertex newMiddle1 = new SeqVertex("G");
|
||||
final SeqVertex newMiddle2 = new SeqVertex("T");
|
||||
final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString());
|
||||
expected.addVertices(pre1, pre2, top, newMiddle1, newMiddle2, newBottom, tail1, tail2);
|
||||
expected.addEdges(pre1, top, newMiddle1, newBottom, tail1);
|
||||
expected.addEdges(pre2, top, newMiddle2, newBottom, tail2);
|
||||
tests.add(new Object[]{all.clone(), expected.clone()});
|
||||
}
|
||||
|
||||
// test the case where we delete a middle node away because the common sequence is all of its sequence
|
||||
{
|
||||
final SeqGraph graph2 = new SeqGraph();
|
||||
final SeqVertex mytop = new SeqVertex("A");
|
||||
final SeqVertex mid1 = new SeqVertex("AC");
|
||||
final SeqVertex mid2 = new SeqVertex("C");
|
||||
final SeqVertex bot = new SeqVertex("G");
|
||||
graph2.addVertices(mytop, mid1, mid2, bot);
|
||||
graph2.addEdges(mytop, mid1, bot);
|
||||
graph2.addEdges(mytop, mid2, bot);
|
||||
|
||||
final SeqGraph expected = new SeqGraph();
|
||||
final SeqVertex newMid1 = new SeqVertex("A");
|
||||
final SeqVertex newBottom = new SeqVertex("CG");
|
||||
expected.addVertices(mytop, newMid1, newBottom);
|
||||
expected.addEdges(mytop, newMid1, newBottom);
|
||||
expected.addEdges(mytop, newBottom);
|
||||
tests.add(new Object[]{graph2, expected});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "MergingData", enabled = true)
|
||||
public void testMerging(final SeqGraph graph, final SeqGraph expected) {
|
||||
final SeqGraph merged = (SeqGraph)graph.clone();
|
||||
merged.simplifyGraph();
|
||||
Assert.assertTrue(SeqGraph.graphEquals(merged, expected));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue