From 401d1cb97f5d889d546e86bb13b97828009aabbb Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 23 Mar 2011 17:35:34 +0000 Subject: [PATCH] Bug fixes plus some debugging code added. Broke out DeBruijnVertex into its own class so that the interface is now cleaner. Still very much a work in progress. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5498 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/assembly/DeBruijnVertex.java | 71 +++++++ .../assembly/SimpleDeBruijnAssembler.java | 176 ++++++++++-------- .../assembly/WindowedAssemblyWalker.java | 7 +- 3 files changed, 179 insertions(+), 75 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnVertex.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnVertex.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnVertex.java new file mode 100755 index 000000000..860695ef5 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnVertex.java @@ -0,0 +1,71 @@ +package org.broadinstitute.sting.playground.gatk.walkers.assembly; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Mar 23, 2011 + */ +// simple node class for storing kmer sequences +public class DeBruijnVertex { + + // used for equals() + protected byte[] actualSequence; + + // used for printing and traversing graphs + protected byte[] printableSequence; + + public DeBruijnVertex(byte[] sequence) { + actualSequence = sequence; + printableSequence = new byte[sequence.length]; + System.arraycopy(sequence, 0, printableSequence, 0, sequence.length); + } + + public boolean equals(DeBruijnVertex v) { + return Arrays.equals(actualSequence, v.actualSequence); + } + + public String toString() { + return new String(printableSequence); + } + + public void addPrefix(byte[] prefix, boolean justPrintableSequence) { + printableSequence = addPrefix(printableSequence, prefix); + if ( !justPrintableSequence ) + actualSequence = addPrefix(actualSequence, prefix); + } + + private static byte[] addPrefix(byte[] sequence, byte[] prefix) { + byte[] newSequence = new byte[sequence.length + prefix.length]; + System.arraycopy(prefix, 0, newSequence, 0, prefix.length); + System.arraycopy(sequence, 0, newSequence, prefix.length, sequence.length); + return newSequence; + } + + public void removePrefix(int prefixLength, boolean justPrintableSequence) { + printableSequence = removePrefix(printableSequence, prefixLength); + if ( !justPrintableSequence ) + actualSequence = removePrefix(actualSequence, prefixLength); + } + + private static byte[] removePrefix(byte[] sequence, int prefixLength) { + int newLength = sequence.length - prefixLength; + byte[] newSequence = new byte[newLength]; + System.arraycopy(sequence, prefixLength, newSequence, 0, newLength); + return newSequence; + } + + public void removeSuffix(int suffixLength, boolean justPrintableSequence) { + printableSequence = removeSuffix(printableSequence, suffixLength); + if ( !justPrintableSequence ) + actualSequence = removeSuffix(actualSequence, suffixLength); + } + + private static byte[] removeSuffix(byte[] sequence, int suffixLength) { + int newLength = sequence.length - suffixLength; + byte[] newSequence = new byte[newLength]; + System.arraycopy(sequence, 0, newSequence, 0, newLength); + return newSequence; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java index 04045e1f8..9359e4389 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java @@ -25,22 +25,15 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // minimum clipped sequence length to consider using private static final int MIN_SEQUENCE_LENGTH = 30; + // minimum multiplicity to consider using + // private static final int MIN_MULTIPLICITY_TO_USE = 2; + + // FOR DEBUGGING + private int numReadsToUse = -1; + // the deBruijn graph object private DefaultDirectedGraph graph = null; - // simple node class for storing kmer sequences - protected class DeBruijnVertex { - protected byte[] sequence; - - public DeBruijnVertex(byte[] sequence) { - this.sequence = sequence; - } - - public boolean equals(DeBruijnVertex v) { - return Arrays.equals(sequence, v.sequence); - } - } - // simple edge class for connecting nodes in the graph protected class DeBruijnEdge { private int multiplicity; @@ -59,8 +52,9 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } - public SimpleDeBruijnAssembler(PrintStream out, IndexedFastaSequenceFile referenceReader) { + public SimpleDeBruijnAssembler(PrintStream out, IndexedFastaSequenceFile referenceReader, int numReadsToUse) { super(out, referenceReader); + this.numReadsToUse = numReadsToUse; } public void runLocalAssembly(List reads) { @@ -84,7 +78,16 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { private List clipReads(List reads) { List sequences = new ArrayList(reads.size()); + int counter = 0; + for ( SAMRecord read : reads ) { + + // for debugging + if ( numReadsToUse >= 0 && ++counter > numReadsToUse ) { + System.out.println("Stopping before read: " + read.getReadName() + " at " + read.getAlignmentStart()); + break; + } + byte[] sequencedReadBases = read.getReadBases(); byte[] sequencedBaseQuals = read.getBaseQualities(); @@ -165,6 +168,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // create the graph createGraphFromSequences(reads); + // remove nodes with incoming multiplicity of N + // if ( MIN_MULTIPLICITY_TO_USE > 0 ) + // removeNodesWithLowMultiplicity(); + // cleanup graph by merging nodes concatenateNodes(); @@ -176,6 +183,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } private void createGraphFromSequences(List reads) { + for ( byte[] sequence : reads ) { final int kmersInSequence = sequence.length - KMER_LENGTH + 1; @@ -214,8 +222,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } private DeBruijnVertex addToGraphIfNew(byte[] kmer) { - // the graph.containsVertex() method is busted, so here's a hack around it + // the graph.containsVertex() method is busted, so here's a hack around it DeBruijnVertex newV = new DeBruijnVertex(kmer); for ( DeBruijnVertex v : graph.vertexSet() ) { if ( v.equals(newV) ) @@ -275,10 +283,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // (Vx -> V12 -> Vy) // create V12 - int additionalSequenceFromV2 = V2.sequence.length - KMER_LENGTH + 1; - byte[] newKmer = new byte[V1.sequence.length + additionalSequenceFromV2]; - System.arraycopy(V1.sequence, 0, newKmer, 0, V1.sequence.length); - System.arraycopy(V2.sequence, KMER_LENGTH - 1, newKmer, V1.sequence.length, additionalSequenceFromV2); + int additionalSequenceFromV2 = V2.actualSequence.length - KMER_LENGTH + 1; + byte[] newKmer = new byte[V1.actualSequence.length + additionalSequenceFromV2]; + System.arraycopy(V1.actualSequence, 0, newKmer, 0, V1.actualSequence.length); + System.arraycopy(V2.actualSequence, KMER_LENGTH - 1, newKmer, V1.actualSequence.length, additionalSequenceFromV2); DeBruijnVertex V12 = new DeBruijnVertex(newKmer); graph.addVertex(V12); @@ -307,22 +315,34 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { private void cleanupNodeSequences() { + // remove the first k-1 bases of the kmers for ( DeBruijnVertex v : graph.vertexSet() ) { - - // remove the first k-1 bases of the kmers if ( graph.inDegreeOf(v) > 0 ) - removeKmerPrefix(v); - - // move common suffixes from incoming nodes to this one - if ( graph.inDegreeOf(v) > 1 ) { - Set connectedVs = new HashSet(); - for ( DeBruijnEdge edge : graph.incomingEdgesOf(v) ) - connectedVs.add(graph.getEdgeSource(edge)); - propagateCommonSuffix(v, connectedVs); - } + v.removePrefix(KMER_LENGTH - 1, true); } - removeEmptyNodes(); + // move common suffixes from incoming nodes to this one + while ( true ) { + + boolean graphWasModified = false; + for ( DeBruijnVertex v : graph.vertexSet() ) { + + if ( graph.inDegreeOf(v) > 1 ) { + Set connectedVs = new HashSet(); + for ( DeBruijnEdge edge : graph.incomingEdgesOf(v) ) + connectedVs.add(graph.getEdgeSource(edge)); + + if ( propagateCommonSuffix(v, connectedVs) ) { + removeEmptyNodes(); + graphWasModified = true; + break; + } + } + } + + if ( !graphWasModified ) + break; + } } private void removeEmptyNodes() { @@ -332,26 +352,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { boolean graphWasModified = false; for ( DeBruijnVertex v : graph.vertexSet() ) { - if ( v.sequence.length == 0 ) { - - Set incoming = graph.incomingEdgesOf(v); - Set outgoing = graph.outgoingEdgesOf(v); - - // make edges from all incoming nodes to all outgoing nodes - for ( DeBruijnEdge Ex : incoming ) { - DeBruijnVertex Vx = graph.getEdgeSource(Ex); - for ( DeBruijnEdge Ey : outgoing ) { - DeBruijnVertex Vy = graph.getEdgeTarget(Ey); - - DeBruijnEdge newEdge = new DeBruijnEdge(); - newEdge.setMultiplicity(Ex.getMultiplicity()); - graph.addEdge(Vx, Vy, newEdge); - } - } - - // remove v and its associated edges - graph.removeVertex(v); - + if ( v.printableSequence.length == 0 ) { + removeNode(v); graphWasModified = true; break; } @@ -362,28 +364,41 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } } - private void removeKmerPrefix(DeBruijnVertex v) { - int newLength = v.sequence.length - KMER_LENGTH + 1; - byte[] newSequence = new byte[newLength]; - System.arraycopy(v.sequence, KMER_LENGTH - 1, newSequence, 0, newLength); - v.sequence = newSequence; + private void removeNode(DeBruijnVertex v) { + Set incoming = graph.incomingEdgesOf(v); + Set outgoing = graph.outgoingEdgesOf(v); + + // make edges from all incoming nodes to all outgoing nodes + for ( DeBruijnEdge Ex : incoming ) { + DeBruijnVertex Vx = graph.getEdgeSource(Ex); + for ( DeBruijnEdge Ey : outgoing ) { + DeBruijnVertex Vy = graph.getEdgeTarget(Ey); + + DeBruijnEdge newEdge = new DeBruijnEdge(); + newEdge.setMultiplicity(Ex.getMultiplicity()); + graph.addEdge(Vx, Vy, newEdge); + } + } + + // remove v and its associated edges + graph.removeVertex(v); } - private void propagateCommonSuffix(DeBruijnVertex Vx, Set incoming) { + private boolean propagateCommonSuffix(DeBruijnVertex Vx, Set incoming) { // find the common matching suffix byte[] match = null; for ( DeBruijnVertex v : incoming ) { if ( match == null ) { - match = v.sequence; + match = v.printableSequence; } else { int idx = 0; - while ( idx < match.length && idx < v.sequence.length && match[match.length - idx - 1] == v.sequence[v.sequence.length - idx - 1] ) + while ( idx < match.length && idx < v.printableSequence.length && match[match.length - idx - 1] == v.printableSequence[v.printableSequence.length - idx - 1] ) idx++; if ( idx < match.length ) { match = new byte[idx]; - System.arraycopy(v.sequence, v.sequence.length - idx, match, 0, idx); + System.arraycopy(v.printableSequence, v.printableSequence.length - idx, match, 0, idx); } } } @@ -391,28 +406,24 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // if there is a common suffix... if ( match != null && match.length > 0 ) { - // remove it from the end of the incoming nodes - for ( DeBruijnVertex v : incoming ) { - int newLength = v.sequence.length - match.length; - byte[] newSequence = new byte[newLength]; - System.arraycopy(v.sequence, 0, newSequence, 0, newLength); - v.sequence = newSequence; - } + // remove the suffix from the end of the incoming nodes... + for ( DeBruijnVertex v : incoming ) + v.removeSuffix(match.length, false); - // and put it at the front of this node - byte[] newSequence = new byte[Vx.sequence.length + match.length]; - System.arraycopy(match, 0, newSequence, 0, match.length); - System.arraycopy(Vx.sequence, 0, newSequence, match.length, Vx.sequence.length); - Vx.sequence = newSequence; + // ...and put it at the front of this node + Vx.addPrefix(match, false); + return true; } + + return false; } private void printGraph() { for ( DeBruijnVertex source : graph.vertexSet() ) { - getOutputStream().print(new String(source.sequence) + " -> "); + getOutputStream().print(source + " -> "); for ( DeBruijnEdge edge : graph.outgoingEdgesOf(source) ) { - getOutputStream().print(new String(graph.getEdgeTarget(edge).sequence) + " (" + edge.getMultiplicity() + "), "); + getOutputStream().print(graph.getEdgeTarget(edge) + " (" + edge.getMultiplicity() + "), "); } getOutputStream().println(); } @@ -424,4 +435,21 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // TODO -- implement me } + + /**** + private void removeNodesWithLowMultiplicity() { + + Set vertexSet = graph.vertexSet(); + // convert to array because results of the iteration on a set are undefined when the graph is modified + ArrayList vertices = new ArrayList(vertexSet); + + for (int i = 0; i < vertices.size(); i++) { + + DeBruijnVertex v = vertices.get(i); + if ( graph.inDegreeOf(v) == 1 && + graph.incomingEdgesOf(v).iterator().next().getMultiplicity() < MIN_MULTIPLICITY_TO_USE ) + removeNode(v); + } + } + ****/ } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java index 1691d88c7..bd5acd269 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.assembly; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.*; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; @@ -53,6 +54,10 @@ public class WindowedAssemblyWalker extends ReadWalker { @Argument(fullName = "assembler", shortName = "assembler", doc = "Assembler to use; currently only SIMPLE_DE_BRUIJN is available.", required = false) protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN; + @Hidden + @Argument(fullName = "readsToUse", shortName = "readsToUse", doc = "For debugging: how many reads to use") + protected int numReadsToUse = -1; + // the assembly engine LocalAssemblyEngine assemblyEngine = null; @@ -90,7 +95,7 @@ public class WindowedAssemblyWalker extends ReadWalker { private LocalAssemblyEngine makeAssembler(LocalAssemblyEngine.ASSEMBLER type, IndexedFastaSequenceFile referenceReader) { switch ( type ) { case SIMPLE_DE_BRUIJN: - return new SimpleDeBruijnAssembler(graphWriter, referenceReader); + return new SimpleDeBruijnAssembler(graphWriter, referenceReader, numReadsToUse); default: throw new UserException.BadInput("Assembler type " + type + " is not valid/supported"); }