Bug fixes plus some debugging code added. Broke out DeBruijnVertex into its own class so that the interface is now cleaner. Still very much a work in progress.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5498 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
37fbf17da8
commit
401d1cb97f
|
|
@ -0,0 +1,71 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.assembly;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
// simple node class for storing kmer sequences
|
||||
/**
 * Simple graph node that stores a kmer sequence in two forms.
 *
 * <ul>
 *   <li>{@code actualSequence} is the sequence compared by {@link #equals(DeBruijnVertex)}.</li>
 *   <li>{@code printableSequence} is the sequence returned by {@link #toString()} and used
 *       when printing and traversing graphs.</li>
 * </ul>
 *
 * Both start out as copies of the constructor argument and may diverge afterwards, because
 * every mutator can be restricted to the printable form only.
 *
 * NOTE(review): {@code equals(Object)} and {@code hashCode()} are intentionally NOT overridden.
 * Code in this package appears to keep vertices in hash-based sets while trimming their
 * sequences in place; value-based hashing would break those containers. Confirm before
 * changing the equality semantics.
 */
public class DeBruijnVertex {

    // used for equals()
    protected byte[] actualSequence;

    // used for printing and traversing graphs
    protected byte[] printableSequence;

    /**
     * Creates a vertex whose actual and printable sequences are both copies of {@code sequence}.
     *
     * @param sequence the kmer bases; copied, so later changes to the caller's array do not
     *                 leak into this vertex
     * @throws IllegalArgumentException if {@code sequence} is null
     */
    public DeBruijnVertex(byte[] sequence) {
        if ( sequence == null )
            throw new IllegalArgumentException("sequence must not be null");

        // two independent copies: neither field may alias the caller's array or each other
        actualSequence = Arrays.copyOf(sequence, sequence.length);
        printableSequence = Arrays.copyOf(sequence, sequence.length);
    }

    /**
     * Compares vertices by their actual (not printable) sequence.
     *
     * This is an overload, not an override of {@link Object#equals(Object)}: it is only
     * selected when the argument's static type is {@code DeBruijnVertex}.
     *
     * @param v the vertex to compare against; may be null
     * @return true if {@code v} is non-null and has the same actual sequence
     */
    public boolean equals(DeBruijnVertex v) {
        return v != null && Arrays.equals(actualSequence, v.actualSequence);
    }

    /**
     * @return the printable sequence decoded with the platform default charset
     */
    @Override
    public String toString() {
        return new String(printableSequence);
    }

    /**
     * Prepends {@code prefix} to the printable sequence and, optionally, to the actual sequence.
     *
     * @param prefix                 bases to put in front of the sequence
     * @param justPrintableSequence  if true, the actual sequence is left untouched
     * @throws IllegalArgumentException if {@code prefix} is null
     */
    public void addPrefix(byte[] prefix, boolean justPrintableSequence) {
        if ( prefix == null )
            throw new IllegalArgumentException("prefix must not be null");

        printableSequence = addPrefix(printableSequence, prefix);
        if ( !justPrintableSequence )
            actualSequence = addPrefix(actualSequence, prefix);
    }

    // returns a new array holding prefix followed by sequence
    private static byte[] addPrefix(byte[] sequence, byte[] prefix) {
        byte[] newSequence = Arrays.copyOf(prefix, prefix.length + sequence.length);
        System.arraycopy(sequence, 0, newSequence, prefix.length, sequence.length);
        return newSequence;
    }

    /**
     * Drops the first {@code prefixLength} bases of the printable sequence and, optionally,
     * of the actual sequence.
     *
     * @param prefixLength           number of leading bases to drop
     * @param justPrintableSequence  if true, the actual sequence is left untouched
     * @throws IllegalArgumentException if {@code prefixLength} is negative or longer than any
     *                                  sequence it would be applied to; the vertex is not
     *                                  modified in that case
     */
    public void removePrefix(int prefixLength, boolean justPrintableSequence) {
        // validate everything first so a failure cannot leave the two sequences half-updated
        checkTrimLength(prefixLength, justPrintableSequence);

        printableSequence = removePrefix(printableSequence, prefixLength);
        if ( !justPrintableSequence )
            actualSequence = removePrefix(actualSequence, prefixLength);
    }

    // returns a new array holding sequence without its first prefixLength bytes
    private static byte[] removePrefix(byte[] sequence, int prefixLength) {
        return Arrays.copyOfRange(sequence, prefixLength, sequence.length);
    }

    /**
     * Drops the last {@code suffixLength} bases of the printable sequence and, optionally,
     * of the actual sequence.
     *
     * @param suffixLength           number of trailing bases to drop
     * @param justPrintableSequence  if true, the actual sequence is left untouched
     * @throws IllegalArgumentException if {@code suffixLength} is negative or longer than any
     *                                  sequence it would be applied to; the vertex is not
     *                                  modified in that case
     */
    public void removeSuffix(int suffixLength, boolean justPrintableSequence) {
        // validate everything first so a failure cannot leave the two sequences half-updated
        checkTrimLength(suffixLength, justPrintableSequence);

        printableSequence = removeSuffix(printableSequence, suffixLength);
        if ( !justPrintableSequence )
            actualSequence = removeSuffix(actualSequence, suffixLength);
    }

    // returns a new array holding sequence without its last suffixLength bytes
    private static byte[] removeSuffix(byte[] sequence, int suffixLength) {
        return Arrays.copyOf(sequence, sequence.length - suffixLength);
    }

    // rejects a trim length that does not fit every sequence the caller is about to trim
    private void checkTrimLength(int length, boolean justPrintableSequence) {
        if ( length < 0 )
            throw new IllegalArgumentException("Cannot remove a negative number of bases: " + length);
        if ( length > printableSequence.length )
            throw new IllegalArgumentException("Cannot remove " + length + " bases from a printable sequence of length " + printableSequence.length);
        if ( !justPrintableSequence && length > actualSequence.length )
            throw new IllegalArgumentException("Cannot remove " + length + " bases from an actual sequence of length " + actualSequence.length);
    }
}
|
||||
|
|
@ -25,22 +25,15 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
// minimum clipped sequence length to consider using
|
||||
private static final int MIN_SEQUENCE_LENGTH = 30;
|
||||
|
||||
// minimum multiplicity to consider using
|
||||
// private static final int MIN_MULTIPLICITY_TO_USE = 2;
|
||||
|
||||
// FOR DEBUGGING
|
||||
private int numReadsToUse = -1;
|
||||
|
||||
// the deBruijn graph object
|
||||
private DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = null;
|
||||
|
||||
// simple node class for storing kmer sequences
|
||||
protected class DeBruijnVertex {
|
||||
protected byte[] sequence;
|
||||
|
||||
public DeBruijnVertex(byte[] sequence) {
|
||||
this.sequence = sequence;
|
||||
}
|
||||
|
||||
public boolean equals(DeBruijnVertex v) {
|
||||
return Arrays.equals(sequence, v.sequence);
|
||||
}
|
||||
}
|
||||
|
||||
// simple edge class for connecting nodes in the graph
|
||||
protected class DeBruijnEdge {
|
||||
private int multiplicity;
|
||||
|
|
@ -59,8 +52,9 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
|
||||
public SimpleDeBruijnAssembler(PrintStream out, IndexedFastaSequenceFile referenceReader) {
|
||||
public SimpleDeBruijnAssembler(PrintStream out, IndexedFastaSequenceFile referenceReader, int numReadsToUse) {
|
||||
super(out, referenceReader);
|
||||
this.numReadsToUse = numReadsToUse;
|
||||
}
|
||||
|
||||
public void runLocalAssembly(List<SAMRecord> reads) {
|
||||
|
|
@ -84,7 +78,16 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
private List<byte[]> clipReads(List<SAMRecord> reads) {
|
||||
List<byte[]> sequences = new ArrayList<byte[]>(reads.size());
|
||||
|
||||
int counter = 0;
|
||||
|
||||
for ( SAMRecord read : reads ) {
|
||||
|
||||
// for debugging
|
||||
if ( numReadsToUse >= 0 && ++counter > numReadsToUse ) {
|
||||
System.out.println("Stopping before read: " + read.getReadName() + " at " + read.getAlignmentStart());
|
||||
break;
|
||||
}
|
||||
|
||||
byte[] sequencedReadBases = read.getReadBases();
|
||||
byte[] sequencedBaseQuals = read.getBaseQualities();
|
||||
|
||||
|
|
@ -165,6 +168,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
// create the graph
|
||||
createGraphFromSequences(reads);
|
||||
|
||||
// remove nodes with incoming multiplicity of N
|
||||
// if ( MIN_MULTIPLICITY_TO_USE > 0 )
|
||||
// removeNodesWithLowMultiplicity();
|
||||
|
||||
// cleanup graph by merging nodes
|
||||
concatenateNodes();
|
||||
|
||||
|
|
@ -176,6 +183,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
private void createGraphFromSequences(List<byte[]> reads) {
|
||||
|
||||
for ( byte[] sequence : reads ) {
|
||||
|
||||
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
|
||||
|
|
@ -214,8 +222,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
private DeBruijnVertex addToGraphIfNew(byte[] kmer) {
|
||||
// the graph.containsVertex() method is busted, so here's a hack around it
|
||||
|
||||
// the graph.containsVertex() method is busted, so here's a hack around it
|
||||
DeBruijnVertex newV = new DeBruijnVertex(kmer);
|
||||
for ( DeBruijnVertex v : graph.vertexSet() ) {
|
||||
if ( v.equals(newV) )
|
||||
|
|
@ -275,10 +283,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
// (Vx -> V12 -> Vy)
|
||||
|
||||
// create V12
|
||||
int additionalSequenceFromV2 = V2.sequence.length - KMER_LENGTH + 1;
|
||||
byte[] newKmer = new byte[V1.sequence.length + additionalSequenceFromV2];
|
||||
System.arraycopy(V1.sequence, 0, newKmer, 0, V1.sequence.length);
|
||||
System.arraycopy(V2.sequence, KMER_LENGTH - 1, newKmer, V1.sequence.length, additionalSequenceFromV2);
|
||||
int additionalSequenceFromV2 = V2.actualSequence.length - KMER_LENGTH + 1;
|
||||
byte[] newKmer = new byte[V1.actualSequence.length + additionalSequenceFromV2];
|
||||
System.arraycopy(V1.actualSequence, 0, newKmer, 0, V1.actualSequence.length);
|
||||
System.arraycopy(V2.actualSequence, KMER_LENGTH - 1, newKmer, V1.actualSequence.length, additionalSequenceFromV2);
|
||||
DeBruijnVertex V12 = new DeBruijnVertex(newKmer);
|
||||
graph.addVertex(V12);
|
||||
|
||||
|
|
@ -307,22 +315,34 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
private void cleanupNodeSequences() {
|
||||
|
||||
// remove the first k-1 bases of the kmers
|
||||
for ( DeBruijnVertex v : graph.vertexSet() ) {
|
||||
|
||||
// remove the first k-1 bases of the kmers
|
||||
if ( graph.inDegreeOf(v) > 0 )
|
||||
removeKmerPrefix(v);
|
||||
|
||||
// move common suffixes from incoming nodes to this one
|
||||
if ( graph.inDegreeOf(v) > 1 ) {
|
||||
Set<DeBruijnVertex> connectedVs = new HashSet<DeBruijnVertex>();
|
||||
for ( DeBruijnEdge edge : graph.incomingEdgesOf(v) )
|
||||
connectedVs.add(graph.getEdgeSource(edge));
|
||||
propagateCommonSuffix(v, connectedVs);
|
||||
}
|
||||
v.removePrefix(KMER_LENGTH - 1, true);
|
||||
}
|
||||
|
||||
removeEmptyNodes();
|
||||
// move common suffixes from incoming nodes to this one
|
||||
while ( true ) {
|
||||
|
||||
boolean graphWasModified = false;
|
||||
for ( DeBruijnVertex v : graph.vertexSet() ) {
|
||||
|
||||
if ( graph.inDegreeOf(v) > 1 ) {
|
||||
Set<DeBruijnVertex> connectedVs = new HashSet<DeBruijnVertex>();
|
||||
for ( DeBruijnEdge edge : graph.incomingEdgesOf(v) )
|
||||
connectedVs.add(graph.getEdgeSource(edge));
|
||||
|
||||
if ( propagateCommonSuffix(v, connectedVs) ) {
|
||||
removeEmptyNodes();
|
||||
graphWasModified = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( !graphWasModified )
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
private void removeEmptyNodes() {
|
||||
|
|
@ -332,26 +352,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
boolean graphWasModified = false;
|
||||
for ( DeBruijnVertex v : graph.vertexSet() ) {
|
||||
if ( v.sequence.length == 0 ) {
|
||||
|
||||
Set<DeBruijnEdge> incoming = graph.incomingEdgesOf(v);
|
||||
Set<DeBruijnEdge> outgoing = graph.outgoingEdgesOf(v);
|
||||
|
||||
// make edges from all incoming nodes to all outgoing nodes
|
||||
for ( DeBruijnEdge Ex : incoming ) {
|
||||
DeBruijnVertex Vx = graph.getEdgeSource(Ex);
|
||||
for ( DeBruijnEdge Ey : outgoing ) {
|
||||
DeBruijnVertex Vy = graph.getEdgeTarget(Ey);
|
||||
|
||||
DeBruijnEdge newEdge = new DeBruijnEdge();
|
||||
newEdge.setMultiplicity(Ex.getMultiplicity());
|
||||
graph.addEdge(Vx, Vy, newEdge);
|
||||
}
|
||||
}
|
||||
|
||||
// remove v and its associated edges
|
||||
graph.removeVertex(v);
|
||||
|
||||
if ( v.printableSequence.length == 0 ) {
|
||||
removeNode(v);
|
||||
graphWasModified = true;
|
||||
break;
|
||||
}
|
||||
|
|
@ -362,28 +364,41 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
private void removeKmerPrefix(DeBruijnVertex v) {
|
||||
int newLength = v.sequence.length - KMER_LENGTH + 1;
|
||||
byte[] newSequence = new byte[newLength];
|
||||
System.arraycopy(v.sequence, KMER_LENGTH - 1, newSequence, 0, newLength);
|
||||
v.sequence = newSequence;
|
||||
/**
 * Removes a vertex from the graph while keeping its neighbours connected.
 *
 * For every incoming edge (Vx -> v) and every outgoing edge (v -> Vy) a new edge
 * (Vx -> Vy) is added; the new edge takes the multiplicity of the incoming edge only
 * (the outgoing edge's multiplicity is not consulted). Afterwards v is removed from
 * the graph.
 *
 * NOTE(review): the incoming/outgoing sets are read from the graph and iterated while
 * edges are being added -- confirm these sets are not affected by addEdge (e.g. when
 * v has a self-loop).
 *
 * @param v the vertex to remove
 */
private void removeNode(DeBruijnVertex v) {
|
||||
Set<DeBruijnEdge> incoming = graph.incomingEdgesOf(v);
|
||||
Set<DeBruijnEdge> outgoing = graph.outgoingEdgesOf(v);
|
||||
|
||||
// make edges from all incoming nodes to all outgoing nodes
|
||||
for ( DeBruijnEdge Ex : incoming ) {
|
||||
DeBruijnVertex Vx = graph.getEdgeSource(Ex);
|
||||
for ( DeBruijnEdge Ey : outgoing ) {
|
||||
DeBruijnVertex Vy = graph.getEdgeTarget(Ey);
|
||||
|
||||
DeBruijnEdge newEdge = new DeBruijnEdge();
|
||||
newEdge.setMultiplicity(Ex.getMultiplicity());
|
||||
graph.addEdge(Vx, Vy, newEdge);
|
||||
}
|
||||
}
|
||||
|
||||
// remove v and its associated edges
|
||||
graph.removeVertex(v);
|
||||
}
|
||||
|
||||
private void propagateCommonSuffix(DeBruijnVertex Vx, Set<DeBruijnVertex> incoming) {
|
||||
private boolean propagateCommonSuffix(DeBruijnVertex Vx, Set<DeBruijnVertex> incoming) {
|
||||
|
||||
// find the common matching suffix
|
||||
byte[] match = null;
|
||||
for ( DeBruijnVertex v : incoming ) {
|
||||
if ( match == null ) {
|
||||
match = v.sequence;
|
||||
match = v.printableSequence;
|
||||
} else {
|
||||
int idx = 0;
|
||||
while ( idx < match.length && idx < v.sequence.length && match[match.length - idx - 1] == v.sequence[v.sequence.length - idx - 1] )
|
||||
while ( idx < match.length && idx < v.printableSequence.length && match[match.length - idx - 1] == v.printableSequence[v.printableSequence.length - idx - 1] )
|
||||
idx++;
|
||||
|
||||
if ( idx < match.length ) {
|
||||
match = new byte[idx];
|
||||
System.arraycopy(v.sequence, v.sequence.length - idx, match, 0, idx);
|
||||
System.arraycopy(v.printableSequence, v.printableSequence.length - idx, match, 0, idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -391,28 +406,24 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
// if there is a common suffix...
|
||||
if ( match != null && match.length > 0 ) {
|
||||
|
||||
// remove it from the end of the incoming nodes
|
||||
for ( DeBruijnVertex v : incoming ) {
|
||||
int newLength = v.sequence.length - match.length;
|
||||
byte[] newSequence = new byte[newLength];
|
||||
System.arraycopy(v.sequence, 0, newSequence, 0, newLength);
|
||||
v.sequence = newSequence;
|
||||
}
|
||||
// remove the suffix from the end of the incoming nodes...
|
||||
for ( DeBruijnVertex v : incoming )
|
||||
v.removeSuffix(match.length, false);
|
||||
|
||||
// and put it at the front of this node
|
||||
byte[] newSequence = new byte[Vx.sequence.length + match.length];
|
||||
System.arraycopy(match, 0, newSequence, 0, match.length);
|
||||
System.arraycopy(Vx.sequence, 0, newSequence, match.length, Vx.sequence.length);
|
||||
Vx.sequence = newSequence;
|
||||
// ...and put it at the front of this node
|
||||
Vx.addPrefix(match, false);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private void printGraph() {
|
||||
|
||||
for ( DeBruijnVertex source : graph.vertexSet() ) {
|
||||
getOutputStream().print(new String(source.sequence) + " -> ");
|
||||
getOutputStream().print(source + " -> ");
|
||||
for ( DeBruijnEdge edge : graph.outgoingEdgesOf(source) ) {
|
||||
getOutputStream().print(new String(graph.getEdgeTarget(edge).sequence) + " (" + edge.getMultiplicity() + "), ");
|
||||
getOutputStream().print(graph.getEdgeTarget(edge) + " (" + edge.getMultiplicity() + "), ");
|
||||
}
|
||||
getOutputStream().println();
|
||||
}
|
||||
|
|
@ -424,4 +435,21 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
// TODO -- implement me
|
||||
|
||||
}
|
||||
|
||||
/****
|
||||
private void removeNodesWithLowMultiplicity() {
|
||||
|
||||
Set<DeBruijnVertex> vertexSet = graph.vertexSet();
|
||||
// convert to array because results of the iteration on a set are undefined when the graph is modified
|
||||
ArrayList<DeBruijnVertex> vertices = new ArrayList<DeBruijnVertex>(vertexSet);
|
||||
|
||||
for (int i = 0; i < vertices.size(); i++) {
|
||||
|
||||
DeBruijnVertex v = vertices.get(i);
|
||||
if ( graph.inDegreeOf(v) == 1 &&
|
||||
graph.incomingEdgesOf(v).iterator().next().getMultiplicity() < MIN_MULTIPLICITY_TO_USE )
|
||||
removeNode(v);
|
||||
}
|
||||
}
|
||||
****/
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.assembly;
|
|||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
|
|
@ -53,6 +54,10 @@ public class WindowedAssemblyWalker extends ReadWalker<SAMRecord, Integer> {
|
|||
@Argument(fullName = "assembler", shortName = "assembler", doc = "Assembler to use; currently only SIMPLE_DE_BRUIJN is available.", required = false)
|
||||
protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "readsToUse", shortName = "readsToUse", doc = "For debugging: how many reads to use")
|
||||
protected int numReadsToUse = -1;
|
||||
|
||||
// the assembly engine
|
||||
LocalAssemblyEngine assemblyEngine = null;
|
||||
|
||||
|
|
@ -90,7 +95,7 @@ public class WindowedAssemblyWalker extends ReadWalker<SAMRecord, Integer> {
|
|||
private LocalAssemblyEngine makeAssembler(LocalAssemblyEngine.ASSEMBLER type, IndexedFastaSequenceFile referenceReader) {
|
||||
switch ( type ) {
|
||||
case SIMPLE_DE_BRUIJN:
|
||||
return new SimpleDeBruijnAssembler(graphWriter, referenceReader);
|
||||
return new SimpleDeBruijnAssembler(graphWriter, referenceReader, numReadsToUse);
|
||||
default:
|
||||
throw new UserException.BadInput("Assembler type " + type + " is not valid/supported");
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue