Bug fixes plus some debugging code added. Broke out DeBruijnVertex into its own class so that the interface is now cleaner. Still very much a work in progress.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5498 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2011-03-23 17:35:34 +00:00
parent 37fbf17da8
commit 401d1cb97f
3 changed files with 179 additions and 75 deletions

View File

@ -0,0 +1,71 @@
package org.broadinstitute.sting.playground.gatk.walkers.assembly;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Mar 23, 2011
*/
// simple node class for storing kmer sequences
public class DeBruijnVertex {
// used for equals()
protected byte[] actualSequence;
// used for printing and traversing graphs
protected byte[] printableSequence;
public DeBruijnVertex(byte[] sequence) {
actualSequence = sequence;
printableSequence = new byte[sequence.length];
System.arraycopy(sequence, 0, printableSequence, 0, sequence.length);
}
public boolean equals(DeBruijnVertex v) {
return Arrays.equals(actualSequence, v.actualSequence);
}
public String toString() {
return new String(printableSequence);
}
public void addPrefix(byte[] prefix, boolean justPrintableSequence) {
printableSequence = addPrefix(printableSequence, prefix);
if ( !justPrintableSequence )
actualSequence = addPrefix(actualSequence, prefix);
}
private static byte[] addPrefix(byte[] sequence, byte[] prefix) {
byte[] newSequence = new byte[sequence.length + prefix.length];
System.arraycopy(prefix, 0, newSequence, 0, prefix.length);
System.arraycopy(sequence, 0, newSequence, prefix.length, sequence.length);
return newSequence;
}
public void removePrefix(int prefixLength, boolean justPrintableSequence) {
printableSequence = removePrefix(printableSequence, prefixLength);
if ( !justPrintableSequence )
actualSequence = removePrefix(actualSequence, prefixLength);
}
private static byte[] removePrefix(byte[] sequence, int prefixLength) {
int newLength = sequence.length - prefixLength;
byte[] newSequence = new byte[newLength];
System.arraycopy(sequence, prefixLength, newSequence, 0, newLength);
return newSequence;
}
public void removeSuffix(int suffixLength, boolean justPrintableSequence) {
printableSequence = removeSuffix(printableSequence, suffixLength);
if ( !justPrintableSequence )
actualSequence = removeSuffix(actualSequence, suffixLength);
}
private static byte[] removeSuffix(byte[] sequence, int suffixLength) {
int newLength = sequence.length - suffixLength;
byte[] newSequence = new byte[newLength];
System.arraycopy(sequence, 0, newSequence, 0, newLength);
return newSequence;
}
}

View File

@ -25,22 +25,15 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
// minimum clipped sequence length to consider using // minimum clipped sequence length to consider using
private static final int MIN_SEQUENCE_LENGTH = 30; private static final int MIN_SEQUENCE_LENGTH = 30;
// minimum multiplicity to consider using
// private static final int MIN_MULTIPLICITY_TO_USE = 2;
// FOR DEBUGGING
private int numReadsToUse = -1;
// the deBruijn graph object // the deBruijn graph object
private DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = null; private DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = null;
// simple node class for storing kmer sequences
protected class DeBruijnVertex {
protected byte[] sequence;
public DeBruijnVertex(byte[] sequence) {
this.sequence = sequence;
}
public boolean equals(DeBruijnVertex v) {
return Arrays.equals(sequence, v.sequence);
}
}
// simple edge class for connecting nodes in the graph // simple edge class for connecting nodes in the graph
protected class DeBruijnEdge { protected class DeBruijnEdge {
private int multiplicity; private int multiplicity;
@ -59,8 +52,9 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
} }
public SimpleDeBruijnAssembler(PrintStream out, IndexedFastaSequenceFile referenceReader) { public SimpleDeBruijnAssembler(PrintStream out, IndexedFastaSequenceFile referenceReader, int numReadsToUse) {
super(out, referenceReader); super(out, referenceReader);
this.numReadsToUse = numReadsToUse;
} }
public void runLocalAssembly(List<SAMRecord> reads) { public void runLocalAssembly(List<SAMRecord> reads) {
@ -84,7 +78,16 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
private List<byte[]> clipReads(List<SAMRecord> reads) { private List<byte[]> clipReads(List<SAMRecord> reads) {
List<byte[]> sequences = new ArrayList<byte[]>(reads.size()); List<byte[]> sequences = new ArrayList<byte[]>(reads.size());
int counter = 0;
for ( SAMRecord read : reads ) { for ( SAMRecord read : reads ) {
// for debugging
if ( numReadsToUse >= 0 && ++counter > numReadsToUse ) {
System.out.println("Stopping before read: " + read.getReadName() + " at " + read.getAlignmentStart());
break;
}
byte[] sequencedReadBases = read.getReadBases(); byte[] sequencedReadBases = read.getReadBases();
byte[] sequencedBaseQuals = read.getBaseQualities(); byte[] sequencedBaseQuals = read.getBaseQualities();
@ -165,6 +168,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
// create the graph // create the graph
createGraphFromSequences(reads); createGraphFromSequences(reads);
// remove nodes with incoming multiplicity of N
// if ( MIN_MULTIPLICITY_TO_USE > 0 )
// removeNodesWithLowMultiplicity();
// cleanup graph by merging nodes // cleanup graph by merging nodes
concatenateNodes(); concatenateNodes();
@ -176,6 +183,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
} }
private void createGraphFromSequences(List<byte[]> reads) { private void createGraphFromSequences(List<byte[]> reads) {
for ( byte[] sequence : reads ) { for ( byte[] sequence : reads ) {
final int kmersInSequence = sequence.length - KMER_LENGTH + 1; final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
@ -214,8 +222,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
} }
private DeBruijnVertex addToGraphIfNew(byte[] kmer) { private DeBruijnVertex addToGraphIfNew(byte[] kmer) {
// the graph.containsVertex() method is busted, so here's a hack around it
// the graph.containsVertex() method is busted, so here's a hack around it
DeBruijnVertex newV = new DeBruijnVertex(kmer); DeBruijnVertex newV = new DeBruijnVertex(kmer);
for ( DeBruijnVertex v : graph.vertexSet() ) { for ( DeBruijnVertex v : graph.vertexSet() ) {
if ( v.equals(newV) ) if ( v.equals(newV) )
@ -275,10 +283,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
// (Vx -> V12 -> Vy) // (Vx -> V12 -> Vy)
// create V12 // create V12
int additionalSequenceFromV2 = V2.sequence.length - KMER_LENGTH + 1; int additionalSequenceFromV2 = V2.actualSequence.length - KMER_LENGTH + 1;
byte[] newKmer = new byte[V1.sequence.length + additionalSequenceFromV2]; byte[] newKmer = new byte[V1.actualSequence.length + additionalSequenceFromV2];
System.arraycopy(V1.sequence, 0, newKmer, 0, V1.sequence.length); System.arraycopy(V1.actualSequence, 0, newKmer, 0, V1.actualSequence.length);
System.arraycopy(V2.sequence, KMER_LENGTH - 1, newKmer, V1.sequence.length, additionalSequenceFromV2); System.arraycopy(V2.actualSequence, KMER_LENGTH - 1, newKmer, V1.actualSequence.length, additionalSequenceFromV2);
DeBruijnVertex V12 = new DeBruijnVertex(newKmer); DeBruijnVertex V12 = new DeBruijnVertex(newKmer);
graph.addVertex(V12); graph.addVertex(V12);
@ -307,22 +315,34 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
private void cleanupNodeSequences() { private void cleanupNodeSequences() {
// remove the first k-1 bases of the kmers
for ( DeBruijnVertex v : graph.vertexSet() ) { for ( DeBruijnVertex v : graph.vertexSet() ) {
// remove the first k-1 bases of the kmers
if ( graph.inDegreeOf(v) > 0 ) if ( graph.inDegreeOf(v) > 0 )
removeKmerPrefix(v); v.removePrefix(KMER_LENGTH - 1, true);
// move common suffixes from incoming nodes to this one
if ( graph.inDegreeOf(v) > 1 ) {
Set<DeBruijnVertex> connectedVs = new HashSet<DeBruijnVertex>();
for ( DeBruijnEdge edge : graph.incomingEdgesOf(v) )
connectedVs.add(graph.getEdgeSource(edge));
propagateCommonSuffix(v, connectedVs);
}
} }
removeEmptyNodes(); // move common suffixes from incoming nodes to this one
while ( true ) {
boolean graphWasModified = false;
for ( DeBruijnVertex v : graph.vertexSet() ) {
if ( graph.inDegreeOf(v) > 1 ) {
Set<DeBruijnVertex> connectedVs = new HashSet<DeBruijnVertex>();
for ( DeBruijnEdge edge : graph.incomingEdgesOf(v) )
connectedVs.add(graph.getEdgeSource(edge));
if ( propagateCommonSuffix(v, connectedVs) ) {
removeEmptyNodes();
graphWasModified = true;
break;
}
}
}
if ( !graphWasModified )
break;
}
} }
private void removeEmptyNodes() { private void removeEmptyNodes() {
@ -332,26 +352,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
boolean graphWasModified = false; boolean graphWasModified = false;
for ( DeBruijnVertex v : graph.vertexSet() ) { for ( DeBruijnVertex v : graph.vertexSet() ) {
if ( v.sequence.length == 0 ) { if ( v.printableSequence.length == 0 ) {
removeNode(v);
Set<DeBruijnEdge> incoming = graph.incomingEdgesOf(v);
Set<DeBruijnEdge> outgoing = graph.outgoingEdgesOf(v);
// make edges from all incoming nodes to all outgoing nodes
for ( DeBruijnEdge Ex : incoming ) {
DeBruijnVertex Vx = graph.getEdgeSource(Ex);
for ( DeBruijnEdge Ey : outgoing ) {
DeBruijnVertex Vy = graph.getEdgeTarget(Ey);
DeBruijnEdge newEdge = new DeBruijnEdge();
newEdge.setMultiplicity(Ex.getMultiplicity());
graph.addEdge(Vx, Vy, newEdge);
}
}
// remove v and its associated edges
graph.removeVertex(v);
graphWasModified = true; graphWasModified = true;
break; break;
} }
@ -362,28 +364,41 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
} }
} }
private void removeKmerPrefix(DeBruijnVertex v) { private void removeNode(DeBruijnVertex v) {
int newLength = v.sequence.length - KMER_LENGTH + 1; Set<DeBruijnEdge> incoming = graph.incomingEdgesOf(v);
byte[] newSequence = new byte[newLength]; Set<DeBruijnEdge> outgoing = graph.outgoingEdgesOf(v);
System.arraycopy(v.sequence, KMER_LENGTH - 1, newSequence, 0, newLength);
v.sequence = newSequence; // make edges from all incoming nodes to all outgoing nodes
for ( DeBruijnEdge Ex : incoming ) {
DeBruijnVertex Vx = graph.getEdgeSource(Ex);
for ( DeBruijnEdge Ey : outgoing ) {
DeBruijnVertex Vy = graph.getEdgeTarget(Ey);
DeBruijnEdge newEdge = new DeBruijnEdge();
newEdge.setMultiplicity(Ex.getMultiplicity());
graph.addEdge(Vx, Vy, newEdge);
}
}
// remove v and its associated edges
graph.removeVertex(v);
} }
private void propagateCommonSuffix(DeBruijnVertex Vx, Set<DeBruijnVertex> incoming) { private boolean propagateCommonSuffix(DeBruijnVertex Vx, Set<DeBruijnVertex> incoming) {
// find the common matching suffix // find the common matching suffix
byte[] match = null; byte[] match = null;
for ( DeBruijnVertex v : incoming ) { for ( DeBruijnVertex v : incoming ) {
if ( match == null ) { if ( match == null ) {
match = v.sequence; match = v.printableSequence;
} else { } else {
int idx = 0; int idx = 0;
while ( idx < match.length && idx < v.sequence.length && match[match.length - idx - 1] == v.sequence[v.sequence.length - idx - 1] ) while ( idx < match.length && idx < v.printableSequence.length && match[match.length - idx - 1] == v.printableSequence[v.printableSequence.length - idx - 1] )
idx++; idx++;
if ( idx < match.length ) { if ( idx < match.length ) {
match = new byte[idx]; match = new byte[idx];
System.arraycopy(v.sequence, v.sequence.length - idx, match, 0, idx); System.arraycopy(v.printableSequence, v.printableSequence.length - idx, match, 0, idx);
} }
} }
} }
@ -391,28 +406,24 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
// if there is a common suffix... // if there is a common suffix...
if ( match != null && match.length > 0 ) { if ( match != null && match.length > 0 ) {
// remove it from the end of the incoming nodes // remove the suffix from the end of the incoming nodes...
for ( DeBruijnVertex v : incoming ) { for ( DeBruijnVertex v : incoming )
int newLength = v.sequence.length - match.length; v.removeSuffix(match.length, false);
byte[] newSequence = new byte[newLength];
System.arraycopy(v.sequence, 0, newSequence, 0, newLength);
v.sequence = newSequence;
}
// and put it at the front of this node // ...and put it at the front of this node
byte[] newSequence = new byte[Vx.sequence.length + match.length]; Vx.addPrefix(match, false);
System.arraycopy(match, 0, newSequence, 0, match.length); return true;
System.arraycopy(Vx.sequence, 0, newSequence, match.length, Vx.sequence.length);
Vx.sequence = newSequence;
} }
return false;
} }
private void printGraph() { private void printGraph() {
for ( DeBruijnVertex source : graph.vertexSet() ) { for ( DeBruijnVertex source : graph.vertexSet() ) {
getOutputStream().print(new String(source.sequence) + " -> "); getOutputStream().print(source + " -> ");
for ( DeBruijnEdge edge : graph.outgoingEdgesOf(source) ) { for ( DeBruijnEdge edge : graph.outgoingEdgesOf(source) ) {
getOutputStream().print(new String(graph.getEdgeTarget(edge).sequence) + " (" + edge.getMultiplicity() + "), "); getOutputStream().print(graph.getEdgeTarget(edge) + " (" + edge.getMultiplicity() + "), ");
} }
getOutputStream().println(); getOutputStream().println();
} }
@ -424,4 +435,21 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
// TODO -- implement me // TODO -- implement me
} }
/****
private void removeNodesWithLowMultiplicity() {
Set<DeBruijnVertex> vertexSet = graph.vertexSet();
// convert to array because results of the iteration on a set are undefined when the graph is modified
ArrayList<DeBruijnVertex> vertices = new ArrayList<DeBruijnVertex>(vertexSet);
for (int i = 0; i < vertices.size(); i++) {
DeBruijnVertex v = vertices.get(i);
if ( graph.inDegreeOf(v) == 1 &&
graph.incomingEdgesOf(v).iterator().next().getMultiplicity() < MIN_MULTIPLICITY_TO_USE )
removeNode(v);
}
}
****/
} }

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.assembly;
import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.*; import net.sf.samtools.*;
import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
@ -53,6 +54,10 @@ public class WindowedAssemblyWalker extends ReadWalker<SAMRecord, Integer> {
@Argument(fullName = "assembler", shortName = "assembler", doc = "Assembler to use; currently only SIMPLE_DE_BRUIJN is available.", required = false) @Argument(fullName = "assembler", shortName = "assembler", doc = "Assembler to use; currently only SIMPLE_DE_BRUIJN is available.", required = false)
protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN; protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN;
@Hidden
@Argument(fullName = "readsToUse", shortName = "readsToUse", doc = "For debugging: how many reads to use")
protected int numReadsToUse = -1;
// the assembly engine // the assembly engine
LocalAssemblyEngine assemblyEngine = null; LocalAssemblyEngine assemblyEngine = null;
@ -90,7 +95,7 @@ public class WindowedAssemblyWalker extends ReadWalker<SAMRecord, Integer> {
private LocalAssemblyEngine makeAssembler(LocalAssemblyEngine.ASSEMBLER type, IndexedFastaSequenceFile referenceReader) { private LocalAssemblyEngine makeAssembler(LocalAssemblyEngine.ASSEMBLER type, IndexedFastaSequenceFile referenceReader) {
switch ( type ) { switch ( type ) {
case SIMPLE_DE_BRUIJN: case SIMPLE_DE_BRUIJN:
return new SimpleDeBruijnAssembler(graphWriter, referenceReader); return new SimpleDeBruijnAssembler(graphWriter, referenceReader, numReadsToUse);
default: default:
throw new UserException.BadInput("Assembler type " + type + " is not valid/supported"); throw new UserException.BadInput("Assembler type " + type + " is not valid/supported");
} }