diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnEdge.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnEdge.java new file mode 100755 index 000000000..115041879 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/DeBruijnEdge.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.playground.gatk.walkers.assembly; + +import org.jgrapht.graph.DefaultDirectedGraph; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Mar 23, 2011 + */ +// simple edge class for connecting nodes in the graph +public class DeBruijnEdge { + + private int multiplicity; + + public DeBruijnEdge() { + multiplicity = 1; + } + + public int getMultiplicity() { + return multiplicity; + } + + public void setMultiplicity(int value) { + multiplicity = value; + } + + public boolean equals(DefaultDirectedGraph graph, DeBruijnEdge edge) { + return (graph.getEdgeSource(this) == graph.getEdgeSource(edge)) && (graph.getEdgeTarget(this) == graph.getEdgeTarget(edge)); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/KBestPaths.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/KBestPaths.java new file mode 100755 index 000000000..f94a30539 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/KBestPaths.java @@ -0,0 +1,106 @@ +package org.broadinstitute.sting.playground.gatk.walkers.assembly; + +import org.jgrapht.graph.DefaultDirectedGraph; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Mar 23, 2011 + */ +// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. +// This is different from most graph traversals because we want to test paths from any source node to any sink node. +public class KBestPaths { + + // static access only + protected KBestPaths() { } + + // class to keep track of paths + protected static class Path { + + // the last vertex seen in the path + private DeBruijnVertex lastVertex; + + // the list of edges comprising the path + private List edges; + + // the scores for the path + private int totalScore = 0, lowestEdge = -1; + + public Path(DeBruijnVertex initialVertex) { + lastVertex = initialVertex; + edges = new ArrayList(0); + } + + public Path(Path p, DefaultDirectedGraph graph, DeBruijnEdge edge) { + lastVertex = graph.getEdgeTarget(edge); + edges = new ArrayList(p.edges); + edges.add(edge); + totalScore = p.totalScore + edge.getMultiplicity(); + lowestEdge = ( p.lowestEdge == -1 ) ? edge.getMultiplicity() : Math.min(p.lowestEdge, edge.getMultiplicity()); + } + + public boolean containsEdge(DefaultDirectedGraph graph, DeBruijnEdge edge) { + for ( DeBruijnEdge e : edges ) { + if ( e.equals(graph, edge)) + return true; + } + + return false; + } + + public List getEdges() { return edges; } + + public int getScore() { return totalScore; } + + public int getLowestEdge() { return lowestEdge; } + + public DeBruijnVertex getLastVertexInPath() { return lastVertex; } + } + + protected static class PathComparator implements Comparator { + public int compare(final Path path1, final Path path2) { + return path1.totalScore - path2.totalScore; + } + } + + public static List getKBestPaths(DefaultDirectedGraph graph, int k) { + PriorityQueue bestPaths = new PriorityQueue(k, new PathComparator()); + + // run a DFS for best paths + for ( DeBruijnVertex v : graph.vertexSet() ) { + if ( graph.inDegreeOf(v) == 0 ) + findBestPaths(graph, new Path(v), k, bestPaths); + } + + return new ArrayList(bestPaths); + } + + private static void findBestPaths(DefaultDirectedGraph graph, Path path, int k, PriorityQueue bestPaths) { + + // did we hit the end of a path? + if ( graph.outDegreeOf(path.lastVertex) == 0 ) { + if ( bestPaths.size() < k ) { + bestPaths.add(path); + } else if ( bestPaths.peek().totalScore < path.totalScore ) { + bestPaths.remove(); + bestPaths.add(path); + } + + return; + } + + // recursively run DFS + for ( DeBruijnEdge edge : graph.outgoingEdgesOf(path.lastVertex) ) { + + // make sure the edge is not already in the path + if ( path.containsEdge(graph, edge) ) + continue; + + Path newPath = new Path(path, graph, edge); + findBestPaths(graph, newPath, k, bestPaths); + } + } + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java index 9359e4389..60019e2b1 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/SimpleDeBruijnAssembler.java @@ -17,13 +17,13 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { private static final boolean DEBUG = true; // k-mer length - private static final int KMER_LENGTH = 19; + private static final int KMER_LENGTH = 9; // minimum base quality required in a contiguous stretch of a given read to be used in the assembly private static final int MIN_BASE_QUAL_TO_USE = 20; // minimum clipped sequence length to consider using - private static final int MIN_SEQUENCE_LENGTH = 30; + private static final int MIN_SEQUENCE_LENGTH = 25; // minimum multiplicity to consider using // private static final int MIN_MULTIPLICITY_TO_USE = 2; @@ -34,23 +34,6 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // the deBruijn graph object private DefaultDirectedGraph graph = null; - // simple edge class for connecting nodes in the graph - protected class DeBruijnEdge { - private int multiplicity; - - public DeBruijnEdge() { - multiplicity = 1; - } - - public int getMultiplicity() { - return multiplicity; - } - - public void setMultiplicity(int value) { - multiplicity = value; - } - } - public SimpleDeBruijnAssembler(PrintStream out, IndexedFastaSequenceFile referenceReader, int numReadsToUse) { super(out, referenceReader); @@ -68,12 +51,15 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // create the graph createDeBruijnGraph(sequences); + // find the 2 best paths in the graph + findBestPaths(); + // assign reads to the graph assignReadsToGraph(sequences); } - // This method takes the base sequences from the SAM records and clips both ends so that - // soft-clipped bases - plus all bases until the first Q20 (on either end) - are removed. + // This method takes the base sequences from the SAM records and pulls + // out runs of bases that are not soft-clipped and are all at least Q20s. // Clipped sequences that are overly clipped are not used. private List clipReads(List reads) { List sequences = new ArrayList(reads.size()); @@ -91,73 +77,48 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { byte[] sequencedReadBases = read.getReadBases(); byte[] sequencedBaseQuals = read.getBaseQualities(); - int fromIndex = 0; - boolean sawQ20 = false; + int curIndex = 0, firstQ20Index = -1; + for ( CigarElement ce : read.getCigar().getCigarElements() ) { - if ( sawQ20 ) - break; int elementLength = ce.getLength(); switch ( ce.getOperator() ) { case S: // skip soft-clipped bases - fromIndex += elementLength; + curIndex += elementLength; break; case M: case I: for (int i = 0; i < elementLength; i++) { - if ( sequencedBaseQuals[fromIndex] >= MIN_BASE_QUAL_TO_USE ) { - sawQ20 = true; - break; + if ( sequencedBaseQuals[curIndex] >= MIN_BASE_QUAL_TO_USE ) { + if ( firstQ20Index == -1 ) + firstQ20Index = curIndex; + } else if ( firstQ20Index != -1 ) { + int sequenceLength = curIndex - firstQ20Index; + if ( sequenceLength > MIN_SEQUENCE_LENGTH ) { + byte[] sequence = new byte[sequenceLength]; + System.arraycopy(sequencedReadBases, firstQ20Index, sequence, 0, sequenceLength); + sequences.add(sequence); + } + firstQ20Index = -1; } - fromIndex++; + curIndex++; } + case N: + // TODO -- implement me (cut the sequence) default: break; } } - int toIndex = sequencedReadBases.length - 1; - sawQ20 = false; - for (int ceIdx = read.getCigar().numCigarElements() - 1; ceIdx >= 0; ceIdx--) { - if ( sawQ20 ) - break; - - CigarElement ce = read.getCigar().getCigarElement(ceIdx); - int elementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case S: - // skip soft-clipped bases - toIndex -= elementLength; - break; - case M: - case I: - for (int i = 0; i < elementLength; i++) { - if ( sequencedBaseQuals[toIndex] >= MIN_BASE_QUAL_TO_USE ) { - sawQ20 = true; - break; - } - toIndex--; - } - default: - break; + if ( firstQ20Index != -1 ) { + int sequenceLength = curIndex - firstQ20Index; + if ( sequenceLength > MIN_SEQUENCE_LENGTH ) { + byte[] sequence = new byte[sequenceLength]; + System.arraycopy(sequencedReadBases, firstQ20Index, sequence, 0, sequenceLength); + sequences.add(sequence); } } - - int sequenceLength = toIndex - fromIndex + 1; - if ( sequenceLength < MIN_SEQUENCE_LENGTH ) { - if ( DEBUG ) System.out.println("Read " + read.getReadName() + " is overly clipped"); - continue; - } - - //if ( DEBUG ) { - // System.out.println("Read " + read.getReadName() + ": " + read.getCigar()); - // System.out.println("Clipping from " + fromIndex + " to " + toIndex); - //} - - byte[] sequence = new byte[sequenceLength]; - System.arraycopy(sequencedReadBases, fromIndex, sequence, 0, sequenceLength); - sequences.add(sequence); } return sequences; @@ -322,6 +283,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } // move common suffixes from incoming nodes to this one + while ( true ) { boolean graphWasModified = false; @@ -421,6 +383,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { private void printGraph() { for ( DeBruijnVertex source : graph.vertexSet() ) { + if ( graph.inDegreeOf(source) == 0 ) + getOutputStream().print("* "); getOutputStream().print(source + " -> "); for ( DeBruijnEdge edge : graph.outgoingEdgesOf(source) ) { getOutputStream().print(graph.getEdgeTarget(edge) + " (" + edge.getMultiplicity() + "), "); @@ -430,6 +394,32 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { getOutputStream().println("------------\n"); } + private void findBestPaths() { + + // find them + List bestPaths = KBestPaths.getKBestPaths(graph, 2); + + // print them out + for ( KBestPaths.Path path : bestPaths ) { + + List edges = path.getEdges(); + for (int i = 0; i < edges.size(); i++) { + + DeBruijnEdge edge = edges.get(i); + + if ( i == 0 ) + getOutputStream().print(graph.getEdgeSource(edge)); + + getOutputStream().print(graph.getEdgeTarget(edge)); + } + + if ( edges.size() == 0 ) + getOutputStream().print(path.getLastVertexInPath()); + + getOutputStream().println(" (score=" + path.getScore() + ", lowestEdge=" + path.getLowestEdge() +")"); + } + } + private void assignReadsToGraph(List reads) { // TODO -- implement me diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java index bd5acd269..080e19bea 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/assembly/WindowedAssemblyWalker.java @@ -55,7 +55,7 @@ public class WindowedAssemblyWalker extends ReadWalker { protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN; @Hidden - @Argument(fullName = "readsToUse", shortName = "readsToUse", doc = "For debugging: how many reads to use") + @Argument(fullName = "readsToUse", shortName = "readsToUse", doc = "For debugging: how many reads to use", required = false) protected int numReadsToUse = -1; // the assembly engine