More fixes and improvements. We no longer use any bases under Q20 because random ~Q5s were cluttering the graphs; instead we grab any contiguous segments of size at least MIN_SEQUENCE_LENGTH where all bases are above Q20. Also, I implemented a quick algorithm to traverse the graph (using DFS) to choose the two best scoring paths (haplotypes). Used it successfully at NA12878 HM3 SNP sites to determine whether they are homozygous (no distiction yet between ref and alt) or heterozygous! Indels are the next target. Still have some issues to work out.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5502 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2011-03-24 03:51:19 +00:00
parent cd90fdeca1
commit 48b15d42e0
4 changed files with 196 additions and 70 deletions

View File

@ -0,0 +1,30 @@
package org.broadinstitute.sting.playground.gatk.walkers.assembly;
import org.jgrapht.graph.DefaultDirectedGraph;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Mar 23, 2011
*/
// simple edge class for connecting nodes in the graph
public class DeBruijnEdge {
private int multiplicity;
public DeBruijnEdge() {
multiplicity = 1;
}
public int getMultiplicity() {
return multiplicity;
}
public void setMultiplicity(int value) {
multiplicity = value;
}
public boolean equals(DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, DeBruijnEdge edge) {
return (graph.getEdgeSource(this) == graph.getEdgeSource(edge)) && (graph.getEdgeTarget(this) == graph.getEdgeTarget(edge));
}
}

View File

@ -0,0 +1,106 @@
package org.broadinstitute.sting.playground.gatk.walkers.assembly;
import org.jgrapht.graph.DefaultDirectedGraph;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Mar 23, 2011
*/
// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph.
// This is different from most graph traversals because we want to test paths from any source node to any sink node.
public class KBestPaths {
// static access only
protected KBestPaths() { }
// class to keep track of paths
protected static class Path {
// the last vertex seen in the path
private DeBruijnVertex lastVertex;
// the list of edges comprising the path
private List<DeBruijnEdge> edges;
// the scores for the path
private int totalScore = 0, lowestEdge = -1;
public Path(DeBruijnVertex initialVertex) {
lastVertex = initialVertex;
edges = new ArrayList<DeBruijnEdge>(0);
}
public Path(Path p, DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, DeBruijnEdge edge) {
lastVertex = graph.getEdgeTarget(edge);
edges = new ArrayList<DeBruijnEdge>(p.edges);
edges.add(edge);
totalScore = p.totalScore + edge.getMultiplicity();
lowestEdge = ( p.lowestEdge == -1 ) ? edge.getMultiplicity() : Math.min(p.lowestEdge, edge.getMultiplicity());
}
public boolean containsEdge(DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, DeBruijnEdge edge) {
for ( DeBruijnEdge e : edges ) {
if ( e.equals(graph, edge))
return true;
}
return false;
}
public List<DeBruijnEdge> getEdges() { return edges; }
public int getScore() { return totalScore; }
public int getLowestEdge() { return lowestEdge; }
public DeBruijnVertex getLastVertexInPath() { return lastVertex; }
}
protected static class PathComparator implements Comparator<Path> {
public int compare(final Path path1, final Path path2) {
return path1.totalScore - path2.totalScore;
}
}
public static List<Path> getKBestPaths(DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, int k) {
PriorityQueue<Path> bestPaths = new PriorityQueue<Path>(k, new PathComparator());
// run a DFS for best paths
for ( DeBruijnVertex v : graph.vertexSet() ) {
if ( graph.inDegreeOf(v) == 0 )
findBestPaths(graph, new Path(v), k, bestPaths);
}
return new ArrayList<Path>(bestPaths);
}
private static void findBestPaths(DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, Path path, int k, PriorityQueue<Path> bestPaths) {
// did we hit the end of a path?
if ( graph.outDegreeOf(path.lastVertex) == 0 ) {
if ( bestPaths.size() < k ) {
bestPaths.add(path);
} else if ( bestPaths.peek().totalScore < path.totalScore ) {
bestPaths.remove();
bestPaths.add(path);
}
return;
}
// recursively run DFS
for ( DeBruijnEdge edge : graph.outgoingEdgesOf(path.lastVertex) ) {
// make sure the edge is not already in the path
if ( path.containsEdge(graph, edge) )
continue;
Path newPath = new Path(path, graph, edge);
findBestPaths(graph, newPath, k, bestPaths);
}
}
}

View File

@ -17,13 +17,13 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
private static final boolean DEBUG = true;
// k-mer length
private static final int KMER_LENGTH = 19;
private static final int KMER_LENGTH = 9;
// minimum base quality required in a contiguous stretch of a given read to be used in the assembly
private static final int MIN_BASE_QUAL_TO_USE = 20;
// minimum clipped sequence length to consider using
private static final int MIN_SEQUENCE_LENGTH = 30;
private static final int MIN_SEQUENCE_LENGTH = 25;
// minimum multiplicity to consider using
// private static final int MIN_MULTIPLICITY_TO_USE = 2;
@ -34,23 +34,6 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
// the deBruijn graph object
private DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = null;
// simple edge class for connecting nodes in the graph
protected class DeBruijnEdge {
private int multiplicity;
public DeBruijnEdge() {
multiplicity = 1;
}
public int getMultiplicity() {
return multiplicity;
}
public void setMultiplicity(int value) {
multiplicity = value;
}
}
public SimpleDeBruijnAssembler(PrintStream out, IndexedFastaSequenceFile referenceReader, int numReadsToUse) {
super(out, referenceReader);
@ -68,12 +51,15 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
// create the graph
createDeBruijnGraph(sequences);
// find the 2 best paths in the graph
findBestPaths();
// assign reads to the graph
assignReadsToGraph(sequences);
}
// This method takes the base sequences from the SAM records and clips both ends so that
// soft-clipped bases - plus all bases until the first Q20 (on either end) - are removed.
// This method takes the base sequences from the SAM records and pulls
// out runs of bases that are not soft-clipped and are all at least Q20s.
// Clipped sequences that are overly clipped are not used.
private List<byte[]> clipReads(List<SAMRecord> reads) {
List<byte[]> sequences = new ArrayList<byte[]>(reads.size());
@ -91,73 +77,48 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
byte[] sequencedReadBases = read.getReadBases();
byte[] sequencedBaseQuals = read.getBaseQualities();
int fromIndex = 0;
boolean sawQ20 = false;
int curIndex = 0, firstQ20Index = -1;
for ( CigarElement ce : read.getCigar().getCigarElements() ) {
if ( sawQ20 )
break;
int elementLength = ce.getLength();
switch ( ce.getOperator() ) {
case S:
// skip soft-clipped bases
fromIndex += elementLength;
curIndex += elementLength;
break;
case M:
case I:
for (int i = 0; i < elementLength; i++) {
if ( sequencedBaseQuals[fromIndex] >= MIN_BASE_QUAL_TO_USE ) {
sawQ20 = true;
break;
if ( sequencedBaseQuals[curIndex] >= MIN_BASE_QUAL_TO_USE ) {
if ( firstQ20Index == -1 )
firstQ20Index = curIndex;
} else if ( firstQ20Index != -1 ) {
int sequenceLength = curIndex - firstQ20Index;
if ( sequenceLength > MIN_SEQUENCE_LENGTH ) {
byte[] sequence = new byte[sequenceLength];
System.arraycopy(sequencedReadBases, firstQ20Index, sequence, 0, sequenceLength);
sequences.add(sequence);
}
firstQ20Index = -1;
}
fromIndex++;
curIndex++;
}
case N:
// TODO -- implement me (cut the sequence)
default:
break;
}
}
int toIndex = sequencedReadBases.length - 1;
sawQ20 = false;
for (int ceIdx = read.getCigar().numCigarElements() - 1; ceIdx >= 0; ceIdx--) {
if ( sawQ20 )
break;
CigarElement ce = read.getCigar().getCigarElement(ceIdx);
int elementLength = ce.getLength();
switch ( ce.getOperator() ) {
case S:
// skip soft-clipped bases
toIndex -= elementLength;
break;
case M:
case I:
for (int i = 0; i < elementLength; i++) {
if ( sequencedBaseQuals[toIndex] >= MIN_BASE_QUAL_TO_USE ) {
sawQ20 = true;
break;
}
toIndex--;
}
default:
break;
if ( firstQ20Index != -1 ) {
int sequenceLength = curIndex - firstQ20Index;
if ( sequenceLength > MIN_SEQUENCE_LENGTH ) {
byte[] sequence = new byte[sequenceLength];
System.arraycopy(sequencedReadBases, firstQ20Index, sequence, 0, sequenceLength);
sequences.add(sequence);
}
}
int sequenceLength = toIndex - fromIndex + 1;
if ( sequenceLength < MIN_SEQUENCE_LENGTH ) {
if ( DEBUG ) System.out.println("Read " + read.getReadName() + " is overly clipped");
continue;
}
//if ( DEBUG ) {
// System.out.println("Read " + read.getReadName() + ": " + read.getCigar());
// System.out.println("Clipping from " + fromIndex + " to " + toIndex);
//}
byte[] sequence = new byte[sequenceLength];
System.arraycopy(sequencedReadBases, fromIndex, sequence, 0, sequenceLength);
sequences.add(sequence);
}
return sequences;
@ -322,6 +283,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
}
// move common suffixes from incoming nodes to this one
while ( true ) {
boolean graphWasModified = false;
@ -421,6 +383,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
private void printGraph() {
for ( DeBruijnVertex source : graph.vertexSet() ) {
if ( graph.inDegreeOf(source) == 0 )
getOutputStream().print("* ");
getOutputStream().print(source + " -> ");
for ( DeBruijnEdge edge : graph.outgoingEdgesOf(source) ) {
getOutputStream().print(graph.getEdgeTarget(edge) + " (" + edge.getMultiplicity() + "), ");
@ -430,6 +394,32 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
getOutputStream().println("------------\n");
}
private void findBestPaths() {
// find them
List<KBestPaths.Path> bestPaths = KBestPaths.getKBestPaths(graph, 2);
// print them out
for ( KBestPaths.Path path : bestPaths ) {
List<DeBruijnEdge> edges = path.getEdges();
for (int i = 0; i < edges.size(); i++) {
DeBruijnEdge edge = edges.get(i);
if ( i == 0 )
getOutputStream().print(graph.getEdgeSource(edge));
getOutputStream().print(graph.getEdgeTarget(edge));
}
if ( edges.size() == 0 )
getOutputStream().print(path.getLastVertexInPath());
getOutputStream().println(" (score=" + path.getScore() + ", lowestEdge=" + path.getLowestEdge() +")");
}
}
private void assignReadsToGraph(List<byte[]> reads) {
// TODO -- implement me

View File

@ -55,7 +55,7 @@ public class WindowedAssemblyWalker extends ReadWalker<SAMRecord, Integer> {
protected LocalAssemblyEngine.ASSEMBLER ASSEMBLER_TO_USE = LocalAssemblyEngine.ASSEMBLER.SIMPLE_DE_BRUIJN;
@Hidden
@Argument(fullName = "readsToUse", shortName = "readsToUse", doc = "For debugging: how many reads to use")
@Argument(fullName = "readsToUse", shortName = "readsToUse", doc = "For debugging: how many reads to use", required = false)
protected int numReadsToUse = -1;
// the assembly engine