Merge pull request #256 from broadinstitute/md_hc_assembly_bug

Bugfix for HC can fail to assemble the correct reference sequence in som...
This commit is contained in:
Mark DePristo 2013-06-03 13:58:41 -07:00
commit a0817b696b
7 changed files with 147 additions and 29 deletions

View File

@ -50,6 +50,7 @@ import com.google.java.contract.Ensures;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileWriter;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
@ -387,6 +388,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
protected boolean dontUseSoftClippedBases = false;
@Hidden
@Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false)
protected boolean captureAssemblyFailureBAM = false;
@Hidden
@Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false)
protected boolean allowCyclesInKmerGraphToGeneratePaths = false;
@ -751,13 +756,24 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc);
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype );
if ( ! dontTrimActiveRegions ) {
return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc);
} else {
// we don't want to trim active regions, so go ahead and use the old one
return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
try {
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype );
if ( ! dontTrimActiveRegions ) {
return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc);
} else {
// we don't want to trim active regions, so go ahead and use the old one
return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
}
} catch ( Exception e ) {
// Capture any exception that might be thrown, and write out the assembly failure BAM if requested
if ( captureAssemblyFailureBAM ) {
final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5);
for ( final GATKSAMRecord read : activeRegion.getReads() ) {
writer.addAlignment(read);
}
writer.close();
}
throw e;
}
}

View File

@ -78,6 +78,10 @@ import java.util.*;
public abstract class LocalAssemblyEngine {
private final static Logger logger = Logger.getLogger(LocalAssemblyEngine.class);
/**
* If false, we will only write out a region around the reference source
*/
private final static boolean PRINT_FULL_GRAPH_FOR_DEBUGGING = true;
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30;
@ -252,20 +256,26 @@ public abstract class LocalAssemblyEngine {
return false;
}
protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) {
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
/**
 * Write this graph to a DOT file, but only when debugGraphTransformations is enabled.
 *
 * Honors PRINT_FULL_GRAPH_FOR_DEBUGGING: when that flag is off, only the subgraph
 * around the reference source vertex is written instead of the complete graph.
 *
 * @param graph the graph to print
 * @param file the destination file
 */
protected void printDebugGraphTransform(final BaseGraph graph, final File file) {
    if ( ! debugGraphTransformations )
        return;
    // emit either the complete graph or just the neighborhood of the reference source
    final BaseGraph toEmit = PRINT_FULL_GRAPH_FOR_DEBUGGING ? graph : graph.subsetToRefSource();
    toEmit.printGraph(file, pruneFactor);
}
protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) {
printDebugGraphTransform(seqGraph, new File("sequenceGraph.1.dot"));
// TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm
// TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect
// TODO -- to anything from one that actually has good support along the chain but just happens
// TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately
// TODO -- the pruning algorithm really should be an error correction algorithm that knows more
// TODO -- about the structure of the data and can differentiate between an infrequent path but
// TODO -- without evidence against it (such as occurs when a region is hard to get any reads through)
// TODO -- from an error with lots of weight going along another similar path
// the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
seqGraph.zipLinearChains();
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor);
printDebugGraphTransform(seqGraph, new File("sequenceGraph.2.zipped.dot"));
// now go through and prune the graph, removing vertices no longer connected to the reference chain
// IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight
@ -273,9 +283,9 @@ public abstract class LocalAssemblyEngine {
seqGraph.pruneGraph(pruneFactor);
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor);
printDebugGraphTransform(seqGraph, new File("sequenceGraph.3.pruned.dot"));
seqGraph.simplifyGraph();
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor);
printDebugGraphTransform(seqGraph, new File("sequenceGraph.4.merged.dot"));
// The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
// happen in cases where for example the reference somehow manages to acquire a cycle, or
@ -294,7 +304,7 @@ public abstract class LocalAssemblyEngine {
seqGraph.addVertex(dummy);
seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0));
}
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor);
printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot"));
return seqGraph;
}

View File

@ -388,6 +388,17 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
return s;
}
/**
 * Get the set of vertices connected to v by an edge in either direction
 * @param v a non-null vertex
 * @return the set of all vertices X such that X -> v or v -> X is an edge
 */
public Set<V> neighboringVerticesOf(final V v) {
    // copy the incoming set so we never mutate a collection handed to us
    final Set<V> neighbors = new HashSet<>(incomingVerticesOf(v));
    neighbors.addAll(outgoingVerticesOf(v));
    return neighbors;
}
/**
* Print out the graph in the dot language for visualization
* @param destination File to write to
@ -664,4 +675,54 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
"kmerSize=" + kmerSize +
'}';
}
/**
 * Get the set of vertices within distance edges of source, regardless of edge direction
 *
 * Implemented as an iterative breadth-first search so that each vertex is expanded
 * at most once.  The previous recursive formulation had no visited set, so shared
 * neighbors were re-explored once per path, which is exponential in distance on
 * dense graphs; the set returned is the same.
 *
 * @param source the source vertex to consider
 * @param distance the distance, must be >= 0
 * @return a non-null set of the vertices reachable from source in at most distance
 *         edges (ignoring edge direction), always including source itself
 */
protected Set<V> verticesWithinDistance(final V source, final int distance) {
    final Set<V> found = new HashSet<>();
    found.add(source);

    // frontier holds the vertices first discovered at the previous depth
    Set<V> frontier = Collections.singleton(source);
    for ( int depth = 0; depth < distance && ! frontier.isEmpty(); depth++ ) {
        final Set<V> next = new HashSet<>();
        for ( final V v : frontier ) {
            for ( final V neighbor : neighboringVerticesOf(v) ) {
                // add() returns true only on first discovery, pruning revisits
                if ( found.add(neighbor) )
                    next.add(neighbor);
            }
        }
        frontier = next;
    }

    return found;
}
/**
 * Get a graph containing only the vertices within distance edges of target
 * @param target a vertex in graph
 * @param distance the max distance
 * @return a non-null graph
 */
public BaseGraph<V,E> subsetToNeighbors(final V target, final int distance) {
    if ( target == null ) throw new IllegalArgumentException("Target cannot be null");
    if ( ! containsVertex(target) ) throw new IllegalArgumentException("Graph doesn't contain vertex " + target);
    if ( distance < 0 ) throw new IllegalArgumentException("Distance must be >= 0 but got " + distance);

    // work on a clone so this graph is left untouched
    final BaseGraph<V,E> neighborhood = (BaseGraph<V,E>)clone();

    // keep everything reachable within distance edges; prune the rest
    final Set<V> keep = verticesWithinDistance(target, distance);
    final Set<V> prune = new HashSet<>();
    for ( final V v : vertexSet() ) {
        if ( ! keep.contains(v) )
            prune.add(v);
    }

    neighborhood.removeAllVertices(prune);
    return neighborhood;
}
/**
 * Get a subgraph of graph that contains only vertices within 10 edges of the ref source vertex
 * @return a non-null subgraph of this graph
 */
public BaseGraph<V,E> subsetToRefSource() {
    // neighborhood radius (in edges) used when subsetting around the reference source
    final int nearbyEdges = 10;
    return subsetToNeighbors(getReferenceSourceVertex(), nearbyEdges);
}
}

View File

@ -155,20 +155,29 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
//logger.info("simplifyGraph iteration " + i);
// iterate until we don't do anything useful
boolean didSomeWork = false;
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".1.dot"), 0);
printGraphSimplification(new File("simplifyGraph." + iteration + ".1.dot"));
didSomeWork |= new MergeDiamonds().transformUntilComplete();
didSomeWork |= new MergeTails().transformUntilComplete();
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"), 0);
printGraphSimplification(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"));
didSomeWork |= new SplitCommonSuffices().transformUntilComplete();
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"), 0);
printGraphSimplification(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"));
didSomeWork |= new MergeCommonSuffices().transformUntilComplete();
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0);
printGraphSimplification(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"));
didSomeWork |= zipLinearChains();
return didSomeWork;
}
/**
 * Print a simplification step of this graph, if PRINT_SIMPLIFY_GRAPHS is enabled.
 *
 * Only the subgraph within 5 edges of the reference source vertex is written.
 * @param file the destination for the graph DOT file
 */
private void printGraphSimplification(final File file) {
    if ( ! PRINT_SIMPLIFY_GRAPHS )
        return;
    // restrict the dump to the neighborhood of the reference source
    final int radius = 5;
    subsetToNeighbors(getReferenceSourceVertex(), radius).printGraph(file, 0);
}
/**
* Zip up all of the simple linear chains present in this graph.
*

View File

@ -81,7 +81,7 @@ public class SharedSequenceMerger {
else {
// graph.printGraph(new File("csm." + counter + "." + v.getSequenceString() + "_pre.dot"), 0);
final List<BaseEdge> edgesToRemove = new LinkedList<BaseEdge>();
final List<BaseEdge> edgesToRemove = new LinkedList<>();
final byte[] prevSeq = prevs.iterator().next().getSequence();
final SeqVertex newV = new SeqVertex(ArrayUtils.addAll(prevSeq, v.getSequence()));
graph.addVertex(newV);
@ -124,11 +124,17 @@ public class SharedSequenceMerger {
final SeqVertex first = incomingVertices.iterator().next();
for ( final SeqVertex prev : incomingVertices) {
if ( ! prev.seqEquals(first) )
// cannot merge if our sequence isn't the same as the first sequence
return false;
final Collection<SeqVertex> prevOuts = graph.outgoingVerticesOf(prev);
if ( prevOuts.size() != 1 )
// prev -> v must be the only edge from prev
return false;
if ( prevOuts.iterator().next() != v )
// don't allow cycles
return false;
if ( graph.inDegreeOf(prev) == 0 )
// cannot merge when any of the incoming nodes are sources
return false;
}

View File

@ -113,7 +113,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
// actually build the read threading graph
rtgraph.buildGraphIfNecessary();
if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.0.raw_readthreading_graph.dot"), pruneFactor);
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
@ -128,7 +128,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
// remove all heading and trailing paths
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"), pruneFactor);
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
@ -136,7 +136,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph);
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
if ( debugGraphTransformations ) initialSeqGraph.printGraph(new File("sequenceGraph.0.2.initial_seqgraph.dot"), pruneFactor);
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't think this is possible by construction
final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);

View File

@ -166,4 +166,20 @@ public class CommonSuffixMergerUnitTest extends BaseTest {
splitter.merge(data.graph, data.v);
assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original);
}
@Test
public void testDoesntMergeSourceNodes() {
    // Three identical middle vertices feed into b, but v2 has no incoming edge,
    // making it a source -- so the merger must refuse to merge into b.
    final SeqGraph g = new SeqGraph();
    final SeqVertex v1 = new SeqVertex("A");
    final SeqVertex v2 = new SeqVertex("A");
    final SeqVertex v3 = new SeqVertex("A");
    final SeqVertex top = new SeqVertex("T");
    final SeqVertex b = new SeqVertex("C");
    // note: top was previously passed twice here; listing each vertex once is sufficient
    g.addVertices(top, v1, v2, v3, b);
    g.addEdges(top, v1, b);
    g.addEdges(v2, b); // v2 doesn't have previous node, cannot be merged
    g.addEdges(top, v3, b);
    final SharedSequenceMerger merger = new SharedSequenceMerger();
    Assert.assertFalse(merger.merge(g, b), "Shouldn't be able to merge shared vertices, when one is a source");
}
}