Merge pull request #256 from broadinstitute/md_hc_assembly_bug
Bugfix for HC can fail to assemble the correct reference sequence in som...
This commit is contained in:
commit
a0817b696b
|
|
@ -50,6 +50,7 @@ import com.google.java.contract.Ensures;
|
|||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
|
|
@ -387,6 +388,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
@Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
|
||||
protected boolean dontUseSoftClippedBases = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false)
|
||||
protected boolean captureAssemblyFailureBAM = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false)
|
||||
protected boolean allowCyclesInKmerGraphToGeneratePaths = false;
|
||||
|
|
@ -751,13 +756,24 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
|
||||
final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc);
|
||||
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype );
|
||||
|
||||
if ( ! dontTrimActiveRegions ) {
|
||||
return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc);
|
||||
} else {
|
||||
// we don't want to trim active regions, so go ahead and use the old one
|
||||
return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
|
||||
try {
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype );
|
||||
if ( ! dontTrimActiveRegions ) {
|
||||
return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc);
|
||||
} else {
|
||||
// we don't want to trim active regions, so go ahead and use the old one
|
||||
return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
|
||||
}
|
||||
} catch ( Exception e ) {
|
||||
// Capture any exception that might be thrown, and write out the assembly failure BAM if requested
|
||||
if ( captureAssemblyFailureBAM ) {
|
||||
final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5);
|
||||
for ( final GATKSAMRecord read : activeRegion.getReads() ) {
|
||||
writer.addAlignment(read);
|
||||
}
|
||||
writer.close();
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -78,6 +78,10 @@ import java.util.*;
|
|||
public abstract class LocalAssemblyEngine {
|
||||
private final static Logger logger = Logger.getLogger(LocalAssemblyEngine.class);
|
||||
|
||||
/**
|
||||
* If false, we will only write out a region around the reference source
|
||||
*/
|
||||
private final static boolean PRINT_FULL_GRAPH_FOR_DEBUGGING = true;
|
||||
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
|
||||
private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30;
|
||||
|
||||
|
|
@ -252,20 +256,26 @@ public abstract class LocalAssemblyEngine {
|
|||
return false;
|
||||
}
|
||||
|
||||
protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) {
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
|
||||
/**
|
||||
* Print graph to file if debugGraphTransformations is enabled
|
||||
* @param graph the graph to print
|
||||
* @param file the destination file
|
||||
*/
|
||||
protected void printDebugGraphTransform(final BaseGraph graph, final File file) {
|
||||
if ( debugGraphTransformations ) {
|
||||
if ( PRINT_FULL_GRAPH_FOR_DEBUGGING )
|
||||
graph.printGraph(file, pruneFactor);
|
||||
else
|
||||
graph.subsetToRefSource().printGraph(file, pruneFactor);
|
||||
}
|
||||
}
|
||||
|
||||
protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) {
|
||||
printDebugGraphTransform(seqGraph, new File("sequenceGraph.1.dot"));
|
||||
|
||||
// TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm
|
||||
// TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect
|
||||
// TODO -- to anything from one that's actually has good support along the chain but just happens
|
||||
// TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately
|
||||
// TODO -- the pruning algorithm really should be an error correction algorithm that knows more
|
||||
// TODO -- about the structure of the data and can differentiate between an infrequent path but
|
||||
// TODO -- without evidence against it (such as occurs when a region is hard to get any reads through)
|
||||
// TODO -- from a error with lots of weight going along another similar path
|
||||
// the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
|
||||
seqGraph.zipLinearChains();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor);
|
||||
printDebugGraphTransform(seqGraph, new File("sequenceGraph.2.zipped.dot"));
|
||||
|
||||
// now go through and prune the graph, removing vertices no longer connected to the reference chain
|
||||
// IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight
|
||||
|
|
@ -273,9 +283,9 @@ public abstract class LocalAssemblyEngine {
|
|||
seqGraph.pruneGraph(pruneFactor);
|
||||
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
|
||||
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor);
|
||||
printDebugGraphTransform(seqGraph, new File("sequenceGraph.3.pruned.dot"));
|
||||
seqGraph.simplifyGraph();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor);
|
||||
printDebugGraphTransform(seqGraph, new File("sequenceGraph.4.merged.dot"));
|
||||
|
||||
// The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
|
||||
// happen in cases where for example the reference somehow manages to acquire a cycle, or
|
||||
|
|
@ -294,7 +304,7 @@ public abstract class LocalAssemblyEngine {
|
|||
seqGraph.addVertex(dummy);
|
||||
seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0));
|
||||
}
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor);
|
||||
printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot"));
|
||||
|
||||
return seqGraph;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -388,6 +388,17 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of vertices connected to v by incoming or outgoing edges
|
||||
* @param v a non-null vertex
|
||||
* @return a set of vertices {X} connected X -> v or v -> Y
|
||||
*/
|
||||
public Set<V> neighboringVerticesOf(final V v) {
|
||||
final Set<V> s = incomingVerticesOf(v);
|
||||
s.addAll(outgoingVerticesOf(v));
|
||||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print out the graph in the dot language for visualization
|
||||
* @param destination File to write to
|
||||
|
|
@ -664,4 +675,54 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
"kmerSize=" + kmerSize +
|
||||
'}';
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of vertices within distance edges of source, regardless of edge direction
|
||||
*
|
||||
* @param source the source vertex to consider
|
||||
* @param distance the distance
|
||||
* @return a set of vertices within distance of source
|
||||
*/
|
||||
protected Set<V> verticesWithinDistance(final V source, final int distance) {
|
||||
if ( distance == 0 )
|
||||
return Collections.singleton(source);
|
||||
|
||||
final Set<V> found = new HashSet<>();
|
||||
found.add(source);
|
||||
for ( final V v : neighboringVerticesOf(source) ) {
|
||||
found.addAll(verticesWithinDistance(v, distance - 1));
|
||||
}
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a graph containing only the vertices within distance edges of target
|
||||
* @param target a vertex in graph
|
||||
* @param distance the max distance
|
||||
* @return a non-null graph
|
||||
*/
|
||||
public BaseGraph<V,E> subsetToNeighbors(final V target, final int distance) {
|
||||
if ( target == null ) throw new IllegalArgumentException("Target cannot be null");
|
||||
if ( ! containsVertex(target) ) throw new IllegalArgumentException("Graph doesn't contain vertex " + target);
|
||||
if ( distance < 0 ) throw new IllegalArgumentException("Distance must be >= 0 but got " + distance);
|
||||
|
||||
|
||||
final Set<V> toKeep = verticesWithinDistance(target, distance);
|
||||
final Set<V> toRemove = new HashSet<>(vertexSet());
|
||||
toRemove.removeAll(toKeep);
|
||||
|
||||
final BaseGraph<V,E> result = (BaseGraph<V,E>)clone();
|
||||
result.removeAllVertices(toRemove);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a subgraph of graph that contains only vertices within 10 edges of the ref source vertex
|
||||
* @return a non-null subgraph of this graph
|
||||
*/
|
||||
public BaseGraph<V,E> subsetToRefSource() {
|
||||
return subsetToNeighbors(getReferenceSourceVertex(), 10);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -155,20 +155,29 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
//logger.info("simplifyGraph iteration " + i);
|
||||
// iterate until we haven't don't anything useful
|
||||
boolean didSomeWork = false;
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".1.dot"), 0);
|
||||
printGraphSimplification(new File("simplifyGraph." + iteration + ".1.dot"));
|
||||
didSomeWork |= new MergeDiamonds().transformUntilComplete();
|
||||
didSomeWork |= new MergeTails().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"), 0);
|
||||
printGraphSimplification(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"));
|
||||
|
||||
didSomeWork |= new SplitCommonSuffices().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"), 0);
|
||||
printGraphSimplification(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"));
|
||||
didSomeWork |= new MergeCommonSuffices().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0);
|
||||
printGraphSimplification(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"));
|
||||
|
||||
didSomeWork |= zipLinearChains();
|
||||
return didSomeWork;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print simplication step of this graph, if PRINT_SIMPLIFY_GRAPHS is enabled
|
||||
* @param file the destination for the graph DOT file
|
||||
*/
|
||||
private void printGraphSimplification(final File file) {
|
||||
if ( PRINT_SIMPLIFY_GRAPHS )
|
||||
subsetToNeighbors(getReferenceSourceVertex(), 5).printGraph(file, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Zip up all of the simple linear chains present in this graph.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -81,7 +81,7 @@ public class SharedSequenceMerger {
|
|||
else {
|
||||
// graph.printGraph(new File("csm." + counter + "." + v.getSequenceString() + "_pre.dot"), 0);
|
||||
|
||||
final List<BaseEdge> edgesToRemove = new LinkedList<BaseEdge>();
|
||||
final List<BaseEdge> edgesToRemove = new LinkedList<>();
|
||||
final byte[] prevSeq = prevs.iterator().next().getSequence();
|
||||
final SeqVertex newV = new SeqVertex(ArrayUtils.addAll(prevSeq, v.getSequence()));
|
||||
graph.addVertex(newV);
|
||||
|
|
@ -124,11 +124,17 @@ public class SharedSequenceMerger {
|
|||
final SeqVertex first = incomingVertices.iterator().next();
|
||||
for ( final SeqVertex prev : incomingVertices) {
|
||||
if ( ! prev.seqEquals(first) )
|
||||
// cannot merge if our sequence isn't the same as the first sequence
|
||||
return false;
|
||||
final Collection<SeqVertex> prevOuts = graph.outgoingVerticesOf(prev);
|
||||
if ( prevOuts.size() != 1 )
|
||||
// prev -> v must be the only edge from prev
|
||||
return false;
|
||||
if ( prevOuts.iterator().next() != v )
|
||||
// don't allow cyles
|
||||
return false;
|
||||
if ( graph.inDegreeOf(prev) == 0 )
|
||||
// cannot merge when any of the incoming nodes are sources
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -113,7 +113,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
|
||||
// actually build the read threading graph
|
||||
rtgraph.buildGraphIfNecessary();
|
||||
if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.0.raw_readthreading_graph.dot"), pruneFactor);
|
||||
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
|
||||
|
||||
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
|
||||
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
|
||||
|
|
@ -128,7 +128,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
// remove all heading and trailing paths
|
||||
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
|
||||
|
||||
if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"), pruneFactor);
|
||||
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
|
||||
|
||||
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
|
||||
|
||||
|
|
@ -136,7 +136,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph);
|
||||
|
||||
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
|
||||
if ( debugGraphTransformations ) initialSeqGraph.printGraph(new File("sequenceGraph.0.2.initial_seqgraph.dot"), pruneFactor);
|
||||
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
|
||||
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
|
||||
|
||||
final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
|
||||
|
|
|
|||
|
|
@ -166,4 +166,20 @@ public class CommonSuffixMergerUnitTest extends BaseTest {
|
|||
splitter.merge(data.graph, data.v);
|
||||
assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoesntMergeSourceNodes() {
|
||||
final SeqGraph g = new SeqGraph();
|
||||
final SeqVertex v1 = new SeqVertex("A");
|
||||
final SeqVertex v2 = new SeqVertex("A");
|
||||
final SeqVertex v3 = new SeqVertex("A");
|
||||
final SeqVertex top = new SeqVertex("T");
|
||||
final SeqVertex b = new SeqVertex("C");
|
||||
g.addVertices(top, v1, v2, v3, top, b);
|
||||
g.addEdges(top, v1, b);
|
||||
g.addEdges(v2, b); // v2 doesn't have previous node, cannot be merged
|
||||
g.addEdges(top, v3, b);
|
||||
final SharedSequenceMerger merger = new SharedSequenceMerger();
|
||||
Assert.assertFalse(merger.merge(g, b), "Shouldn't be able to merge shared vertices, when one is a source");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue