From 965043472a2ea1b8c0a63f9441e83fe36960a451 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Mar 2013 08:30:49 -0400 Subject: [PATCH 01/13] Vastly more powerful, cleaner graph simplification approach -- Generalizes previous node merging and splitting approaches. Can split common prefixes and suffices among nodes, build a subgraph representing this new structure, and incorporate it into the original graph. Introduces the concept of edges with 0 multiplicity (for purely structural reasons) as well as vertices with no sequence (again, for structural reasons). Fully UnitTested. These new algorithms can now really simplify diamond configurations as well as ones sources and sinks that arrive / depart linearly at a common single root node. -- This new suite of algorithms is fully integrated into the HC, replacing previous approaches -- SeqGraph transformations are applied iteratively (zipping, splitting, merging) until no operations can be performed on the graph. This further simplifies the graphs, as splitting nodes may enable other merging / zip operations to go. --- .../walkers/haplotypecaller/BaseEdge.java | 32 +- .../walkers/haplotypecaller/BaseGraph.java | 84 +++- .../walkers/haplotypecaller/BaseVertex.java | 13 +- .../haplotypecaller/DeBruijnAssembler.java | 10 +- .../gatk/walkers/haplotypecaller/Path.java | 20 + .../walkers/haplotypecaller/SeqGraph.java | 365 +++++++++--------- .../walkers/haplotypecaller/SeqVertex.java | 15 + .../SharedVertexSequenceSplitter.java | 341 ++++++++++++++++ .../haplotypecaller/BaseVertexUnitTest.java | 5 +- .../haplotypecaller/SeqGraphUnitTest.java | 34 +- .../SharedVertexSequenceSplitterUnitTest.java | 253 ++++++++++++ 11 files changed, 966 insertions(+), 206 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java index d49b63672..07a6629d7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import java.io.Serializable; +import java.util.Collection; import java.util.Comparator; /** @@ -151,10 +152,39 @@ public class BaseEdge { * multiplicity is the sum * * @param edge the edge to add + * @return this */ - public void add(final BaseEdge edge) { + public BaseEdge add(final BaseEdge edge) { if ( edge == null ) throw new IllegalArgumentException("edge cannot be null"); this.multiplicity += edge.getMultiplicity(); this.isRef = this.isRef || edge.isRef(); + return this; + } + + /** + * Create a new BaseEdge with multiplicity and isRef that's an or of all edges + * + * @param edges a collection of edges to or their isRef values + * @param multiplicity our desired multiplicity + * @return a newly allocated BaseEdge + */ + public static BaseEdge orRef(final Collection edges, final int multiplicity) { + for ( final BaseEdge e : edges ) + if ( e.isRef() ) + return new BaseEdge(true, multiplicity); + return new BaseEdge(false, multiplicity); + } + + /** + * Return a new edge that the max of this and edge + * + * isRef is simply the or of this and edge + * multiplicity is the max + * + * @param edge the edge to max + */ + public BaseEdge max(final BaseEdge edge) { + if ( edge == null ) throw new IllegalArgumentException("edge cannot be null"); + return new BaseEdge(isRef() || edge.isRef(), Math.max(getMultiplicity(), edge.getMultiplicity())); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index c77ec4222..c3f371ec7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -48,10 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.jgrapht.EdgeFactory; import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.traverse.DepthFirstIterator; import java.io.File; import java.io.FileNotFoundException; @@ -219,6 +221,15 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph vertices) { + for ( final T v : vertices ) + addVertex(v); + } + /** * Convenience function to add multiple edges to the graph * @param start the first vertex to connect @@ -426,6 +446,23 @@ public class BaseGraph extends DefaultDirectedGraph verticesToRemove = new HashSet(vertexSet()); + final DepthFirstIterator dfi = new DepthFirstIterator(this, getReferenceSourceVertex()); + while ( dfi.hasNext() ) { + final T accessibleFromRefSource = dfi.next(); + // we also want to prune all sinks that aren't the reference sink + if ( ! isNonRefSink(accessibleFromRefSource) ) + verticesToRemove.remove(accessibleFromRefSource); + } + + removeAllVertices(verticesToRemove); + } + protected void pruneGraph( final int pruneFactor ) { final List edgesToRemove = new ArrayList(); for( final BaseEdge e : edgeSet() ) { @@ -525,7 +562,52 @@ public class BaseGraph extends DefaultDirectedGraph edges = getAllEdges(source, target); + return getSingletonEdge(getAllEdges(source, target)); + } + + /** + * Get the incoming edge of v. Requires that there be only one such edge or throws an error + * @param v our vertex + * @return the single incoming edge to v, or null if none exists + */ + public BaseEdge incomingEdgeOf(final T v) { + return getSingletonEdge(incomingEdgesOf(v)); + } + + /** + * Get the outgoing edge of v. Requires that there be only one such edge or throws an error + * @param v our vertex + * @return the single outgoing edge from v, or null if none exists + */ + public BaseEdge outgoingEdgeOf(final T v) { + return getSingletonEdge(outgoingEdgesOf(v)); + } + + /** + * Helper function that gets the a single edge from edges, null if edges is empty, or + * throws an error is edges has more than 1 element + * @param edges a set of edges + * @return a edge + */ + @Requires("edges != null") + private BaseEdge getSingletonEdge(final Collection edges) { + if ( edges.size() > 1 ) throw new IllegalArgumentException("Cannot get a single incoming edge for a vertex with multiple incoming edges " + edges); return edges.isEmpty() ? null : edges.iterator().next(); } + + /** + * Add edge between source -> target if none exists, or add e to an already existing one if present + * + * @param source source vertex + * @param target vertex + * @param e edge to add + */ + public void addOrUpdateEdge(final T source, final T target, final BaseEdge e) { + final BaseEdge prev = getEdge(source, target); + if ( prev != null ) { + prev.add(e); + } else { + addEdge(source, target, e); + } + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java index 93bd4f5c5..a6436f0a3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java @@ -69,10 +69,21 @@ public class BaseVertex { */ public BaseVertex(final byte[] sequence) { if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); - if ( sequence.length == 0 ) throw new IllegalArgumentException("Sequence cannot be empty"); this.sequence = sequence; } + /** + * Does this vertex have an empty sequence? + * + * That is, is it a dummy node that's only present for structural reasons but doesn't actually + * contribute to the sequence of the graph? + * + * @return true if sequence is empty, false otherwise + */ + public boolean isEmpty() { + return length() == 0; + } + /** * Get the length of this sequence * @return a positive integer >= 1 diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 7cf4cc8d3..6aec9c7a5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -173,7 +173,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR); graph = graph.errorCorrect(); if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), PRUNE_FACTOR); - graph.cleanNonRefPaths(); final SeqGraph seqGraph = toSeqGraph(graph); @@ -199,6 +198,14 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); seqGraph.simplifyGraph(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), PRUNE_FACTOR); + + // if we've assembled just to the reference, just leave now otherwise removePathsNotConnectedToRef + // might blow up because there's no reference source node + if ( seqGraph.vertexSet().size() == 1 ) + return seqGraph; + seqGraph.removePathsNotConnectedToRef(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), PRUNE_FACTOR); + return seqGraph; } @@ -274,6 +281,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } + graph.cleanNonRefPaths(); return graph; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java index 7546155a6..4adfe6612 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java @@ -391,4 +391,24 @@ class Path { cigar = initialCigar; } } + + /** + * Tests that this and other have the same score and vertices in the same order with the same seq + * @param other the other path to consider. Cannot be null + * @return true if this and path are equal, false otherwise + */ + public boolean equalScoreAndSequence(final Path other) { + if ( other == null ) throw new IllegalArgumentException("other cannot be null"); + + if ( getScore() != other.getScore() ) + return false; + final List mine = getVertices(); + final List yours = other.getVertices(); + if ( mine.size() == yours.size() ) { // hehehe + for ( int i = 0; i < mine.size(); i++ ) + if ( ! mine.get(i).seqEquals(yours.get(i)) ) + return false; + } + return true; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java index b855390c6..da24a06a4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java @@ -46,13 +46,9 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; -import org.apache.commons.lang.StringUtils; -import java.io.File; -import java.util.*; +import java.util.Set; /** * A graph that contains base sequence at each node @@ -61,6 +57,8 @@ import java.util.*; * @since 03/2013 */ public class SeqGraph extends BaseGraph { + private final static int MIN_SUFFIX_TO_MERGE_TAILS = 5; + /** * Construct an empty SeqGraph */ @@ -86,18 +84,38 @@ public class SeqGraph extends BaseGraph { * in any way the sequences implied by a complex enumeration of all paths through the graph. */ public void simplifyGraph() { + simplifyGraph(Integer.MAX_VALUE); + } + + protected void simplifyGraph(final int maxCycles) { + boolean didSomeWork; + int i = 0; + + // start off with one round of zipping of chains for performance reasons zipLinearChains(); - mergeBranchingNodes(); - zipLinearChains(); + do { + //logger.info("simplifyGraph iteration " + i); + // iterate until we haven't don't anything useful + didSomeWork = false; + //printGraph(new File("simplifyGraph." + i + ".dot"), 0); + didSomeWork |= new MergeDiamonds().transformUntilComplete(); + didSomeWork |= new MergeTails().transformUntilComplete(); + didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); + didSomeWork |= zipLinearChains(); + i++; + } while (didSomeWork && i < maxCycles); } /** * Zip up all of the simple linear chains present in this graph. */ - protected void zipLinearChains() { + protected boolean zipLinearChains() { + boolean foundOne = false; while( zipOneLinearChain() ) { // just keep going until zipOneLinearChain says its done + foundOne = true; } + return foundOne; } /** @@ -147,193 +165,168 @@ public class SeqGraph extends BaseGraph { } /** - * Perform as many branch simplifications and merging operations as possible on this graph, - * modifying it in place. + * Base class for transformation operations that need to iterate over proposed vertices, where + * each proposed vertex is a seed vertex for a potential transformation. + * + * transformUntilComplete will iteratively apply the tryToTransform function on each vertex in the graph + * until no vertex can be found that can be transformed. + * + * Note that in order to eventually terminate tryToTransform must transform the graph such that eventually + * no vertices are candidates for further transformations. */ - protected void mergeBranchingNodes() { - boolean foundNodesToMerge = true; - while( foundNodesToMerge ) { - foundNodesToMerge = false; + private abstract class VertexBasedTransformer { + /** + * For testing purposes we sometimes want to test that can be transformed capabilities are working + * without actually modifying the graph */ + private boolean dontModifyGraphEvenIfPossible = false; - for( final SeqVertex v : vertexSet() ) { - foundNodesToMerge = simplifyDiamondIfPossible(v); - if ( foundNodesToMerge ) - break; - } - } - } + public boolean dontModifyGraphEvenIfPossible() { return dontModifyGraphEvenIfPossible; } + public void setDontModifyGraphEvenIfPossible() { this.dontModifyGraphEvenIfPossible = true; } - /** - * A simple structure that looks like: - * - * v - * / | \ \ - * m1 m2 m3 ... mn - * \ | / / - * b - * - * Only returns true if all outgoing edges of v go to vertices that all only connect to - * a single bottom node, and that all middle nodes have only the single edge - * - * @param v the vertex to test if its the top of a diamond pattern - * @return true if v is the root of a diamond - */ - protected boolean isRootOfDiamond(final SeqVertex v) { - final Set ve = outgoingEdgesOf(v); - if ( ve.size() <= 1 ) - return false; + /** + * Merge until the graph has no vertices that are candidates for merging + */ + public boolean transformUntilComplete() { + boolean didAtLeastOneTranform = false; + boolean foundNodesToMerge = true; + while( foundNodesToMerge ) { + foundNodesToMerge = false; - SeqVertex bottom = null; - for ( final BaseEdge e : ve ) { - final SeqVertex mi = getEdgeTarget(e); - - // all nodes must have at least 1 connection - if ( outDegreeOf(mi) < 1 ) - return false; - - // can only have 1 incoming node, the root vertex - if ( inDegreeOf(mi) != 1 ) - return false; - - // make sure that all outgoing vertices of mi go only to the bottom node - for ( final SeqVertex mt : outgoingVerticesOf(mi) ) { - if ( bottom == null ) - bottom = mt; - else if ( ! bottom.equals(mt) ) - return false; - } - } - - // bottom has some connections coming in from other nodes, don't allow - if ( inDegreeOf(bottom) != ve.size() ) - return false; - - return true; - } - - /** - * Return the longest suffix of bases shared among all provided vertices - * - * For example, if the vertices have sequences AC, CC, and ATC, this would return - * a single C. However, for ACC and TCC this would return CC. And for AC and TG this - * would return null; - * - * @param middleVertices a non-empty set of vertices - * @return - */ - @Requires("!middleVertices.isEmpty()") - private byte[] commonSuffixOfEdgeTargets(final Set middleVertices) { - final String[] kmers = new String[middleVertices.size()]; - - int i = 0; - for ( final SeqVertex v : middleVertices ) { - kmers[i++] = (StringUtils.reverse(v.getSequenceString())); - } - - final String commonPrefix = StringUtils.getCommonPrefix(kmers); - return commonPrefix.equals("") ? null : StringUtils.reverse(commonPrefix).getBytes(); - } - - /** - * Get the node that is the bottom of a diamond configuration in the graph starting at top - * - * @param top - * @return - */ - @Requires("top != null") - @Ensures({"result != null"}) - private SeqVertex getDiamondBottom(final SeqVertex top) { - final BaseEdge topEdge = outgoingEdgesOf(top).iterator().next(); - final SeqVertex middle = getEdgeTarget(topEdge); - final BaseEdge middleEdge = outgoingEdgesOf(middle).iterator().next(); - return getEdgeTarget(middleEdge); - } - - /** - * Get the set of vertices that are in the middle of a diamond starting at top - * @param top - * @return - */ - @Requires("top != null") - @Ensures({"result != null", "!result.isEmpty()"}) - final Set getMiddleVertices(final SeqVertex top) { - final Set middles = new HashSet(); - for ( final BaseEdge topToMiddle : outgoingEdgesOf(top) ) { - middles.add(getEdgeTarget(topToMiddle)); - } - return middles; - } - - /** - * Simply a diamond configuration in the current graph starting at top, if possible - * - * If top is actually the top of a diamond that can be simplified (i.e., doesn't have any - * random edges or other structure that would cause problems with the transformation), then this code - * performs the following transformation on this graph (modifying it): - * - * A -> M1 -> B, A -> M2 -> B, A -> Mn -> B - * - * becomes - * - * A -> M1' -> B', A -> M2' -> B', A -> Mn' -> B' - * - * where B' is composed of the longest common suffix of all Mi nodes + B, and Mi' are each - * middle vertex without their shared suffix. - * - * @param top a proposed vertex in this graph that might start a diamond (but doesn't have to) - * @return true top actually starts a diamond and it could be simplified - */ - private boolean simplifyDiamondIfPossible(final SeqVertex top) { - if ( ! isRootOfDiamond(top) ) - return false; - - final SeqVertex diamondBottom = getDiamondBottom(top); - final Set middleVertices = getMiddleVertices(top); - final List verticesToRemove = new LinkedList(); - final List edgesToRemove = new LinkedList(); - - // all of the edges point to the same sink, so it's time to merge - final byte[] commonSuffix = commonSuffixOfEdgeTargets(middleVertices); - if ( commonSuffix != null ) { - final BaseEdge botToNewBottom = new BaseEdge(false, 0); - final BaseEdge elimMiddleNodeEdge = new BaseEdge(false, 0); - final SeqVertex newBottomV = new SeqVertex(commonSuffix); - addVertex(newBottomV); - - for ( final SeqVertex middle : middleVertices ) { - final SeqVertex withoutSuffix = middle.withoutSuffix(commonSuffix); - final BaseEdge topToMiddleEdge = getEdge(top, middle); - final BaseEdge middleToBottomE = getEdge(middle, diamondBottom); - - // clip out the two edges, since we'll be replacing them later - edgesToRemove.add(topToMiddleEdge); - edgesToRemove.add(middleToBottomE); - - if ( withoutSuffix != null ) { // this node is a deletion - addVertex(withoutSuffix); - // update edge from top -> middle to be top -> without suffix - addEdge(top, withoutSuffix, new BaseEdge(topToMiddleEdge)); - addEdge(withoutSuffix, newBottomV, new BaseEdge(middleToBottomE)); - } else { - // this middle node is == the common suffix, wo we're removing the edge - elimMiddleNodeEdge.add(topToMiddleEdge); + for( final SeqVertex v : vertexSet() ) { + foundNodesToMerge = tryToTransform(v); + if ( foundNodesToMerge ) { + didAtLeastOneTranform = true; + break; + } } - // include the ref and multi of mid -> bot in our edge from new bot -> bot - botToNewBottom.add(middleToBottomE); - verticesToRemove.add(middle); } - // add an edge from top to new bottom, because some middle nodes were removed - if ( elimMiddleNodeEdge.getMultiplicity() > 0 ) - addEdge(top, newBottomV, elimMiddleNodeEdge); + return didAtLeastOneTranform; + } - addEdge(newBottomV, diamondBottom, botToNewBottom); + /** + * Merge, if possible, seeded on the vertex v + * @param v the proposed seed vertex to merge + * @return true if some useful merging happened, false otherwise + */ + abstract boolean tryToTransform(final SeqVertex v); + } - removeAllEdges(edgesToRemove); - removeAllVertices(verticesToRemove); - return true; - } else { - return false; + /** + * Merge diamond configurations: + * + * Performance the transformation: + * + * { A -> x + S_i + y -> Z } + * + * goes to: + * + * { A -> x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. + */ + protected class MergeDiamonds extends VertexBasedTransformer { + @Override + protected boolean tryToTransform(final SeqVertex top) { + final Set middles = outgoingVerticesOf(top); + if ( middles.size() <= 1 ) + // we can only merge if there's at least two middle nodes + return false; + + SeqVertex bottom = null; + for ( final SeqVertex mi : middles ) { + // all nodes must have at least 1 connection + if ( outDegreeOf(mi) < 1 ) + return false; + + // can only have 1 incoming node, the root vertex + if ( inDegreeOf(mi) != 1 ) + return false; + + // make sure that all outgoing vertices of mi go only to the bottom node + for ( final SeqVertex mt : outgoingVerticesOf(mi) ) { + if ( bottom == null ) + bottom = mt; + else if ( ! bottom.equals(mt) ) + return false; + } + } + + // bottom has some connections coming in from other nodes, don't allow + if ( inDegreeOf(bottom) != middles.size() ) + return false; + + if ( dontModifyGraphEvenIfPossible() ) return true; + + // actually do the merging, returning true if at least 1 base was successfully split + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, middles); + return splitter.splitAndUpdate(top, bottom, 1); + } + } + + /** + * Merge tail configurations: + * + * Performs the transformation: + * + * { A -> x + S_i + y } + * + * goes to: + * + * { A -> x -> S_i -> y } + * + * for all nodes that match this configuration. + * + * Differs from the diamond transform in that no bottom node is required + */ + protected class MergeTails extends VertexBasedTransformer { + @Override + protected boolean tryToTransform(final SeqVertex top) { + final Set tails = outgoingVerticesOf(top); + if ( tails.size() <= 1 ) + return false; + + for ( final SeqVertex t : tails ) + if ( ! isSink(t) || inDegreeOf(t) > 1 ) + return false; + + if ( dontModifyGraphEvenIfPossible() ) return true; + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, tails); + return splitter.splitAndUpdate(top, null, MIN_SUFFIX_TO_MERGE_TAILS); + } + } + + /** + * Merge headless configurations: + * + * Performs the transformation: + * + * { x + S_i + y -> Z } + * + * goes to: + * + * { x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. + * + * Differs from the diamond transform in that no top node is required + */ + protected class MergeHeadlessIncomingSources extends VertexBasedTransformer { + @Override + boolean tryToTransform(final SeqVertex bottom) { + final Set incoming = incomingVerticesOf(bottom); + if ( incoming.size() <= 1 ) + return false; + + for ( final SeqVertex inc : incoming ) + if ( ! isSource(inc) || outDegreeOf(inc) > 1 ) + return false; + + if ( dontModifyGraphEvenIfPossible() ) return true; + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, incoming); + return splitter.splitAndUpdate(null, bottom, 1); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java index b45ac0c34..523312dcf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java @@ -150,4 +150,19 @@ public class SeqVertex extends BaseVertex { final int prefixSize = sequence.length - suffix.length; return prefixSize > 0 ? new SeqVertex(Arrays.copyOf(sequence, prefixSize)) : null; } + + /** + * Return a new SeqVertex derived from this one but not including prefix or suffix bases + * + * @param prefix the previx bases to remove + * @param suffix the suffix bases to remove from this vertex + * @return a newly allocated SeqVertex + */ + @Requires("Utils.endsWith(sequence, suffix)") + public SeqVertex withoutPrefixAndSuffix(final byte[] prefix, final byte[] suffix) { + final int start = prefix.length; + final int length = sequence.length - suffix.length - prefix.length; + final int stop = start + length; + return length > 0 ? new SeqVertex(Arrays.copyOfRange(sequence, start, stop)) : null; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java new file mode 100644 index 000000000..e0501da52 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java @@ -0,0 +1,341 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.*; + +/** + * Split a collection of middle nodes in a graph into their shared prefix and suffix values + * + * This code performs the following transformation. Suppose I have a set of vertices V, such + * that each vertex is composed of sequence such that + * + * Vi = prefix + seq_i + suffix + * + * where prefix and suffix are shared sequences across all vertices V + * + * This algorithm creates a new SeqGraph with the following configuration + * + * prefix -> has outgoing edges to all seq_i + * suffix -> has incoming edges for all seq_i + * + * There are a few special cases that must be handled. First, Vi could be simply + * == to the prefix or the suffix. These generate direct connections between + * the prefix and suffix nodes, and they are handled internally by the algorithm. + * + * Note that for convenience, we will always create newTop and newBottom nodes, but + * these may be empty node (i.e., they contain no sequence). That allows them to be + * trivially merged, if desired, when the graph is incorporated into an overall + * graph. + * + * The product of this operation is a SeqGraph that contains the split. There's a + * function to merge reconnect this graph into the graph that contains the middle nodes + * + * The process guarentees a few things about the output: + * + * -- Preserves the paths and weights among all vertices + * + * It produces a graph that has some unusual properties + * + * -- May add nodes with no sequence (isEmpty() == true) to preserve connectivity among the graph + * -- May introduce edges with no multiplicity to preserve paths through the graph + * + * The overall workflow of using this class is simple: + * + * find vertices V in graph that you want to split out + * s = new SharedVertexSequenceSplitter(graph, V) + * s.updateGraph(graph) + * + * to update the graph with the modifications created by this splitter + * + * User: depristo + * Date: 3/22/13 + * Time: 8:31 AM + */ +public class SharedVertexSequenceSplitter { + final private SeqGraph outer; + final protected SeqVertex prefixV, suffixV; + final protected Collection toSplits; + + // updated in split routine + protected SeqGraph splitGraph = null; + protected Collection newMiddles = null; + protected List edgesToRemove = null; + + /** + * Create a new graph that contains the vertices in toSplitsArg with their shared suffix and prefix + * sequences extracted out. + * + * @param graph the graph containing the vertices in toSplitsArg + * @param toSplitsArg a collection of vertices to split. Must be contained within graph, and have only connections + * from a single shared top and/or bottom node + */ + public SharedVertexSequenceSplitter(final SeqGraph graph, final Collection toSplitsArg) { + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( toSplitsArg == null ) throw new IllegalArgumentException("toSplitsArg cannot be null"); + if ( toSplitsArg.size() < 2 ) throw new IllegalArgumentException("Can only split at least 2 vertices but only got " + toSplitsArg); + if ( ! graph.vertexSet().containsAll(toSplitsArg) ) throw new IllegalArgumentException("graph doesn't contain all of the vertices to split"); + + this.outer = graph; + this.toSplits = toSplitsArg; + + // all of the edges point to the same sink, so it's time to merge + final Pair prefixAndSuffix = commonPrefixAndSuffixOfVertices(toSplits); + prefixV = prefixAndSuffix.getFirst(); + suffixV = prefixAndSuffix.getSecond(); + } + + /** + * Simple single-function interface to split and then update a graph + * + * @see #updateGraph(SeqVertex, SeqVertex) for a full description of top and bottom + * + * @param top the top vertex, may be null + * @param bottom the bottom vertex, may be null + * @param minCommonSequence the minimum prefix or suffix size necessary among the vertices to split up + * before we'll go ahead and actually do the splitting. Allows one to determine + * whether there's actually any useful splitting to do, as well as protect + * yourself against spurious splitting of nodes based on trivial amounts of overall + * @return true if some useful splitting was done, false otherwise + */ + public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom, final int minCommonSequence) { + if ( prefixV.length() < minCommonSequence && suffixV.length() < minCommonSequence ) + return false; + split(); + updateGraph(top, bottom); + return true; + } + + /** + * Actually do the splitting up of the vertices + * + * Must be called before calling updateGraph + */ + public void split() { + splitGraph = new SeqGraph(); + newMiddles = new LinkedList(); + edgesToRemove = new LinkedList(); + + splitGraph.addVertices(prefixV, suffixV); + + for ( final SeqVertex mid : toSplits ) { + final BaseEdge toMid = processEdgeToRemove(mid, outer.incomingEdgeOf(mid)); + final BaseEdge fromMid = processEdgeToRemove(mid, outer.outgoingEdgeOf(mid)); + + final SeqVertex remaining = mid.withoutPrefixAndSuffix(prefixV.getSequence(), suffixV.getSequence()); + if ( remaining != null ) { + // there's some sequence prefix + seq + suffix, so add the node and make edges + splitGraph.addVertex(remaining); + newMiddles.add(remaining); + // update edge from top -> middle to be top -> without suffix + splitGraph.addEdge(prefixV, remaining, toMid); + splitGraph.addEdge(remaining, suffixV, fromMid); + } else { + // prefix + suffix completely explain this node + splitGraph.addOrUpdateEdge(prefixV, suffixV, new BaseEdge(toMid).add(fromMid)); + } + } + } + + /** + * Update graph outer, replacing the previous middle vertices that were split out with the new + * graph structure of the split, linking this subgraph into the graph at top and bot (the + * vertex connecting the middle nodes and the vertex outgoing of all middle node) + * + * @param top an optional top node that must have outgoing edges to all split vertices. If null, this subgraph + * will be added without any incoming edges + * @param bot an optional bottom node that must have incoming edges to all split vertices. If null, this subgraph + * will be added without any outgoing edges to the rest of the graph + */ + public void updateGraph(final SeqVertex top, final SeqVertex bot) { + if ( ! outer.vertexSet().containsAll(toSplits) ) throw new IllegalArgumentException("graph doesn't contain all of the original vertices to split"); + if ( top == null && bot == null ) throw new IllegalArgumentException("Cannot update graph without at least one top or bot vertex, but both were null"); + if ( top != null && ! outer.containsVertex(top) ) throw new IllegalArgumentException("top " + top + " not found in graph " + outer); + if ( bot != null && ! outer.containsVertex(bot) ) throw new IllegalArgumentException("bot " + bot + " not found in graph " + outer); + if ( splitGraph == null ) throw new IllegalStateException("Cannot call updateGraph until split() has been called"); + + outer.removeAllVertices(toSplits); + outer.removeAllEdges(edgesToRemove); + + outer.addVertices(newMiddles); + + final boolean hasPrefixSuffixEdge = splitGraph.getEdge(prefixV, suffixV) != null; + final boolean hasOnlyPrefixSuffixEdges = hasPrefixSuffixEdge && splitGraph.outDegreeOf(prefixV) == 1; + final boolean needPrefixNode = ! prefixV.isEmpty() || (top == null && ! hasOnlyPrefixSuffixEdges); + final boolean needSuffixNode = ! suffixV.isEmpty() || (bot == null && ! hasOnlyPrefixSuffixEdges); + + // if prefix / suffix are needed, keep them + final SeqVertex topForConnect = needPrefixNode ? prefixV : top; + final SeqVertex botForConnect = needSuffixNode ? suffixV : bot; + + if ( needPrefixNode ) { + outer.addVertex(prefixV); + if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 0)); + } + + if ( needSuffixNode ) { + outer.addVertex(suffixV); + if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 0)); + } + + if ( topForConnect != null ) { + for ( final BaseEdge e : splitGraph.outgoingEdgesOf(prefixV) ) { + final SeqVertex target = splitGraph.getEdgeTarget(e); + + if ( target == suffixV ) { // going straight from prefix -> suffix + if ( botForConnect != null ) + outer.addEdge(topForConnect, botForConnect, e); + } else { + outer.addEdge(topForConnect, target, e); + } + } + } + + if ( botForConnect != null ) { + for ( final BaseEdge e : splitGraph.incomingEdgesOf(suffixV) ) { + outer.addEdge(splitGraph.getEdgeSource(e), botForConnect, e); + } + } + } + + /** + * Return the longest suffix of bases shared among all provided vertices + * + * For example, if the vertices have sequences AC, CC, and ATC, this would return + * a single C. However, for ACC and TCC this would return CC. And for AC and TG this + * would return null; + * + * @param middleVertices a non-empty set of vertices + * @return + */ + @Requires("!middleVertices.isEmpty()") + protected static Pair commonPrefixAndSuffixOfVertices(final Collection middleVertices) { + final List kmers = new ArrayList(middleVertices.size()); + + int min = Integer.MAX_VALUE; + for ( final SeqVertex v : middleVertices ) { + kmers.add(v.getSequence()); + min = Math.min(min, v.getSequence().length); + } + + final int prefixLen = compPrefixLen(kmers, min); + final int suffixLen = compSuffixLen(kmers, min - prefixLen); + + final byte[] kmer = kmers.get(0); + final byte[] prefix = Arrays.copyOfRange(kmer, 0, prefixLen); + final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length); + return new Pair(new SeqVertex(prefix), new SeqVertex(suffix)); + } + + /** + * Compute the maximum shared prefix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. length among all byte[] in listOfBytes + * @return the number of shared bytes common at the start of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compPrefixLen(final List listOfBytes, final int minLength) { + for ( int i = 0; i < minLength; i++ ) { + final byte b = listOfBytes.get(0)[i]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[i] ) + return i; + } + } + + return minLength; + } + + /** + * Compute the maximum shared suffix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. length among all byte[] in listOfBytes + * @return the number of shared bytes common at the end of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compSuffixLen(final List listOfBytes, final int minLength) { + for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) { + final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] ) + return suffixLen; + } + } + return minLength; + } + + /** + * Helper function that returns an edge that we should use for splitting + * + * If e is null, creates a new 0 multiplicity edge, set to ref is any edges to V are ref + * If e is not null, returns a new copy of e, and schedules e for removal + * + * @param e a non-null edge + * @return a non-null edge + */ + @Requires("v != null") + @Ensures("result != null") + private BaseEdge processEdgeToRemove(final SeqVertex v, final BaseEdge e) { + if ( e == null ) { + // there's no edge, so we return a newly allocated one and don't schedule e for removal + // the weight must be 0 to preserve sum through the diamond + return new BaseEdge(outer.isReferenceNode(v), 0); + } else { + // schedule edge for removal, and return a freshly allocated one for our graph to use + edgesToRemove.add(e); + return new BaseEdge(e); + } + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java index cd27c7183..8f682d474 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java @@ -68,9 +68,10 @@ public class BaseVertexUnitTest extends BaseTest { new BaseVertex((byte[])null); } - @Test(expectedExceptions = IllegalArgumentException.class) + @Test() public void testCreationEmptySeq() { - new BaseVertex(new byte[0]); + final BaseVertex v = new BaseVertex(new byte[0]); + Assert.assertTrue(v.isEmpty(), "Version with length == 0 should be empty"); } @Test diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java index 83a4f4c50..6b6826e45 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java @@ -51,7 +51,6 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -181,7 +180,9 @@ public class SeqGraphUnitTest extends BaseTest { @Test(dataProvider = "IsDiamondData", enabled = true) public void testIsDiamond(final SeqGraph graph, final SeqVertex v, final boolean isRootOfDiamond) { - Assert.assertEquals(graph.isRootOfDiamond(v), isRootOfDiamond); + final SeqGraph.MergeDiamonds merger = graph.new MergeDiamonds(); + merger.setDontModifyGraphEvenIfPossible(); + Assert.assertEquals(merger.tryToTransform(v), isRootOfDiamond); } @DataProvider(name = "MergingData") @@ -267,7 +268,7 @@ public class SeqGraphUnitTest extends BaseTest { tests.add(new Object[]{graph.clone(), expected.clone()}); } - { + { // all the nodes -> lots of merging and motion of nodes final SeqGraph all = new SeqGraph(); all.addVertices(pre1, pre2, top, middle1, middle2, bottom, tail1, tail2); all.addEdges(pre1, top, middle1, bottom, tail1); @@ -277,9 +278,13 @@ public class SeqGraphUnitTest extends BaseTest { final SeqVertex newMiddle1 = new SeqVertex("G"); final SeqVertex newMiddle2 = new SeqVertex("T"); final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString()); - expected.addVertices(pre1, pre2, top, newMiddle1, newMiddle2, newBottom, tail1, tail2); - expected.addEdges(pre1, top, newMiddle1, newBottom, tail1); - expected.addEdges(pre2, top, newMiddle2, newBottom, tail2); + final SeqVertex newTop = new SeqVertex("A"); + final SeqVertex newTopDown1 = new SeqVertex("G"); + final SeqVertex newTopDown2 = new SeqVertex("C"); + final SeqVertex newTopBottomMerged = new SeqVertex("TA"); + expected.addVertices(newTop, newTopDown1, newTopDown2, newTopBottomMerged, newMiddle1, newMiddle2, newBottom, tail1, tail2); + expected.addEdges(newTop, newTopDown1, newTopBottomMerged, newMiddle1, newBottom, tail1); + expected.addEdges(newTop, newTopDown2, newTopBottomMerged, newMiddle2, newBottom, tail2); tests.add(new Object[]{all.clone(), expected.clone()}); } @@ -309,7 +314,12 @@ public class SeqGraphUnitTest extends BaseTest { @Test(dataProvider = "MergingData", enabled = true) public void testMerging(final SeqGraph graph, final SeqGraph expected) { final SeqGraph merged = (SeqGraph)graph.clone(); - merged.simplifyGraph(); + merged.simplifyGraph(1); +// if ( ! SeqGraph.graphEquals(merged, expected) ) { +// graph.printGraph(new File("graph.dot"), 0); +// merged.printGraph(new File("merged.dot"), 0); +// expected.printGraph(new File("expected.dot"), 0); +// } Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); } @@ -332,13 +342,9 @@ public class SeqGraphUnitTest extends BaseTest { graph.addEdge(mid1, bot, new BaseEdge(true, 1)); final SeqGraph expected = new SeqGraph(); - expected.addVertices(top, mid1, bot); - expected.addEdge(top, mid1, new BaseEdge(true, 2)); - expected.addEdge(mid1, bot, new BaseEdge(true, 2)); - + expected.addVertex(new SeqVertex("AACTC")); final SeqGraph actual = ((SeqGraph)graph.clone()); - actual.mergeBranchingNodes(); - - Assert.assertTrue(BaseGraph.graphEquals(actual, expected)); + actual.simplifyGraph(); + Assert.assertTrue(BaseGraph.graphEquals(actual, expected), "Wrong merging result after complete merging"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java new file mode 100644 index 000000000..52ab36064 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java @@ -0,0 +1,253 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class SharedVertexSequenceSplitterUnitTest extends BaseTest { + private final static boolean PRINT_GRAPHS = false; + + @DataProvider(name = "PrefixSuffixData") + public Object[][] makePrefixSuffixData() { + List tests = new ArrayList(); + + tests.add(new Object[]{Arrays.asList("A", "C"), 0, 0}); + tests.add(new Object[]{Arrays.asList("C", "C"), 1, 0}); + tests.add(new Object[]{Arrays.asList("ACT", "AGT"), 1, 1}); + tests.add(new Object[]{Arrays.asList("ACCT", "AGT"), 1, 1}); + tests.add(new Object[]{Arrays.asList("ACT", "ACT"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACT"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACTG"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACTGA"), 3, 1}); + tests.add(new Object[]{Arrays.asList("GCTGA", "ACTGA"), 0, 4}); + + tests.add(new Object[]{Arrays.asList("A", "C", "A"), 0, 0}); + tests.add(new Object[]{Arrays.asList("A", "A", "A"), 1, 0}); + tests.add(new Object[]{Arrays.asList("A", "AA", "A"), 1, 0}); + tests.add(new Object[]{Arrays.asList("A", "ACA", "A"), 1, 0}); + tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACT"), 2, 1}); + tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACGT"), 2, 1}); + tests.add(new Object[]{Arrays.asList("AAAT", "AAA", "CAAA"), 0, 0}); + tests.add(new Object[]{Arrays.asList("AACTTT", "AAGTTT", "AAGCTTT"), 2, 3}); + tests.add(new Object[]{Arrays.asList("AAA", "AAA", "CAAA"), 0, 3}); + tests.add(new Object[]{Arrays.asList("AAA", "AAA", "AAA"), 3, 0}); + + tests.add(new Object[]{Arrays.asList("AC", "ACA", "AC"), 2, 0}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PrefixSuffixData") + public void testPrefixSuffix(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final List bytes = new ArrayList(); + int min = Integer.MAX_VALUE; + for ( final String s : strings ) { + bytes.add(s.getBytes()); + min = Math.min(min, s.length()); + } + + final int actualPrefixLen = SharedVertexSequenceSplitter.compPrefixLen(bytes, min); + Assert.assertEquals(actualPrefixLen, expectedPrefixLen, "Failed prefix test"); + + final int actualSuffixLen = SharedVertexSequenceSplitter.compSuffixLen(bytes, min - actualPrefixLen); + Assert.assertEquals(actualSuffixLen, expectedSuffixLen, "Failed suffix test"); + } + + @Test(dataProvider = "PrefixSuffixData") + public void testPrefixSuffixVertices(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final List v = new ArrayList(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + + final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen); + final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen); + + final Pair result = SharedVertexSequenceSplitter.commonPrefixAndSuffixOfVertices(v); + Assert.assertEquals(result.getFirst().getSequenceString(), expectedPrefix, "Failed suffix test"); + Assert.assertEquals(result.getSecond().getSequenceString(), expectedSuffix, "Failed suffix test"); + + Assert.assertEquals(result.getFirst().isEmpty(), expectedPrefix.isEmpty()); + Assert.assertEquals(result.getSecond().isEmpty(), expectedSuffix.isEmpty()); + } + + @Test(dataProvider = "PrefixSuffixData") + public void testSplitter(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final SeqGraph graph = new SeqGraph(); + + final List v = new ArrayList(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + + graph.addVertices(v.toArray(new SeqVertex[]{})); + + final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen); + final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen); + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); + splitter.split(); +// splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".dot"), 0); + + Assert.assertEquals(splitter.prefixV.getSequenceString(), expectedPrefix); + Assert.assertEquals(splitter.suffixV.getSequenceString(), expectedSuffix); + + Assert.assertTrue(splitter.splitGraph.outDegreeOf(splitter.prefixV) <= strings.size()); + Assert.assertEquals(splitter.splitGraph.inDegreeOf(splitter.prefixV), 0); + + Assert.assertTrue(splitter.splitGraph.inDegreeOf(splitter.suffixV) <= strings.size()); + Assert.assertEquals(splitter.splitGraph.outDegreeOf(splitter.suffixV), 0); + + for ( final SeqVertex mid : splitter.newMiddles ) { + Assert.assertNotNull(splitter.splitGraph.getEdge(splitter.prefixV, mid)); + Assert.assertNotNull(splitter.splitGraph.getEdge(mid, splitter.suffixV)); + } + } + + @DataProvider(name = "CompleteCycleData") + public Object[][] makeCompleteCycleData() { + List tests = new ArrayList(); + + for ( final boolean hasTop : Arrays.asList(true, false) ) { + for ( final boolean hasBot : Arrays.asList(true, false) ) { + if ( ! hasTop && ! hasBot ) continue; + tests.add(new Object[]{Arrays.asList("A", "A"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "AC"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "CA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AC", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AT", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATAA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATAACA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAA", "AAA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "AAA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "CCCAAA"), hasTop, hasBot}); + + tests.add(new Object[]{Arrays.asList("A", "A", "A"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "A", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "C", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AC", "C", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CA", "C", "C"), hasTop, hasBot}); + // all merged + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGA"), hasTop, hasBot}); + // prefix and suffix + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "ACA"), hasTop, hasBot}); + // 2 -> prefix, leave C + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGAC"), hasTop, hasBot}); + // 2 -> prefix, leave CCC + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGACCC"), hasTop, hasBot}); + // 2 -> suffix, leave A/T + tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AAGA"), hasTop, hasBot}); + // 2 -> suffix, leave T, delete 1 + tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AGA"), hasTop, hasBot}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CompleteCycleData") + public void testSplitterCompleteCycle(final List strings, final boolean hasTop, final boolean hasBot) { + final SeqGraph graph = new SeqGraph(); + + int edgeWeight = 1; + final SeqVertex top = hasTop ? new SeqVertex("AAAAAAAA") : null; + final SeqVertex bot = hasBot ? new SeqVertex("GGGGGGGG") : null; + final List v = new ArrayList(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + graph.addVertices(v.toArray(new SeqVertex[]{})); + final SeqVertex first = v.get(0); + + if ( hasTop ) { + graph.addVertex(top); + for ( final SeqVertex vi : v ) + graph.addEdge(top, vi, new BaseEdge(vi == first, edgeWeight++)); + } + + if ( hasBot ) { + graph.addVertex(bot); + for ( final SeqVertex vi : v ) + graph.addEdge(vi, bot, new BaseEdge(vi == first, edgeWeight++)); + } + + final Set haplotypes = new HashSet(); + final List> originalPaths = new KBestPaths().getKBestPaths((SeqGraph)graph.clone()); + for ( final Path path : originalPaths ) + haplotypes.add(new String(path.getBases())); + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); + splitter.split(); + if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".original.dot"), 0); + if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".split.dot"), 0); + splitter.updateGraph(top, bot); + if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0); + + final List> splitPaths = new KBestPaths().getKBestPaths(graph); + for ( final Path path : splitPaths ) { + final String h = new String(path.getBases()); + Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); + } + + if ( splitPaths.size() == originalPaths.size() ) { + for ( int i = 0; i < originalPaths.size(); i++ ) { + Assert.assertTrue(splitPaths.get(i).equalScoreAndSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i)); + } + } + } +} From 464e65ea96161bcd6a213c16f22c8622dea0fc3a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 23 Mar 2013 14:10:54 -0400 Subject: [PATCH 02/13] Disable error correcting kmers by default in the HC -- The error correction algorithm can break the reference graph in some cases by error correcting us into a bad state for the reference sequence. Because we know that the error correction algorithm isn't ideal, and worse, doesn't actually seem to improve the calling itself on chr20, I've simply disabled error correction by default and allowed it to be turned on with a hidden argument. -- In the process I've changed a bit the assembly interface, moving some common arguments us into the LocalAssemblyEngine, which are turned on/off via setter methods. -- Went through the updated arguments in the HC to be @Hidden and @Advanced as appropriate -- Don't write out an errorcorrected graph when debugging and error correction isn't enabled --- .../haplotypecaller/DeBruijnAssembler.java | 44 +++++++------------ .../haplotypecaller/HaplotypeCaller.java | 25 +++++++++-- .../haplotypecaller/LocalAssemblyEngine.java | 42 ++++++++++++++++-- 3 files changed, 77 insertions(+), 34 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 6aec9c7a5..c219fab00 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -65,7 +65,6 @@ import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; -import java.io.PrintStream; import java.util.*; /** @@ -83,7 +82,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where // TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25; - public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16; private static final int GRAPH_KMER_STEP = 6; // Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode @@ -94,30 +92,23 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private final boolean debug; private final boolean debugGraphTransformations; - private final PrintStream graphWriter; private final int minKmer; - private final byte minBaseQualityToUseInAssembly; private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; - private int PRUNE_FACTOR = 2; protected DeBruijnAssembler() { - this(false, -1, null, 11, DEFAULT_MIN_BASE_QUALITY_TO_USE); + this(false, -1, 11); } public DeBruijnAssembler(final boolean debug, final int debugGraphTransformations, - final PrintStream graphWriter, - final int minKmer, - final byte minBaseQualityToUseInAssembly) { + final int minKmer) { super(); this.debug = debug; this.debugGraphTransformations = debugGraphTransformations > 0; this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations; - this.graphWriter = graphWriter; this.minKmer = minKmer; - this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; } /** @@ -126,19 +117,15 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { * @param refHaplotype reference haplotype object * @param fullReferenceWithPadding byte array holding the reference sequence with padding * @param refLoc GenomeLoc object corresponding to the reference sequence with padding - * @param PRUNE_FACTOR prune kmers from the graph if their weight is <= this value * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode * @return a non-empty list of all the haplotypes that are produced during assembly */ @Ensures({"result.contains(refHaplotype)"}) - public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final List activeAllelesToGenotype ) { + public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype ) { if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); } if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); } if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); } - if( PRUNE_FACTOR < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } - - // set the pruning factor for this run of the assembly engine - this.PRUNE_FACTOR = PRUNE_FACTOR; + if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } // create the graphs final List graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); @@ -170,13 +157,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready - if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR); - graph = graph.errorCorrect(); - if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); + + if ( shouldErrorCorrectKmers() ) { + graph = graph.errorCorrect(); + if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), pruneFactor); + } final SeqGraph seqGraph = toSeqGraph(graph); - if( seqGraph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference + if ( seqGraph != null ) { // if the graph contains interesting variation from the reference sanityCheckReferenceGraph(seqGraph, refHaplotype); graphs.add(seqGraph); @@ -192,19 +182,19 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) { final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), PRUNE_FACTOR); - seqGraph.pruneGraph(PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); + seqGraph.pruneGraph(pruneFactor); seqGraph.removeVerticesNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), pruneFactor); seqGraph.simplifyGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), pruneFactor); // if we've assembled just to the reference, just leave now otherwise removePathsNotConnectedToRef // might blow up because there's no reference source node if ( seqGraph.vertexSet().size() == 1 ) return seqGraph; seqGraph.removePathsNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), pruneFactor); return seqGraph; } @@ -295,7 +285,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { continue; } - graph.printGraph(graphWriter, false, PRUNE_FACTOR); + graph.printGraph(graphWriter, false, pruneFactor); if ( debugGraphTransformations ) break; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 81ff3dfbd..5849b5a0e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -171,6 +171,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png * */ + @Advanced @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false) protected StingSAMFileWriter bamWriter = null; private HaplotypeBAMWriter haplotypeBAMWriter; @@ -178,12 +179,14 @@ public class HaplotypeCaller extends ActiveRegionWalker implem /** * The type of BAM output we want to see. */ + @Advanced @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. */ + @Advanced @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; @@ -191,6 +194,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; + @Advanced @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 1; @@ -217,6 +221,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) protected boolean includeUnmappedReads = false; + @Advanced @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) protected boolean USE_ALLELES_TRIGGER = false; @@ -232,6 +237,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) protected boolean dontGenotype = false; + @Hidden + @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) + protected boolean errorCorrectKmers = false; + /** * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. * dbSNP is not used in any way for the calculations themselves. @@ -246,6 +255,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * Records that are filtered in the comp track will be ignored. * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). */ + @Advanced @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) public List> comps = Collections.emptyList(); public List> getCompRodBindings() { return comps; } @@ -258,6 +268,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem /** * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. */ + @Advanced @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) protected List annotationsToUse = new ArrayList(Arrays.asList(new String[]{"ClippingRankSumTest"})); @@ -265,6 +276,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, * so annotations will be excluded even if they are explicitly included with the other options. */ + @Advanced @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); @@ -277,12 +289,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @ArgumentCollection private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); + @Advanced @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) protected boolean DEBUG; + @Advanced @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) protected int debugGraphTransformations = -1; + @Hidden @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) protected boolean useLowQualityBasesForAssembly = false; @@ -388,8 +403,12 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - final byte minBaseQualityToUseInAssembly = useLowQualityBasesForAssembly ? (byte)1 : DeBruijnAssembler.DEFAULT_MIN_BASE_QUALITY_TO_USE; - assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, minBaseQualityToUseInAssembly ); + // setup the assembler + assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, minKmer); + assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); + if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); + if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); @@ -520,7 +539,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, MIN_PRUNE_FACTOR, activeAllelesToGenotype ); + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index 3efa342b1..c31405872 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -51,6 +51,7 @@ import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.variant.variantcontext.VariantContext; +import java.io.PrintStream; import java.util.List; /** @@ -59,13 +60,46 @@ import java.util.List; * Date: Mar 14, 2011 */ public abstract class LocalAssemblyEngine { + public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16; - public enum ASSEMBLER { - SIMPLE_DE_BRUIJN + protected PrintStream graphWriter = null; + protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE; + protected int pruneFactor = 2; + protected boolean errorCorrectKmers = false; + + protected LocalAssemblyEngine() { } + + public int getPruneFactor() { + return pruneFactor; } - protected LocalAssemblyEngine() { + public void setPruneFactor(int pruneFactor) { + this.pruneFactor = pruneFactor; } - public abstract List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, List activeAllelesToGenotype); + public boolean shouldErrorCorrectKmers() { + return errorCorrectKmers; + } + + public void setErrorCorrectKmers(boolean errorCorrectKmers) { + this.errorCorrectKmers = errorCorrectKmers; + } + + public PrintStream getGraphWriter() { + return graphWriter; + } + + public void setGraphWriter(PrintStream graphWriter) { + this.graphWriter = graphWriter; + } + + public byte getMinBaseQualityToUseInAssembly() { + return minBaseQualityToUseInAssembly; + } + + public void setMinBaseQualityToUseInAssembly(byte minBaseQualityToUseInAssembly) { + this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; + } + + public abstract List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List activeAllelesToGenotype); } From 1917d55dc228f450cabd669b1038e8dce861584f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Mar 2013 22:58:25 -0400 Subject: [PATCH 03/13] Bugfix for DeBruijnAssembler: don't fail when read length > haplotype length -- The previous version would generate graphs that had no reference bases at all in the situation where the reference haplotype was < the longer read length, which would cause the kmer size to exceed the reference haplotype length. Now return immediately with a null graph when this occurs as opposed to continuing and eventually causing an error --- .../gatk/walkers/haplotypecaller/DeBruijnAssembler.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index c219fab00..6343d79ef 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -216,11 +216,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) protected DeBruijnGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - final DeBruijnGraph graph = new DeBruijnGraph(KMER_LENGTH); // First pull kmers from the reference haplotype and add them to the graph - //logger.info("Adding reference sequence to graph " + refHaplotype.getBaseString()); final byte[] refSequence = refHaplotype.getBases(); if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; @@ -232,12 +230,13 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return null; } } + } else { + // not enough reference sequence to build a kmer graph of this length, return null + return null; } // Next pull kmers out of every read and throw them on the graph for( final GATKSAMRecord read : reads ) { - //if ( ! read.getReadName().equals("H06JUADXX130110:1:1213:15422:11590")) continue; - //logger.info("Adding read " + read + " with sequence " + read.getReadString()); final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced From 2472828e1c464f43d59beca9c7fe26ad4a533a2b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Mar 2013 23:42:44 -0400 Subject: [PATCH 04/13] HC bug fixes: no longer create reference graphs with cycles -- Though not intended, it was possible to create reference graphs with cycles in the case where you started the graph with a homopolymer of length > the kmer. The previous test would fail to catch this case. Now its not possible -- Lots of code cleanup and refactoring in this push. Split the monolithic createGraphFromSequences into simple calls to addReferenceKmersToGraph and addReadKmersToGraph which themselves share lower level functions like addKmerPairFromSeqToGraph. -- Fix performance problem with reduced reads and the HC, where we were calling add kmer pair for each count in the reduced read, instead of just calling it once with a multiplicity of count. -- Refactor addKmersToGraph() to use things like addOrUpdateEdge, now the code is very clear --- .../haplotypecaller/DeBruijnAssembler.java | 108 ++++++++++++------ .../haplotypecaller/DeBruijnGraph.java | 39 ++++--- .../DeBruijnAssemblerUnitTest.java | 4 +- 3 files changed, 99 insertions(+), 52 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 6343d79ef..9c6c255df 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -154,7 +154,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { continue; if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); - DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); + DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); @@ -189,10 +189,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { seqGraph.simplifyGraph(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), pruneFactor); - // if we've assembled just to the reference, just leave now otherwise removePathsNotConnectedToRef - // might blow up because there's no reference source node - if ( seqGraph.vertexSet().size() == 1 ) - return seqGraph; + // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can + // happen in cases where for example the reference somehow manages to acquire a cycle, or + // where the entire assembly collapses back into the reference sequence. + if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null ) + return null; + seqGraph.removePathsNotConnectedToRef(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), pruneFactor); @@ -214,64 +216,102 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) - protected DeBruijnGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - final DeBruijnGraph graph = new DeBruijnGraph(KMER_LENGTH); + @Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"}) + protected DeBruijnGraph createGraphFromSequences( final List reads, final int kmerLength, final Haplotype refHaplotype ) { + final DeBruijnGraph graph = new DeBruijnGraph(kmerLength); // First pull kmers from the reference haplotype and add them to the graph - final byte[] refSequence = refHaplotype.getBases(); - if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { - final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - if( !graph.addKmersToGraph(Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + KMER_LENGTH), true, 1) ) { - if( DEBUG ) { - System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping"); - } - return null; - } - } - } else { - // not enough reference sequence to build a kmer graph of this length, return null + if ( ! addReferenceKmersToGraph(graph, refHaplotype.getBases()) ) + // something went wrong, so abort right now with a null graph return null; - } + + // now go through the graph already seeded with the reference sequence and add the read kmers to it + if ( ! addReadKmersToGraph(graph, reads) ) + // some problem was detected adding the reads to the graph, return null to indicate we failed + return null; + + graph.cleanNonRefPaths(); + return graph; + } + + /** + * Add the high-quality kmers from the reads to the graph + * + * @param graph a graph to add the read kmers to + * @param reads a non-null list of reads whose kmers we want to add to the graph + * @return true if we successfully added the read kmers to the graph without corrupting it in some way + */ + protected boolean addReadKmersToGraph(final DeBruijnGraph graph, final List reads) { + final int kmerLength = graph.getKmerSize(); // Next pull kmers out of every read and throw them on the graph for( final GATKSAMRecord read : reads ) { final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced - if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) { - final int kmersInSequence = sequence.length - KMER_LENGTH + 1; + if( sequence.length > kmerLength + KMER_OVERLAP ) { + final int kmersInSequence = sequence.length - kmerLength + 1; for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { // if the qualities of all the bases in the kmers are high enough boolean badKmer = false; - for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) { + for( int jjj = iii; jjj < iii + kmerLength + 1; jjj++) { if( qualities[jjj] < minBaseQualityToUseInAssembly ) { badKmer = true; break; } } if( !badKmer ) { + // how many observations of this kmer have we seen? A normal read counts for 1, but + // a reduced read might imply a higher multiplicity for our the edge int countNumber = 1; if( read.isReducedRead() ) { // compute mean number of reduced read counts in current kmer span // precise rounding can make a difference with low consensus counts - countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + KMER_LENGTH)); + // TODO -- optimization: should extend arrayMax function to take start stop values + countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + kmerLength)); } - final byte[] kmer1 = Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH); - final byte[] kmer2 = Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH); - - for( int kkk=0; kkk < countNumber; kkk++ ) { - graph.addKmersToGraph(kmer1, kmer2, false, 1); - } + graph.addKmerPairFromSeqToGraph(sequence, iii, false, countNumber); } } } } - graph.cleanNonRefPaths(); - return graph; + // always returns true now, but it's possible that we'd add reads and decide we don't like the graph in some way + return true; + } + + /** + * Add the kmers from the reference sequence to the DeBruijnGraph + * + * @param graph the graph to add the reference kmers to. Must be empty + * @param refSequence the reference sequence from which we'll get our kmers + * @return true if we succeeded in creating a good graph from the reference sequence, false otherwise + */ + protected boolean addReferenceKmersToGraph(final DeBruijnGraph graph, final byte[] refSequence) { + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( graph.vertexSet().size() != 0 ) throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + graph.vertexSet().size() + " vertices in it already: " + graph); + if ( refSequence == null ) throw new IllegalArgumentException("refSequence cannot be null"); + + + final int kmerLength = graph.getKmerSize(); + if( refSequence.length < kmerLength + KMER_OVERLAP ) { + // not enough reference sequence to build a kmer graph of this length, return null + return false; + } + + final int kmersInSequence = refSequence.length - kmerLength + 1; + for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { + graph.addKmerPairFromSeqToGraph(refSequence, iii, true, 1); + } + + // we expect that every kmer in the sequence is unique, so that the graph has exactly kmersInSequence vertices + if ( graph.vertexSet().size() != kmersInSequence ) { + if( debug ) logger.info("Cycle detected in reference graph for kmer = " + kmerLength + " ...skipping"); + return false; + } + + return true; } protected void printGraphs(final List graphs) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java index 0e20c311b..fd8581254 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java @@ -126,30 +126,37 @@ public class DeBruijnGraph extends BaseGraph { * @param kmer1 the source kmer for the edge * @param kmer2 the target kmer for the edge * @param isRef true if the added edge is a reference edge - * @return will return false if trying to add a reference edge which creates a cycle in the assembly graph */ - public boolean addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) { + public void addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) { if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); } - final int numVertexBefore = vertexSet().size(); final DeBruijnVertex v1 = new DeBruijnVertex( kmer1 ); - addVertex(v1); final DeBruijnVertex v2 = new DeBruijnVertex( kmer2 ); - addVertex(v2); - if( isRef && vertexSet().size() == numVertexBefore ) { return false; } + final BaseEdge toAdd = new BaseEdge(isRef, multiplicity); - final BaseEdge targetEdge = getEdge(v1, v2); - if ( targetEdge == null ) { - addEdge(v1, v2, new BaseEdge( isRef, multiplicity )); - } else { - if( isRef ) { - targetEdge.setIsRef( true ); - } - targetEdge.setMultiplicity(targetEdge.getMultiplicity() + multiplicity); - } - return true; + addVertices(v1, v2); + addOrUpdateEdge(v1, v2, toAdd); + } + + /** + * Higher-level interface to #addKmersToGraph that adds a pair of kmers from a larger sequence of bytes to this + * graph. The kmers start at start (first) and start + 1 (second) have have length getKmerSize(). The + * edge between them is added with isRef and multiplicity + * + * @param sequence a sequence of bases from which we want to extract a pair of kmers + * @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 2 - getKmerSize() + * @param isRef should the edge between the two kmers be a reference edge? + * @param multiplicity what's the multiplicity of the edge between these two kmers + */ + public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final boolean isRef, final int multiplicity ) { + if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); + if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start); + if ( start + 1 + getKmerSize() > sequence.length ) throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length); + final byte[] kmer1 = Arrays.copyOfRange(sequence, start, start + getKmerSize()); + final byte[] kmer2 = Arrays.copyOfRange(sequence, start + 1, start + 1 + getKmerSize()); + addKmersToGraph(kmer1, kmer2, isRef, multiplicity); } /** diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index 663d619a8..cce623b76 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -72,8 +72,8 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { public void testReferenceCycleGraph() { String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC"; String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC"; - final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true), false); - final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true), false); + final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true)); + final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true)); Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation."); Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation."); From ad04fdb23313ae9439887d829b236fde18c87c84 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 24 Mar 2013 14:41:21 -0400 Subject: [PATCH 05/13] PerReadAlleleLikelihoodMap getMostLikelyAllele returns an MostLikelyAllele objects now -- This new functionality allows the client to make decisions about how to handle non-informative reads, rather than having a single enforced constant that isn't really appropriate for all users. The previous functionality is maintained now and used by all of the updated pieces of code, except the BAM writers, which now emit reads to display to their best allele, regardless of whether this is particularly informative or not. That way you can see all of your data realigned to the new HC structure, rather than just those that are specifically informative. -- This all makes me concerned that the informative thresholding isn't appropriately used in the annotations themselves. There are many cases where nearby variation makes specific reads non-informative about one event, due to not being informative about the second. For example, suppose you have two SNPs A/B and C/D that are in the same active region but separated by more than the read length of the reads. All reads would be non-informative as no read provides information about the full combination of 4 haplotypes, as they reads only span a single event. In this case our annotations will all fall apart, returning their default values. Added a JIRA to address this (should be discussed in group meeting) --- .../annotator/BaseQualityRankSumTest.java | 13 +- .../annotator/ClippingRankSumTest.java | 9 +- .../annotator/DepthPerAlleleBySample.java | 9 +- .../gatk/walkers/annotator/FisherStrand.java | 5 +- .../annotator/MappingQualityRankSumTest.java | 9 +- .../walkers/annotator/ReadPosRankSumTest.java | 9 +- .../genotyper/MostLikelyAlleleUnitTest.java | 114 ++++++++++++++++ .../utils/genotyper/MostLikelyAllele.java | 127 ++++++++++++++++++ .../genotyper/PerReadAlleleLikelihoodMap.java | 30 ++--- .../AllHaplotypeBAMWriter.java | 52 +++---- .../CalledHaplotypeBAMWriter.java | 52 +++---- 11 files changed, 333 insertions(+), 96 deletions(-) create mode 100644 protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 04f9e87c7..a3a9e50e9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; @@ -90,13 +91,13 @@ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnot } for (Map el : alleleLikelihoodMap.getLikelihoodMapValues()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el); - if (a.isNoCall()) + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el); + if (! a.isInformative()) continue; // read is non-informative - if (a.isReference()) - refQuals.add(-10.0*(double)el.get(a)); - else if (allAlleles.contains(a)) - altQuals.add(-10.0*(double)el.get(a)); + if (a.getMostLikelyAllele().isReference()) + refQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele())); + else if (allAlleles.contains(a.getMostLikelyAllele())) + altQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele())); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index 90ca5c667..366512119 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; @@ -84,12 +85,12 @@ public class ClippingRankSumTest extends RankSumTest { for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if (a.isNoCall()) + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (! a.isInformative()) continue; // read is non-informative - if (a.isReference()) + if (a.getMostLikelyAllele().isReference()) refQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); - else if (allAlleles.contains(a)) + else if (allAlleles.contains(a.getMostLikelyAllele())) altQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 9f90a1308..1cf91f181 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; @@ -141,12 +142,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa } for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { final GATKSAMRecord read = el.getKey(); - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if (a.isNoCall()) + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (! a.isInformative() ) continue; // read is non-informative - if (!vc.getAlleles().contains(a)) + if (!vc.getAlleles().contains(a.getMostLikelyAllele())) continue; // sanity check - shouldn't be needed - alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); + alleleCounts.put(a.getMostLikelyAllele(), alleleCounts.get(a.getMostLikelyAllele()) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); } final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 7960a3ce2..e4dcaec54 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -273,10 +274,10 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - final Allele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); final GATKSAMRecord read = el.getKey(); final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; - updateTable(table, mostLikelyAllele, read, ref, alt, representativeCount); + updateTable(table, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt, representativeCount); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index b30df04a8..3873138a2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; @@ -92,13 +93,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn return; } for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); // BUGBUG: There needs to be a comparable isUsableBase check here - if (a.isNoCall()) + if (! a.isInformative()) continue; // read is non-informative - if (a.isReference()) + if (a.getMostLikelyAllele().isReference()) refQuals.add((double)el.getKey().getMappingQuality()); - else if (allAlleles.contains(a)) + else if (allAlleles.contains(a.getMostLikelyAllele())) altQuals.add((double)el.getKey().getMappingQuality()); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 182a9226f..6ce4aab49 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -51,6 +51,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -107,8 +108,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio } for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if (a.isNoCall()) + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (! a.isInformative() ) continue; // read is non-informative final GATKSAMRecord read = el.getKey(); @@ -123,9 +124,9 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio if (readPos > numAlignedBases / 2) readPos = numAlignedBases - (readPos + 1); - if (a.isReference()) + if (a.getMostLikelyAllele().isReference()) refQuals.add((double)readPos); - else if (allAlleles.contains(a)) + else if (allAlleles.contains(a.getMostLikelyAllele())) altQuals.add((double)readPos); } } diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java new file mode 100644 index 000000000..cf077392b --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java @@ -0,0 +1,114 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.genotyper; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.variant.variantcontext.Allele; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class MostLikelyAlleleUnitTest extends BaseTest { + final Allele a = Allele.create("A"); + + @Test + public void testBasicCreation() { + final double second = -1 - MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1; + MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second); + Assert.assertEquals(mla.getMostLikelyAllele(), a); + Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0); + Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), second); + + Assert.assertEquals(mla.isInformative(), true); + Assert.assertEquals(mla.isInformative(10), false); + Assert.assertEquals(mla.isInformative(0), true); + Assert.assertEquals(mla.getAlleleIfInformative(), a); + Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL); + Assert.assertEquals(mla.getAlleleIfInformative(0), a); + } + + @Test + public void testNotDefaultInformative() { + final double second = -1.0 - (MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1e-2); + MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second); + Assert.assertEquals(mla.isInformative(), false); + Assert.assertEquals(mla.isInformative(10), false); + Assert.assertEquals(mla.isInformative(0), true); + Assert.assertEquals(mla.getAlleleIfInformative(), Allele.NO_CALL); + Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL); + Assert.assertEquals(mla.getAlleleIfInformative(0), a); + } + + @Test + public void testCreationNoGoodSecond() { + MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, Double.NEGATIVE_INFINITY); + Assert.assertEquals(mla.getMostLikelyAllele(), a); + Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0); + Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY); + + Assert.assertEquals(mla.isInformative(), true); + Assert.assertEquals(mla.isInformative(10), true); + Assert.assertEquals(mla.isInformative(0), true); + Assert.assertEquals(mla.getAlleleIfInformative(), a); + Assert.assertEquals(mla.getAlleleIfInformative(10), a); + Assert.assertEquals(mla.getAlleleIfInformative(0), a); + } + + @Test + public void testCreationNoAllele() { + MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY); + Assert.assertEquals(mla.getMostLikelyAllele(), Allele.NO_CALL); + Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), Double.NEGATIVE_INFINITY); + Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY); + + Assert.assertEquals(mla.isInformative(), false); + Assert.assertEquals(mla.isInformative(10), false); + Assert.assertEquals(mla.isInformative(0), false); + Assert.assertEquals(mla.getAlleleIfInformative(), Allele.NO_CALL); + Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL); + Assert.assertEquals(mla.getAlleleIfInformative(0), Allele.NO_CALL); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java new file mode 100644 index 000000000..e12fb546c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java @@ -0,0 +1,127 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.genotyper; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.variantcontext.Allele; + +/** + * Stores the most likely and second most likely alleles, along with a threshold + * for assuming computing that a read is informative. + * + * If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD + * then the most likely allele is set to "no call", and isInformative will return false. This constant can be + * overridden simply by using one of the version of these calls that accepts informative threshold as an argument. + * + * For convenience, there are functions called getAlleleIfInformative that return either the most likely allele, or + * NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD of one another. + * + * By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the + * corresponding key + * + * User: depristo + * Date: 3/24/13 + * Time: 1:39 PM + */ +public final class MostLikelyAllele { + public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2; + + final Allele mostLikely; + final double log10LikelihoodOfMostLikely; + final double log10LikelihoodOfSecondBest; + + /** + * Create a new MostLikelyAllele + * + * If there's a meaningful most likely allele, allele should be a real allele. If none can be determined, + * mostLikely should be a NO_CALL allele. + * + * @param mostLikely the most likely allele + * @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele + * @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available) + */ + public MostLikelyAllele(Allele mostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) { + if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null"); + if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) ) + throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely); + if ( log10LikelihoodOfSecondBest != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfSecondBest) ) + throw new IllegalArgumentException("log10LikelihoodOfSecondBest must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfSecondBest); + if ( log10LikelihoodOfMostLikely < log10LikelihoodOfSecondBest ) + throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be <= log10LikelihoodOfSecondBest but got " + log10LikelihoodOfMostLikely + " vs 2nd " + log10LikelihoodOfSecondBest); + + this.mostLikely = mostLikely; + this.log10LikelihoodOfMostLikely = log10LikelihoodOfMostLikely; + this.log10LikelihoodOfSecondBest = log10LikelihoodOfSecondBest; + } + + public Allele getMostLikelyAllele() { + return mostLikely; + } + + public double getLog10LikelihoodOfMostLikely() { + return log10LikelihoodOfMostLikely; + } + + public double getLog10LikelihoodOfSecondBest() { + return log10LikelihoodOfSecondBest; + } + + /** + * @see #isInformative(double) with threshold of INFORMATIVE_LIKELIHOOD_THRESHOLD + */ + public boolean isInformative() { + return isInformative(INFORMATIVE_LIKELIHOOD_THRESHOLD); + } + + /** + * Was this allele selected from an object that was specifically informative about the allele? + * + * The calculation that implements this is whether the likelihood of the most likely allele is larger + * than the second most likely by at least the log10ThresholdForInformative + * + * @return true if so, false if not + */ + public boolean isInformative(final double log10ThresholdForInformative) { + return getLog10LikelihoodOfMostLikely() - getLog10LikelihoodOfSecondBest() > log10ThresholdForInformative; + } + + /** + * @see #getAlleleIfInformative(double) with threshold of INFORMATIVE_LIKELIHOOD_THRESHOLD + */ + public Allele getAlleleIfInformative() { + return getAlleleIfInformative(INFORMATIVE_LIKELIHOOD_THRESHOLD); + } + + /** + * Get the most likely allele if isInformative(log10ThresholdForInformative) is true, or NO_CALL otherwise + * + * @param log10ThresholdForInformative a log10 threshold to determine if the most likely allele was informative + * @return a non-null allele + */ + public Allele getAlleleIfInformative(final double log10ThresholdForInformative) { + return isInformative(log10ThresholdForInformative) ? getMostLikelyAllele() : Allele.NO_CALL; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 5e010db67..02618100d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -41,10 +41,6 @@ import java.util.*; * For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood. */ public class PerReadAlleleLikelihoodMap { - - - public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2; - protected List alleles; protected Map> likelihoodReadMap; @@ -119,9 +115,9 @@ public class PerReadAlleleLikelihoodMap { for ( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { // do not remove reduced reads! if ( !entry.getKey().isReducedRead() ) { - final Allele bestAllele = getMostLikelyAllele(entry.getValue()); - if ( bestAllele != Allele.NO_CALL ) - alleleReadMap.get(bestAllele).add(entry.getKey()); + final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue()); + if ( bestAllele.isInformative() ) + alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey()); } } @@ -194,32 +190,25 @@ public class PerReadAlleleLikelihoodMap { /** * Given a map from alleles to likelihoods, find the allele with the largest likelihood. - * If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD - * then the most likely allele is set to "no call" + * * @param alleleMap - a map from alleles to likelihoods - * @return - the most likely allele, or NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD - * of one another. By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the - * corresponding key + * @return - a MostLikelyAllele object */ @Ensures("result != null") - public static Allele getMostLikelyAllele( final Map alleleMap ) { + public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap ) { return getMostLikelyAllele(alleleMap, null); } /** * Given a map from alleles to likelihoods, find the allele with the largest likelihood. - * If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD - * then the most likely allele is set to "no call" * * @param alleleMap - a map from alleles to likelihoods * @param onlyConsiderTheseAlleles if not null, we will only consider alleles in this set for being one of the best. * this is useful for the case where you've selected a subset of the alleles that * the reads have been computed for further analysis. If null totally ignored - * @return - the most likely allele, or NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD - * of one another. By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the - * corresponding key + * @return - a MostLikelyAllele object */ - public static Allele getMostLikelyAllele( final Map alleleMap, final Set onlyConsiderTheseAlleles ) { + public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap, final Set onlyConsiderTheseAlleles ) { if ( alleleMap == null ) throw new IllegalArgumentException("The allele to likelihood map cannot be null"); double maxLike = Double.NEGATIVE_INFINITY; double prevMaxLike = Double.NEGATIVE_INFINITY; @@ -237,7 +226,8 @@ public class PerReadAlleleLikelihoodMap { prevMaxLike = el.getValue(); } } - return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? mostLikelyAllele : Allele.NO_CALL ); + + return new MostLikelyAllele(mostLikelyAllele, maxLike, prevMaxLike); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java index 46ffd43b6..f6fa44ac5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java @@ -1,27 +1,27 @@ /* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ package org.broadinstitute.sting.utils.haplotypeBAMWriter; @@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.variant.variantcontext.Allele; @@ -71,9 +72,8 @@ class AllHaplotypeBAMWriter extends HaplotypeBAMWriter { // next, output the interesting reads for each sample aligned against the appropriate haplotype for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { - final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); - if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); + final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart()); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java index a33ed809a..aae00c3ea 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -1,33 +1,34 @@ /* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ package org.broadinstitute.sting.utils.haplotypeBAMWriter; import net.sf.samtools.SAMFileWriter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.variant.variantcontext.Allele; @@ -77,9 +78,8 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter { for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { if ( entry.getKey().getMappingQuality() > 0 ) { - final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); - if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); + final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart()); } } } From 12475cc0276b27d4cf890779b96716c2f7113754 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 24 Mar 2013 14:42:48 -0400 Subject: [PATCH 06/13] Display the active MappingQualityFilter if mmq > 0 in the HaplotypeCaller --- .../HCMappingQualityFilter.java | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java index 3892ffe27..21b66986a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java @@ -1,32 +1,34 @@ /* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.filters.ReadFilter; /** @@ -35,9 +37,17 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter; * @author mdepristo */ public class HCMappingQualityFilter extends ReadFilter { + private final static Logger logger = Logger.getLogger(HCMappingQualityFilter.class); + @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for analysis with the HaplotypeCaller", required = false) public int MIN_MAPPING_QUALTY_SCORE = 20; + @Override + public void initialize(GenomeAnalysisEngine engine) { + if ( MIN_MAPPING_QUALTY_SCORE > 0 ) + logger.info("Filtering out reads with MAPQ < " + MIN_MAPPING_QUALTY_SCORE); + } + public boolean filterOut(SAMRecord rec) { return (rec.getMappingQuality() < MIN_MAPPING_QUALTY_SCORE); } From 78c672676b25b46a6a302703a7e77afb3f0ff996 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 24 Mar 2013 17:20:30 -0400 Subject: [PATCH 07/13] Bugfix for pruning and removing non-reference edges in graph -- Previous algorithms were applying pruneGraph inappropriately on the raw sequence graph (where each vertex is a single base). This results in overpruning of the graph, as prunegraph really relied on the zipping of linear chains (and the sharing of weight this provides) to avoid over-pruning the graph. Probably we should think hard about this. This commit fixes this logic, so we zip the graph between pruning -- In this process ID's a fundamental problem with how we were trimming away vertices that occur on a path from the reference source to sink. In fact, we were leaving in any vertex that happened to be accessible from source, any vertices in cycles, and any vertex that wasn't the absolute end of a chain going to a sink. The new algorithm fixes all of this, using a BaseGraphIterator that's a general approach to walking the base graph. Other routines that use the same traversal idiom refactored to use this iterator. Added unit tests for all of these capabilities. -- Created new BaseGraphIterator, which abstracts common access patterns to graph, and use this where appropriate --- .../walkers/haplotypecaller/BaseGraph.java | 128 ++++++++++-------- .../haplotypecaller/BaseGraphIterator.java | 120 ++++++++++++++++ .../haplotypecaller/DeBruijnAssembler.java | 15 +- .../haplotypecaller/BaseGraphUnitTest.java | 86 ++++++++++-- 4 files changed, 279 insertions(+), 70 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphIterator.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index c3f371ec7..80e5148db 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -336,9 +336,18 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph PRUNE_FACTOR ) { graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); -// } if( edge.isRef() ) { graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); } - //if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } } for( final T v : vertexSet() ) { @@ -436,6 +441,30 @@ public class BaseGraph extends DefaultDirectedGraph edgesToRemove = new ArrayList(); + for( final BaseEdge e : edgeSet() ) { + if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor + edgesToRemove.add(e); + } + } + removeAllEdges(edgesToRemove); + + removeSingletonOrphanVertices(); + } + + /** + * Remove all vertices in the graph that have in and out degree of 0 + */ + protected void removeSingletonOrphanVertices() { // Run through the graph and clean up singular orphaned nodes final List verticesToRemove = new LinkedList(); for( final T v : vertexSet() ) { @@ -446,65 +475,54 @@ public class BaseGraph extends DefaultDirectedGraph toRemove = new HashSet(vertexSet()); + + final T refV = getReferenceSourceVertex(); + if ( refV != null ) { + for ( final T v : new BaseGraphIterator(this, refV, true, true) ) { + toRemove.remove(v); + } + } + + removeAllVertices(toRemove); + } + + /** + * Remove all vertices in the graph that aren't on a path from the reference source vertex to the reference sink vertex + * + * More aggressive reference pruning algorithm than removeVerticesNotConnectedToRefRegardlessOfEdgeDirection, + * as it requires vertices to not only be connected by a series of directed edges but also prunes away + * paths that do not also meet eventually with the reference sink vertex + */ protected void removePathsNotConnectedToRef() { if ( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) { throw new IllegalStateException("Graph must have ref source and sink vertices"); } + // get the set of vertices we can reach by going forward from the ref source + final Set onPathFromRefSource = new HashSet(vertexSet().size()); + for ( final T v : new BaseGraphIterator(this, getReferenceSourceVertex(), false, true) ) { + onPathFromRefSource.add(v); + } + + // get the set of vertices we can reach by going backward from the ref sink + final Set onPathFromRefSink = new HashSet(vertexSet().size()); + for ( final T v : new BaseGraphIterator(this, getReferenceSinkVertex(), true, false) ) { + onPathFromRefSink.add(v); + } + + // we want to remove anything that's not in both the sink and source sets final Set verticesToRemove = new HashSet(vertexSet()); - final DepthFirstIterator dfi = new DepthFirstIterator(this, getReferenceSourceVertex()); - while ( dfi.hasNext() ) { - final T accessibleFromRefSource = dfi.next(); - // we also want to prune all sinks that aren't the reference sink - if ( ! isNonRefSink(accessibleFromRefSource) ) - verticesToRemove.remove(accessibleFromRefSource); - } - + onPathFromRefSource.retainAll(onPathFromRefSink); + verticesToRemove.removeAll(onPathFromRefSource); removeAllVertices(verticesToRemove); } - protected void pruneGraph( final int pruneFactor ) { - final List edgesToRemove = new ArrayList(); - for( final BaseEdge e : edgeSet() ) { - if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor - edgesToRemove.add(e); - } - } - removeAllEdges(edgesToRemove); - - // Run through the graph and clean up singular orphaned nodes - final List verticesToRemove = new ArrayList(); - for( final T v : vertexSet() ) { - if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) { - verticesToRemove.add(v); - } - } - - removeAllVertices(verticesToRemove); - } - - public void removeVerticesNotConnectedToRef() { - final HashSet toRemove = new HashSet(vertexSet()); - final HashSet visited = new HashSet(); - - final LinkedList toVisit = new LinkedList(); - final T refV = getReferenceSourceVertex(); - if ( refV != null ) { - toVisit.add(refV); - while ( ! toVisit.isEmpty() ) { - final T v = toVisit.pop(); - if ( ! visited.contains(v) ) { - toRemove.remove(v); - visited.add(v); - for ( final T prev : incomingVerticesOf(v) ) toVisit.add(prev); - for ( final T next : outgoingVerticesOf(v) ) toVisit.add(next); - } - } - - removeAllVertices(toRemove); - } - } - /** * Semi-lenient comparison of two graphs, truing true if g1 and g2 have similar structure * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphIterator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphIterator.java new file mode 100644 index 000000000..8841f835c --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphIterator.java @@ -0,0 +1,120 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; + +/** + * General iterator that can iterate over all vertices in a BaseGraph, following either + * incoming, outgoing edge (as well as both or none) edges. Supports traversal of graphs + * with cycles and other crazy structures. Will only ever visit each vertex once. The + * order in which the vertices are visited is undefined. + * + * User: depristo + * Date: 3/24/13 + * Time: 4:41 PM + */ +public class BaseGraphIterator implements Iterator, Iterable { + final HashSet visited = new HashSet(); + final LinkedList toVisit = new LinkedList(); + final BaseGraph graph; + final boolean followIncomingEdges, followOutgoingEdges; + + /** + * Create a new BaseGraphIterator starting its traversal at start + * + * Note that if both followIncomingEdges and followOutgoingEdges are false, we simply return the + * start vertex + * + * @param graph the graph to iterator over. Cannot be null + * @param start the vertex to start at. Cannot be null + * @param followIncomingEdges should we follow incoming edges during our + * traversal? (goes backward through the graph) + * @param followOutgoingEdges should we follow outgoing edges during out traversal? + */ + public BaseGraphIterator(final BaseGraph graph, final T start, + final boolean followIncomingEdges, final boolean followOutgoingEdges) { + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( start == null ) throw new IllegalArgumentException("start cannot be null"); + if ( ! graph.containsVertex(start) ) throw new IllegalArgumentException("start " + start + " must be in graph but it isn't"); + this.graph = graph; + this.followIncomingEdges = followIncomingEdges; + this.followOutgoingEdges = followOutgoingEdges; + + toVisit.add(start); + } + + @Override + public Iterator iterator() { + return this; + } + + @Override + public boolean hasNext() { + return ! toVisit.isEmpty(); + } + + @Override + public T next() { + final T v = toVisit.pop(); + + if ( ! visited.contains(v) ) { + visited.add(v); + if ( followIncomingEdges ) for ( final T prev : graph.incomingVerticesOf(v) ) toVisit.add(prev); + if ( followOutgoingEdges ) for ( final T next : graph.outgoingVerticesOf(v) ) toVisit.add(next); + } + + return v; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Doesn't implement remove"); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 9c6c255df..e9961519c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -183,11 +183,18 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) { final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); + + // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive + seqGraph.zipLinearChains(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); + + // now go through and prune the graph, removing vertices no longer connected to the reference chain seqGraph.pruneGraph(pruneFactor); - seqGraph.removeVerticesNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), pruneFactor); + seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); + + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor); seqGraph.simplifyGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), pruneFactor); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor); // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can // happen in cases where for example the reference somehow manages to acquire a cycle, or @@ -196,7 +203,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return null; seqGraph.removePathsNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), pruneFactor); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.refcleaned.dot"), pruneFactor); return seqGraph; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java index 463e861b1..db4127ddb 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java @@ -49,22 +49,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.BeforeMethod; -import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; import scala.actors.threadpool.Arrays; import java.io.File; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; +import java.util.*; -/** - * Created with IntelliJ IDEA. - * User: depristo - * Date: 3/15/13 - * Time: 3:36 PM - * To change this template use File | Settings | File Templates. - */ public class BaseGraphUnitTest extends BaseTest { SeqGraph graph; SeqVertex v1, v2, v3, v4, v5; @@ -105,6 +95,80 @@ public class BaseGraphUnitTest extends BaseTest { assertVertexSetEquals(graph.incomingVerticesOf(v5), v4); } + @Test + public void testRemoveSingletonOrphanVertices() throws Exception { + // all vertices in graph are connected + final List kept = new LinkedList(graph.vertexSet()); + final SeqVertex rm1 = new SeqVertex("CAGT"); + final SeqVertex rm2 = new SeqVertex("AGTC"); + graph.addVertices(rm1, rm2); + Assert.assertEquals(graph.vertexSet().size(), kept.size() + 2); + final BaseEdge rm12e = new BaseEdge(false, 1); + graph.addEdge(rm1, rm2, rm12e); + + final SeqGraph original = (SeqGraph)graph.clone(); + graph.removeSingletonOrphanVertices(); + Assert.assertTrue(BaseGraph.graphEquals(original, graph), "Graph with disconnected component but edges between components shouldn't be modified"); + + graph.removeEdge(rm12e); // now we should be able to remove rm1 and rm2 + graph.removeSingletonOrphanVertices(); + Assert.assertTrue(graph.vertexSet().containsAll(kept)); + Assert.assertFalse(graph.containsVertex(rm1)); + Assert.assertFalse(graph.containsVertex(rm2)); + } + + @Test + public void testRemovePathsNotConnectedToRef() throws Exception { + final SeqGraph graph = new SeqGraph(); + + SeqVertex src = new SeqVertex("A"); + SeqVertex end = new SeqVertex("A"); + SeqVertex g1 = new SeqVertex("C"); + SeqVertex g2 = new SeqVertex("G"); + SeqVertex g3 = new SeqVertex("T"); + SeqVertex g4 = new SeqVertex("AA"); + SeqVertex g5 = new SeqVertex("AA"); + SeqVertex g6 = new SeqVertex("AA"); + SeqVertex g8 = new SeqVertex("AA"); + SeqVertex g7 = new SeqVertex("AA"); + SeqVertex b1 = new SeqVertex("CC"); + SeqVertex b2 = new SeqVertex("GG"); + SeqVertex b3 = new SeqVertex("TT"); + SeqVertex b4 = new SeqVertex("AAA"); + SeqVertex b5 = new SeqVertex("CCC"); + SeqVertex b6 = new SeqVertex("GGG"); + SeqVertex b7 = new SeqVertex("AAAA"); + SeqVertex b8 = new SeqVertex("GGGG"); + SeqVertex b9 = new SeqVertex("CCCC"); + + graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8); + graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end); + graph.addEdges(src, g1, g5, g6, g7, end); + graph.addEdges(src, g1, g5, g8, g7, end); + graph.addEdges(src, g1, g3, end); + + // the current state of the graph is the good one + final SeqGraph good = (SeqGraph)graph.clone(); + + // now add the bads to the graph + graph.addVertices(b1, b2, b3, b4, b5, b6, b7, b8, b9); + graph.addEdges(src, b1); // source -> b1 is dead + graph.addEdges(b6, src); // x -> source is bad + graph.addEdges(g4, b2); // off random vertex is bad + graph.addEdges(g3, b3, b4); // two vertices that don't connect to end are bad + graph.addEdges(end, b5); // vertex off end is bad + graph.addEdges(g3, b7, b8, b7); // cycle is bad + graph.addEdges(g3, b9, b9); // self-cycle is bad + + final boolean debug = true; + if ( debug ) good.printGraph(new File("expected.dot"), 0); + if ( debug ) graph.printGraph(new File("bad.dot"), 0); + graph.removePathsNotConnectedToRef(); + if ( debug ) graph.printGraph(new File("actual.dot"), 0); + + Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); + } + @Test public void testPrintEmptyGraph() throws Exception { final File tmp = File.createTempFile("tmp", "dot"); From a97576384d7953109de2a02c71b49e978f075738 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 25 Mar 2013 18:29:16 -0400 Subject: [PATCH 08/13] Fix bug in the HC not respecting the requested pruning --- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 1 + 1 file changed, 1 insertion(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 5849b5a0e..7e2211502 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -406,6 +406,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // setup the assembler assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, minKmer); assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); + assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1); From b1b615b668777b63247be224f9ef696b065e01f9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 25 Mar 2013 18:31:23 -0400 Subject: [PATCH 09/13] BaseGraph shouldn't implement getEdge -- no idea why I added this --- .../gatk/walkers/haplotypecaller/BaseGraph.java | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index 80e5148db..9872a370b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -569,20 +569,6 @@ public class BaseGraph extends DefaultDirectedGraph Date: Mon, 25 Mar 2013 18:38:12 -0400 Subject: [PATCH 10/13] Increase max cigar elements from SW before failing path creation to 20 from 6 -- This allows more diversity in paths, which is sometimes necessary when we cannot simply graphs that have large bubbles --- .../sting/gatk/walkers/haplotypecaller/Path.java | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java index 4adfe6612..269adcc22 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java @@ -68,6 +68,8 @@ import java.util.*; * */ class Path { + private final static int MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW = 20; + // the last vertex seen in the path private final T lastVertex; @@ -357,7 +359,7 @@ class Path { } final Cigar swCigar = swConsensus.getCigar(); - if( swCigar.numCigarElements() > 6 ) { // this bubble is too divergent from the reference + if( swCigar.numCigarElements() > MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW ) { // this bubble is too divergent from the reference returnCigar.add(new CigarElement(1, CigarOperator.N)); } else { for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) { @@ -399,9 +401,15 @@ class Path { */ public boolean equalScoreAndSequence(final Path other) { if ( other == null ) throw new IllegalArgumentException("other cannot be null"); + return getScore() == other.getScore() && equalSequence(other); + } - if ( getScore() != other.getScore() ) - return false; + /** + * Tests that this and other have the same vertices in the same order with the same seq + * @param other the other path to consider. Cannot be null + * @return true if this and path are equal, false otherwise + */ + public boolean equalSequence(final Path other) { final List mine = getVertices(); final List yours = other.getVertices(); if ( mine.size() == yours.size() ) { // hehehe From 66910b036c628e858defb8acde4ea26bfec31d0a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 25 Mar 2013 18:37:25 -0400 Subject: [PATCH 11/13] Added new and improved suffix and node merging algorithms -- These new algorithms are more powerful than the restricted diamond merging algoriths, in that they can merge nodes with multiple incoming and outgoing edges. Together the splitter + merger algorithms will correctly merge many more cases than the original headless and tailless diamond merger. -- Refactored haplotype caller infrastructure into graphs package, code cleanup -- Cleanup new merging / splitting algorithms, with proper docs and unit tests -- Fix bug in zipping of linear chains. Because the multiplicity can be 0, protect ourselves with a max function call -- Fix BaseEdge.max unit test -- Add docs and some more unit tests -- Move error correct from DeBruijnGraph to DeBruijnAssembler -- Replaced uses of System.out.println with logger.info -- Don't make multiplicity == 0 nodes look like they should be pruned -- Fix toString of Path --- .../haplotypecaller/DeBruijnAssembler.java | 49 ++++- .../haplotypecaller/GenotypingEngine.java | 20 +- .../haplotypecaller/HaplotypeCaller.java | 4 +- .../LikelihoodCalculationEngine.java | 6 +- .../{ => graphs}/BaseEdge.java | 6 +- .../{ => graphs}/BaseGraph.java | 25 +-- .../{ => graphs}/BaseGraphIterator.java | 2 +- .../{ => graphs}/BaseVertex.java | 2 +- .../graphs/CommonSuffixSplitter.java | 182 ++++++++++++++++++ .../{ => graphs}/DeBruijnGraph.java | 38 +--- .../{ => graphs}/DeBruijnVertex.java | 2 +- .../{ => graphs}/KBestPaths.java | 2 +- .../haplotypecaller/{ => graphs}/Path.java | 7 +- .../{ => graphs}/SeqGraph.java | 78 +++++++- .../{ => graphs}/SeqVertex.java | 3 +- .../graphs/SharedSequenceMerger.java | 138 +++++++++++++ .../SharedVertexSequenceSplitter.java | 47 +---- .../walkers/haplotypecaller/graphs/Utils.java | 138 +++++++++++++ .../DeBruijnAssemblerUnitTest.java | 1 + .../DeBruijnAssemblyGraphUnitTest.java | 1 + .../{ => graphs}/BaseEdgeUnitTest.java | 18 +- .../{ => graphs}/BaseGraphUnitTest.java | 61 +++++- .../{ => graphs}/BaseVertexUnitTest.java | 2 +- .../graphs/CommonSuffixMergerUnitTest.java | 160 +++++++++++++++ .../graphs/CommonSuffixSplitterUnitTest.java | 113 +++++++++++ .../{ => graphs}/DeBruijnVertexUnitTest.java | 2 +- .../{ => graphs}/KBestPathsUnitTest.java | 2 +- .../{ => graphs}/SeqGraphUnitTest.java | 48 ++--- .../{ => graphs}/SeqVertexUnitTest.java | 3 +- .../SharedVertexSequenceSplitterUnitTest.java | 6 +- 30 files changed, 1001 insertions(+), 165 deletions(-) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseEdge.java (98%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseGraph.java (97%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseGraphIterator.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseVertex.java (99%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/DeBruijnGraph.java (88%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/DeBruijnVertex.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/KBestPaths.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/Path.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SeqGraph.java (86%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SeqVertex.java (99%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SharedVertexSequenceSplitter.java (91%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseEdgeUnitTest.java (92%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseGraphUnitTest.java (86%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseVertexUnitTest.java (99%) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/DeBruijnVertexUnitTest.java (99%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/KBestPathsUnitTest.java (99%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SeqGraphUnitTest.java (97%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SeqVertexUnitTest.java (99%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SharedVertexSequenceSplitterUnitTest.java (98%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index e9961519c..198abeac8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -53,6 +53,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -160,7 +161,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); if ( shouldErrorCorrectKmers() ) { - graph = graph.errorCorrect(); + graph = errorCorrect(graph); if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), pruneFactor); } @@ -189,6 +190,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); // now go through and prune the graph, removing vertices no longer connected to the reference chain + // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight + // edges to maintain graph connectivity. seqGraph.pruneGraph(pruneFactor); seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); @@ -203,7 +206,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return null; seqGraph.removePathsNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.refcleaned.dot"), pruneFactor); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor); return seqGraph; } @@ -321,6 +324,39 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return true; } + /** + * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers + * @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason + * we cannot actually do the error correction + */ + public DeBruijnGraph errorCorrect(final DeBruijnGraph graph) { + final KMerErrorCorrector corrector = new KMerErrorCorrector(graph.getKmerSize(), 1, 1, 5); // TODO -- should be static variables + + for( final BaseEdge e : graph.edgeSet() ) { + for ( final byte[] kmer : Arrays.asList(graph.getEdgeSource(e).getSequence(), graph.getEdgeTarget(e).getSequence())) { + // TODO -- need a cleaner way to deal with the ref weight + corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity()); + } + } + + if ( corrector.computeErrorCorrectionMap() ) { + final DeBruijnGraph correctedGraph = new DeBruijnGraph(graph.getKmerSize()); + + for( final BaseEdge e : graph.edgeSet() ) { + final byte[] source = corrector.getErrorCorrectedKmer(graph.getEdgeSource(e).getSequence()); + final byte[] target = corrector.getErrorCorrectedKmer(graph.getEdgeTarget(e).getSequence()); + if ( source != null && target != null ) { + correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); + } + } + + return correctedGraph; + } else { + // the error correction wasn't possible, simply return this graph + return graph; + } + } + protected void printGraphs(final List graphs) { final int writeFirstGraphWithSizeSmallerThan = 50; @@ -366,6 +402,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { for( final SeqGraph graph : graphs ) { for ( final Path path : new KBestPaths().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { +// logger.info("Found path " + path); Haplotype h = new Haplotype( path.getBases() ); if( !returnHaplotypes.contains(h) ) { final Cigar cigar = path.calculateCigar(); @@ -421,13 +458,13 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if( debug ) { if( returnHaplotypes.size() > 1 ) { - System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); + logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); } else { - System.out.println("Found only the reference haplotype in the assembly graph."); + logger.info("Found only the reference haplotype in the assembly graph."); } for( final Haplotype h : returnHaplotypes ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); + logger.info( h.toString() ); + logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index cc9d94b1b..0d6e29fe9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -51,6 +51,7 @@ import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; @@ -66,6 +67,7 @@ import java.io.PrintStream; import java.util.*; public class GenotypingEngine { + private final static Logger logger = Logger.getLogger(GenotypingEngine.class); private final boolean DEBUG; private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; @@ -168,15 +170,15 @@ public class GenotypingEngine { // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file final TreeSet startPosKeySet = new TreeSet(); int count = 0; - if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } + if( DEBUG ) { logger.info("=== Best Haplotypes ==="); } for( final Haplotype h : haplotypes ) { // Walk along the alignment and turn any difference from the reference into an event h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); } if( DEBUG ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() ); - System.out.println( ">> Events = " + h.getEventMap()); + logger.info(h.toString()); + logger.info("> Cigar = " + h.getCigar()); + logger.info(">> Events = " + h.getEventMap()); } } @@ -261,7 +263,7 @@ public class GenotypingEngine { final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); if( DEBUG ) { - System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); + logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); //System.out.println("Event/haplotype allele mapping = " + alleleMapper); } @@ -500,9 +502,9 @@ public class GenotypingEngine { if( isBiallelic ) { final double R2 = calculateR2LD( Math.pow(10.0, x11), Math.pow(10.0, x12), Math.pow(10.0, x21), Math.pow(10.0, x22) ); if( DEBUG ) { - System.out.println("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2)); - System.out.println("-- " + thisVC); - System.out.println("-- " + nextVC); + logger.info("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2)); + logger.info("-- " + thisVC); + logger.info("-- " + nextVC); } if( R2 > MERGE_EVENTS_R2_THRESHOLD ) { @@ -528,7 +530,7 @@ public class GenotypingEngine { if(!containsStart) { startPosKeySet.remove(thisStart); } if(!containsNext) { startPosKeySet.remove(nextStart); } - if( DEBUG ) { System.out.println("====> " + mergedVC); } + if( DEBUG ) { logger.info("====> " + mergedVC); } mapWasUpdated = true; break; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 7e2211502..ca105fe03 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -581,7 +581,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem haplotypeBAMWriter.writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, bestHaplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap); } - if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } + if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } return 1; // One active region was processed during this map call } @@ -614,7 +614,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem //--------------------------------------------------------------------------------------------------------------- private void finalizeActiveRegion( final ActiveRegion activeRegion ) { - if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } + if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } final List finalizedReadList = new ArrayList(); final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() ); activeRegion.clearReads(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 87b488b3e..51483c53f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -62,6 +63,7 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.util.*; public class LikelihoodCalculationEngine { + private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class); private static final double LOG_ONE_HALF = -Math.log10(2.0); private final byte constantGCP; @@ -256,14 +258,14 @@ public class LikelihoodCalculationEngine { } } if( maxElement == Double.NEGATIVE_INFINITY ) { break; } - if( DEBUG ) { System.out.println("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); } + if( DEBUG ) { logger.info("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); } haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY; if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); } if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); } } - if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } + if( DEBUG ) { logger.info("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } final List bestHaplotypes = new ArrayList(); for( final int hIndex : bestHaplotypesIndexList ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java similarity index 98% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java index 07a6629d7..6076626f6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import java.io.Serializable; import java.util.Collection; @@ -69,7 +69,7 @@ public class BaseEdge { * @param multiplicity the number of observations of this edge */ public BaseEdge(final boolean isRef, final int multiplicity) { - if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0"); + if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0 but got " + multiplicity); this.multiplicity = multiplicity; this.isRef = isRef; @@ -176,7 +176,7 @@ public class BaseEdge { } /** - * Return a new edge that the max of this and edge + * Return a new edge whose multiplicity is the max of this and edge, and isRef is or of this and edge * * isRef is simply the or of this and edge * multiplicity is the max diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java similarity index 97% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index 9872a370b..1d294e591 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; @@ -53,7 +53,6 @@ import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.jgrapht.EdgeFactory; import org.jgrapht.graph.DefaultDirectedGraph; -import org.jgrapht.traverse.DepthFirstIterator; import java.io.File; import java.io.FileNotFoundException; @@ -221,15 +220,6 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); if( edge.isRef() ) { graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); } @@ -414,7 +404,12 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph edgesToRemove = new ArrayList(); for( final BaseEdge e : edgeSet() ) { if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor @@ -499,7 +494,7 @@ public class BaseGraph extends DefaultDirectedGraph toMerge = graph.incomingVerticesOf(v); + if ( toMerge.size() < 2 ) + // Can only split at least 2 vertices + return false; + else if ( ! safeToSplit(graph, v, toMerge) ) { + return false; + } else { + final SeqVertex suffixVTemplate = commonSuffix(toMerge); + if ( suffixVTemplate.isEmpty() ) { + return false; + } else { + final List edgesToRemove = new LinkedList(); + +// graph.printGraph(new File("split.pre_" + v.getSequenceString() + "." + counter + ".dot"), 0); + for ( final SeqVertex mid : toMerge ) { + // create my own copy of the suffix + final SeqVertex suffixV = new SeqVertex(suffixVTemplate.getSequence()); + graph.addVertex(suffixV); + final SeqVertex prefixV = mid.withoutSuffix(suffixV.getSequence()); + final BaseEdge out = graph.outgoingEdgeOf(mid); + + final SeqVertex incomingTarget; + if ( prefixV == null ) { + // this node is entirely explained by suffix + incomingTarget = suffixV; + } else { + incomingTarget = prefixV; + graph.addVertex(prefixV); + graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 0)); + edgesToRemove.add(out); + } + + graph.addEdge(suffixV, graph.getEdgeTarget(out), new BaseEdge(out)); + + for ( final BaseEdge in : graph.incomingEdgesOf(mid) ) { + graph.addEdge(graph.getEdgeSource(in), incomingTarget, new BaseEdge(in)); + edgesToRemove.add(in); + } + } + + graph.removeAllVertices(toMerge); + graph.removeAllEdges(edgesToRemove); +// graph.printGraph(new File("split.post_" + v.getSequenceString() + "." + counter++ + ".dot"), 0); + + return true; + } + } + } + +// private static int counter = 0; + + /** + * Can we safely split up the vertices in toMerge? + * + * @param graph a graph + * @param bot a vertex whose incoming vertices we want to split + * @param toMerge the set of vertices we'd be splitting up + * @return true if we can safely split up toMerge + */ + private boolean safeToSplit(final SeqGraph graph, final SeqVertex bot, final Collection toMerge) { + for ( final SeqVertex m : toMerge ) { + final Set outs = graph.outgoingEdgesOf(m); + if ( m == bot || outs.size() != 1 || ! graph.outgoingVerticesOf(m).contains(bot) ) + // m == bot => don't allow cycles in the graph + return false; + } + + return true; + } + + /** + * Return the longest suffix of bases shared among all provided vertices + * + * For example, if the vertices have sequences AC, CC, and ATC, this would return + * a single C. However, for ACC and TCC this would return CC. And for AC and TG this + * would return null; + * + * @param middleVertices a non-empty set of vertices + * @return a single vertex that contains the common suffix of all middle vertices + */ + @Requires("!middleVertices.isEmpty()") + protected static SeqVertex commonSuffix(final Collection middleVertices) { + final List kmers = Utils.getKmers(middleVertices); + final int min = Utils.minKmerLength(kmers); + final int suffixLen = Utils.compSuffixLen(kmers, min); + final byte[] kmer = kmers.get(0); + final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length); + return new SeqVertex(suffix); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java similarity index 88% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java index fd8581254..109598029 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java @@ -44,9 +44,10 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerErrorCorrector; import java.util.Arrays; import java.util.HashMap; @@ -88,39 +89,6 @@ public class DeBruijnGraph extends BaseGraph { } } - /** - * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers - * @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason - * we cannot actually do the error correction - */ - protected DeBruijnGraph errorCorrect() { - final KMerErrorCorrector corrector = new KMerErrorCorrector(getKmerSize(), 1, 1, 5); // TODO -- should be static variables - - for( final BaseEdge e : edgeSet() ) { - for ( final byte[] kmer : Arrays.asList(getEdgeSource(e).getSequence(), getEdgeTarget(e).getSequence())) { - // TODO -- need a cleaner way to deal with the ref weight - corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity()); - } - } - - if ( corrector.computeErrorCorrectionMap() ) { - final DeBruijnGraph correctedGraph = new DeBruijnGraph(getKmerSize()); - - for( final BaseEdge e : edgeSet() ) { - final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence()); - final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence()); - if ( source != null && target != null ) { - correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); - } - } - - return correctedGraph; - } else { - // the error correction wasn't possible, simply return this graph - return this; - } - } - /** * Add edge to assembly graph connecting the two kmers * @param kmer1 the source kmer for the edge @@ -168,7 +136,7 @@ public class DeBruijnGraph extends BaseGraph { * @return a newly allocated SequenceGraph */ @Ensures({"result != null"}) - protected SeqGraph convertToSequenceGraph() { + public SeqGraph convertToSequenceGraph() { final SeqGraph seqGraph = new SeqGraph(getKmerSize()); final Map vertexMap = new HashMap(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java index 0a2c26ca4..4d9441efe 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java index 0724729a8..1dc712c67 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.common.collect.MinMaxPriorityQueue; import com.google.java.contract.Ensures; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index 269adcc22..50ca91d41 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; @@ -67,7 +67,7 @@ import java.util.*; * Time: 2:34 PM * */ -class Path { +public class Path { private final static int MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW = 20; // the last vertex seen in the path @@ -163,8 +163,9 @@ class Path { boolean first = true; for ( final T v : getVertices() ) { if ( first ) { - b.append(" -> "); first = false; + } else { + b.append(" -> "); } b.append(v.getSequenceString()); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java similarity index 86% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index da24a06a4..400b5c7ee 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -44,10 +44,12 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.apache.commons.lang.ArrayUtils; +import java.io.File; +import java.util.HashSet; import java.util.Set; /** @@ -57,6 +59,7 @@ import java.util.Set; * @since 03/2013 */ public class SeqGraph extends BaseGraph { + private final static boolean PRINT_SIMPLIFY_GRAPHS = false; private final static int MIN_SUFFIX_TO_MERGE_TAILS = 5; /** @@ -97,9 +100,16 @@ public class SeqGraph extends BaseGraph { //logger.info("simplifyGraph iteration " + i); // iterate until we haven't don't anything useful didSomeWork = false; - //printGraph(new File("simplifyGraph." + i + ".dot"), 0); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".dot"), 0); didSomeWork |= new MergeDiamonds().transformUntilComplete(); didSomeWork |= new MergeTails().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".diamonds_and_tails.dot"), 0); + + didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".split_suffix.dot"), 0); + didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".merge_suffix.dot"), 0); + didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); didSomeWork |= zipLinearChains(); i++; @@ -109,7 +119,7 @@ public class SeqGraph extends BaseGraph { /** * Zip up all of the simple linear chains present in this graph. */ - protected boolean zipLinearChains() { + public boolean zipLinearChains() { boolean foundOne = false; while( zipOneLinearChain() ) { // just keep going until zipOneLinearChain says its done @@ -137,13 +147,16 @@ public class SeqGraph extends BaseGraph { final Set outEdges = outgoingEdgesOf(outgoingVertex); final Set inEdges = incomingEdgesOf(incomingVertex); + final BaseEdge singleOutEdge = outEdges.isEmpty() ? null : outEdges.iterator().next(); + final BaseEdge singleInEdge = inEdges.isEmpty() ? null : inEdges.iterator().next(); + if( inEdges.size() == 1 && outEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + singleInEdge.setMultiplicity( singleInEdge.getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + singleOutEdge.setMultiplicity( singleOutEdge.getMultiplicity() + ( e.getMultiplicity() / 2 ) ); } else if( inEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + singleInEdge.setMultiplicity( Math.max(singleInEdge.getMultiplicity() + ( e.getMultiplicity() - 1 ), 0) ); } else if( outEdges.size() == 1 ) { - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + singleOutEdge.setMultiplicity( Math.max( singleOutEdge.getMultiplicity() + ( e.getMultiplicity() - 1 ), 0) ); } final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) ); @@ -297,6 +310,57 @@ public class SeqGraph extends BaseGraph { } } + /** + * Merge headless configurations: + * + * Performs the transformation: + * + * { x + S_i + y -> Z } + * + * goes to: + * + * { x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. + * + * Differs from the diamond transform in that no top node is required + */ + protected class MergeCommonSuffices extends VertexBasedTransformer { + @Override + boolean tryToTransform(final SeqVertex bottom) { + return new SharedSequenceMerger().merge(SeqGraph.this, bottom); + } + } + + /** + * Merge headless configurations: + * + * Performs the transformation: + * + * { x + S_i + y -> Z } + * + * goes to: + * + * { x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. + * + * Differs from the diamond transform in that no top node is required + */ + protected class SplitCommonSuffices extends VertexBasedTransformer { + final Set alreadySplit = new HashSet(); + + @Override + boolean tryToTransform(final SeqVertex bottom) { + if ( alreadySplit.contains(bottom) ) + return false; + else { + alreadySplit.add(bottom); + return new CommonSuffixSplitter().split(SeqGraph.this, bottom); + } + } + } + /** * Merge headless configurations: * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java index 523312dcf..cfc2abfdc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java @@ -44,11 +44,10 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.Utils; - import java.util.Arrays; /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java new file mode 100644 index 000000000..1c53f2332 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java @@ -0,0 +1,138 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.apache.commons.lang.ArrayUtils; + +import java.util.*; + +/** + * Merges the incoming vertices of a vertex V of a graph + * + * Looks at the vertices that are incoming to V (i.e., have an outgoing edge connecting to V). If + * they all have the same sequence, merges them into the sequence of V, and updates the graph + * as appropriate + * + * User: depristo + * Date: 3/22/13 + * Time: 8:31 AM + */ +public class SharedSequenceMerger { + public SharedSequenceMerger() { } + + /** + * Attempt to merge the incoming vertices of v + * + * @param graph the graph containing the vertex v + * @param v the vertex whose incoming vertices we want to merge + * @return true if some useful merging was done, false otherwise + */ + public boolean merge(final SeqGraph graph, final SeqVertex v) { + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex " + v); + + final Set prevs = graph.incomingVerticesOf(v); + if ( ! canMerge(graph, v, prevs) ) + return false; + else { +// graph.printGraph(new File("csm." + counter + "." + v.getSequenceString() + "_pre.dot"), 0); + + final List edgesToRemove = new LinkedList(); + final byte[] prevSeq = prevs.iterator().next().getSequence(); + final SeqVertex newV = new SeqVertex(ArrayUtils.addAll(prevSeq, v.getSequence())); + graph.addVertex(newV); + + for ( final SeqVertex prev : prevs ) { + for ( final BaseEdge prevIn : graph.incomingEdgesOf(prev) ) { + graph.addEdge(graph.getEdgeSource(prevIn), newV, new BaseEdge(prevIn)); + edgesToRemove.add(prevIn); + } + } + + for ( final BaseEdge e : graph.outgoingEdgesOf(v) ) { + graph.addEdge(newV, graph.getEdgeTarget(e), new BaseEdge(e)); + } + + graph.removeAllVertices(prevs); + graph.removeVertex(v); + graph.removeAllEdges(edgesToRemove); + +// graph.printGraph(new File("csm." + counter++ + "." + v.getSequenceString() + "_post.dot"), 0); + + return true; + } + } + + //private static int counter = 0; + + /** + * Can we safely merge the incoming vertices of v + * + * @param graph the graph containing v and incomingVertices + * @param v the vertex we want to merge into + * @param incomingVertices the incoming vertices of v + * @return true if we can safely merge incomingVertices + */ + private boolean canMerge(final SeqGraph graph, final SeqVertex v, final Collection incomingVertices) { + if ( incomingVertices.isEmpty() ) + return false; + + final SeqVertex first = incomingVertices.iterator().next(); + for ( final SeqVertex prev : incomingVertices) { + if ( ! prev.seqEquals(first) ) + return false; + final Collection prevOuts = graph.outgoingVerticesOf(prev); + if ( prevOuts.size() != 1 ) + return false; + if ( prevOuts.iterator().next() != v ) + return false; + } + + return true; + } + +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java similarity index 91% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java index e0501da52..9834653a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; @@ -266,8 +266,8 @@ public class SharedVertexSequenceSplitter { min = Math.min(min, v.getSequence().length); } - final int prefixLen = compPrefixLen(kmers, min); - final int suffixLen = compSuffixLen(kmers, min - prefixLen); + final int prefixLen = Utils.compPrefixLen(kmers, min); + final int suffixLen = Utils.compSuffixLen(kmers, min - prefixLen); final byte[] kmer = kmers.get(0); final byte[] prefix = Arrays.copyOfRange(kmer, 0, prefixLen); @@ -275,47 +275,6 @@ public class SharedVertexSequenceSplitter { return new Pair(new SeqVertex(prefix), new SeqVertex(suffix)); } - /** - * Compute the maximum shared prefix length of list of bytes. - * - * @param listOfBytes a list of bytes with at least one element - * @param minLength the min. length among all byte[] in listOfBytes - * @return the number of shared bytes common at the start of all bytes - */ - @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) - @Ensures("result >= 0") - protected static int compPrefixLen(final List listOfBytes, final int minLength) { - for ( int i = 0; i < minLength; i++ ) { - final byte b = listOfBytes.get(0)[i]; - for ( int j = 1; j < listOfBytes.size(); j++ ) { - if ( b != listOfBytes.get(j)[i] ) - return i; - } - } - - return minLength; - } - - /** - * Compute the maximum shared suffix length of list of bytes. - * - * @param listOfBytes a list of bytes with at least one element - * @param minLength the min. length among all byte[] in listOfBytes - * @return the number of shared bytes common at the end of all bytes - */ - @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) - @Ensures("result >= 0") - protected static int compSuffixLen(final List listOfBytes, final int minLength) { - for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) { - final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1]; - for ( int j = 1; j < listOfBytes.size(); j++ ) { - if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] ) - return suffixLen; - } - } - return minLength; - } - /** * Helper function that returns an edge that we should use for splitting * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java new file mode 100644 index 000000000..8cb272925 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java @@ -0,0 +1,138 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Utility functions used in the graphs package + * + * User: depristo + * Date: 3/25/13 + * Time: 9:42 PM + */ +final class Utils { + private Utils() {} + + /** + * Compute the maximum shared prefix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. length among all byte[] in listOfBytes + * @return the number of shared bytes common at the start of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compPrefixLen(final List listOfBytes, final int minLength) { + for ( int i = 0; i < minLength; i++ ) { + final byte b = listOfBytes.get(0)[i]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[i] ) + return i; + } + } + + return minLength; + } + + /** + * Compute the maximum shared suffix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. length among all byte[] in listOfBytes + * @return the number of shared bytes common at the end of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compSuffixLen(final List listOfBytes, final int minLength) { + for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) { + final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] ) + return suffixLen; + } + } + return minLength; + } + + /** + * Get the list of kmers as byte[] from the vertices in the graph + * + * @param vertices a collection of vertices + * @return a list of their kmers in order of the iterator on vertices + */ + protected static List getKmers(final Collection vertices) { + final List kmers = new ArrayList(vertices.size()); + for ( final SeqVertex v : vertices ) { + kmers.add(v.getSequence()); + } + return kmers; + } + + /** + * Get the minimum length of a collection of byte[] + * + * @param kmers a list of kmers whose .length min we want + * @return the min of the kmers, if kmers is empty the result is 0 + */ + protected static int minKmerLength(final Collection kmers) { + if ( kmers == null ) throw new IllegalArgumentException("kmers cannot be null"); + + if ( kmers.isEmpty() ) return 0; + int min = Integer.MAX_VALUE; + for ( final byte[] kmer : kmers ) { + min = Math.min(min, kmer.length); + } + return min; + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index cce623b76..86d331dae 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -56,6 +56,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java index 2b87cf61d..a13618dae 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java similarity index 92% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java index 3cc44c7de..7df6ee6c8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -102,4 +102,20 @@ public class BaseEdgeUnitTest extends BaseTest { Assert.assertEquals(edges.get(2), e2); Assert.assertEquals(edges.get(3), e1); } + + @Test + public void testMax() { + for ( final boolean firstIsRef : Arrays.asList(true, false) ) { + for ( final boolean secondIsRef : Arrays.asList(true, false) ) { + for ( final int firstMulti : Arrays.asList(1, 4) ) { + for ( final int secondMulti : Arrays.asList(2, 3) ) { + final BaseEdge expected = new BaseEdge(firstIsRef || secondIsRef, Math.max(firstMulti, secondMulti)); + final BaseEdge actual = new BaseEdge(firstIsRef, firstMulti).max(new BaseEdge(secondIsRef, secondMulti)); + Assert.assertEquals(actual.getMultiplicity(), expected.getMultiplicity()); + Assert.assertEquals(actual.isRef(), expected.isRef()); + } + } + } + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java similarity index 86% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java index db4127ddb..9737f72f5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -160,7 +160,7 @@ public class BaseGraphUnitTest extends BaseTest { graph.addEdges(g3, b7, b8, b7); // cycle is bad graph.addEdges(g3, b9, b9); // self-cycle is bad - final boolean debug = true; + final boolean debug = false; if ( debug ) good.printGraph(new File("expected.dot"), 0); if ( debug ) graph.printGraph(new File("bad.dot"), 0); graph.removePathsNotConnectedToRef(); @@ -169,6 +169,63 @@ public class BaseGraphUnitTest extends BaseTest { Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); } + @Test + public void testRemoveVerticesNotConnectedToRefRegardlessOfEdgeDirection() throws Exception { + final SeqGraph graph = new SeqGraph(); + + SeqVertex src = new SeqVertex("A"); + SeqVertex end = new SeqVertex("A"); + SeqVertex g1 = new SeqVertex("C"); + SeqVertex g2 = new SeqVertex("G"); + SeqVertex g3 = new SeqVertex("T"); + SeqVertex g4 = new SeqVertex("AA"); + SeqVertex g5 = new SeqVertex("AA"); + SeqVertex g6 = new SeqVertex("AA"); + SeqVertex g8 = new SeqVertex("AA"); + SeqVertex g7 = new SeqVertex("AA"); + SeqVertex gPrev = new SeqVertex("AA"); + SeqVertex gPrev1 = new SeqVertex("AA"); + SeqVertex gPrev2 = new SeqVertex("AA"); + SeqVertex gAfter = new SeqVertex("AA"); + SeqVertex gAfter1 = new SeqVertex("AA"); + SeqVertex gAfter2 = new SeqVertex("AA"); + SeqVertex b1 = new SeqVertex("CC"); + SeqVertex b2 = new SeqVertex("GG"); + SeqVertex b3 = new SeqVertex("TT"); + SeqVertex b4 = new SeqVertex("AAA"); + SeqVertex b5 = new SeqVertex("CCC"); + SeqVertex b6 = new SeqVertex("GGG"); + + graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8, gPrev, gPrev1, gPrev2, gAfter, gAfter1, gAfter2); + graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end); + graph.addEdges(src, g1, g5, g6, g7, end); + graph.addEdges(src, g1, g5, g8, g7, end); + graph.addEdges(src, g1, g3, end); + + // these should be kept, but are in the wrong direction + graph.addEdges(gPrev, src); + graph.addEdges(gPrev1, gPrev2, src); + graph.addEdges(end, gAfter); + graph.addEdges(end, gAfter1, gAfter2); + + // the current state of the graph is the good one + final SeqGraph good = (SeqGraph)graph.clone(); + + // now add the bads to the graph + graph.addVertices(b1, b2, b3, b4, b5, b6); + graph.addEdges(b2, b3); // b2 -> b3 + graph.addEdges(b4, b5, b4); // cycle + graph.addEdges(b6, b6); // isolated self cycle + + final boolean debug = false; + if ( debug ) good.printGraph(new File("expected.dot"), 0); + if ( debug ) graph.printGraph(new File("bad.dot"), 0); + graph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); + if ( debug ) graph.printGraph(new File("actual.dot"), 0); + + Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); + } + @Test public void testPrintEmptyGraph() throws Exception { final File tmp = File.createTempFile("tmp", "dot"); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java index 8f682d474..859892e33 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java new file mode 100644 index 000000000..012add769 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java @@ -0,0 +1,160 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class CommonSuffixMergerUnitTest extends BaseTest { + private final static boolean PRINT_GRAPHS = true; + + @DataProvider(name = "CompleteCycleData") + public Object[][] makeCompleteCycleData() { + return makeSplitMergeData(-1); + } + + public static class SplitMergeData { + final SeqGraph graph; + final SeqVertex v; + final String commonSuffix; + + public SplitMergeData(SeqGraph graph, SeqVertex v, String commonSuffix) { + this.graph = graph; + this.v = v; + this.commonSuffix = commonSuffix; + } + } + + public static Object[][] makeSplitMergeData(final int maxTests) { + List tests = new ArrayList(); + + final List bases = Arrays.asList("A", "C", "G", "T"); + for ( final String commonSuffix : Arrays.asList("", "A", "AT") ) { + for ( final int nBots : Arrays.asList(0, 1, 2) ) { + for ( final int nMids : Arrays.asList(1, 2, 3) ) { + for ( int nTops = 0; nTops < nMids; nTops++ ) { + for ( int nTopConnections = 1; nTopConnections <= nMids; nTopConnections++ ) { + int multi = 1; + final SeqGraph graph = new SeqGraph(); + final SeqVertex v = new SeqVertex("GGGG"); + graph.addVertex(v); + + final LinkedList tops = new LinkedList(); + final LinkedList mids = new LinkedList(); + + for ( int i = 0; i < nMids; i++) { + final SeqVertex mid = new SeqVertex(bases.get(i) + commonSuffix); + graph.addVertex(mid); + graph.addEdge(mid, v, new BaseEdge(i == 0, multi++)); + mids.add(mid); + + tops.add(new SeqVertex(bases.get(i))); + } + + graph.addVertices(tops); + for ( final SeqVertex t : tops ) { + for ( int i = 0; i < nTopConnections; i++ ) { + graph.addEdge(t, mids.get(i), new BaseEdge(i == 0, multi++)); + } + } + + for ( int i = 0; i < nBots; i++ ) { + final SeqVertex bot = new SeqVertex(bases.get(i)); + graph.addVertex(bot); + graph.addEdge(v, bot, new BaseEdge(i == 0, multi++)); + + } + + tests.add(new Object[]{new SplitMergeData(graph, v, commonSuffix)}); + } + } + } + } + } + + final List toUse = maxTests == -1 ? tests : tests.subList(0, Math.min(tests.size(), maxTests)); + return toUse.toArray(new Object[][]{}); + } + + public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) { + try { + final Set haplotypes = new HashSet(); + final List> originalPaths = new KBestPaths().getKBestPaths(original); + for ( final Path path : originalPaths ) + haplotypes.add(new String(path.getBases())); + + final List> splitPaths = new KBestPaths().getKBestPaths(actual); + for ( final Path path : splitPaths ) { + final String h = new String(path.getBases()); + Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); + } + + if ( splitPaths.size() == originalPaths.size() ) { + for ( int i = 0; i < originalPaths.size(); i++ ) { + Assert.assertTrue(splitPaths.get(i).equalSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i)); + } + } + } catch ( AssertionError e ) { + if ( PRINT_GRAPHS ) original.printGraph(new File(String.format("%s.original.dot", name, actual.vertexSet().size())), 0); + if ( PRINT_GRAPHS ) actual.printGraph(new File(String.format("%s.actual.dot", name, actual.vertexSet().size())), 0); + throw e; + } + } + + @Test(dataProvider = "CompleteCycleData") + public void testMerging(final SplitMergeData data) { + final SeqGraph original = (SeqGraph)data.graph.clone(); + final SharedSequenceMerger splitter = new SharedSequenceMerger(); + splitter.merge(data.graph, data.v); + assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java new file mode 100644 index 000000000..f03dc8762 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java @@ -0,0 +1,113 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class CommonSuffixSplitterUnitTest extends BaseTest { + @DataProvider(name = "SplitData") + public Object[][] makeSplitData() { + return CommonSuffixMergerUnitTest.makeSplitMergeData(-1); + } + + @Test(dataProvider = "SplitData") + public void testSplit(final CommonSuffixMergerUnitTest.SplitMergeData data) { + final boolean expectedMerge = ! data.commonSuffix.isEmpty() && data.graph.inDegreeOf(data.v) > 1; + + final SeqGraph original = (SeqGraph)data.graph.clone(); +// original.printGraph(new File("original.dot"), 0); + final CommonSuffixSplitter splitter = new CommonSuffixSplitter(); + final boolean succeed = splitter.split(data.graph, data.v); +// data.graph.printGraph(new File("actual.dot"), 0); + Assert.assertEquals(succeed, expectedMerge, "Not excepted merge success/fail result"); + if ( succeed ) { + Assert.assertEquals(data.graph.incomingVerticesOf(data.v).iterator().next().getSequenceString(), data.commonSuffix, "Common suffix not computed correctly"); + } + + CommonSuffixMergerUnitTest.assertSameHaplotypes(String.format("suffixSplit.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original); + } + + @Test + public void testSplitPrevHaveMultipleEdges() { + final SeqGraph original = new SeqGraph(); + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("A"); + final SeqVertex v3 = new SeqVertex("A"); + final SeqVertex v4 = new SeqVertex("A"); + + original.addVertices(v1, v2, v3, v4); + original.addEdges(v1, v3); + + Assert.assertFalse(new CommonSuffixSplitter().split(original, v3), "Cannot split graph with only one vertex"); + + original.addEdges(v2, v3); + original.addEdges(v2, v4); + + Assert.assertFalse(new CommonSuffixSplitter().split(original, v3), "Cannot split graph with multiple outgoing edges from middle nodes"); + } + + @Test + public void testSplitNoCycles() { + final SeqGraph original = new SeqGraph(); + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("C"); + final SeqVertex v3 = new SeqVertex("C"); + final SeqVertex v4 = new SeqVertex("G"); + + original.addVertices(v1, v2, v3, v4); + original.addEdges(v1, v3, v4); + original.addEdges(v1, v2, v4); + + Assert.assertTrue(new CommonSuffixSplitter().split((SeqGraph)original.clone(), v4), "Should be able to split pre-cycle graph"); + + original.addEdges(v4, v4); + Assert.assertFalse(new CommonSuffixSplitter().split(original, v4), "Cannot split graph with a cycle of the bottom list"); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java index dfbe50668..bdc8ab36d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.annotations.Test; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java index 34b4ba912..d20a0f778 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java similarity index 97% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index 6b6826e45..cbd7b1063 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -1,50 +1,50 @@ /* * By downloading the PROGRAM you agree to the following terms of use: -* +* * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* +* * This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* +* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* +* * 1. DEFINITIONS * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* +* * 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. * Copyright 2012 Broad Institute, Inc. * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* +* * 4. INDEMNIFICATION * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* +* * 5. NO REPRESENTATIONS OR WARRANTIES * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. * IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* +* * 6. ASSIGNMENT * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* +* * 7. MISCELLANEOUS * 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. * 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -315,12 +315,16 @@ public class SeqGraphUnitTest extends BaseTest { public void testMerging(final SeqGraph graph, final SeqGraph expected) { final SeqGraph merged = (SeqGraph)graph.clone(); merged.simplifyGraph(1); -// if ( ! SeqGraph.graphEquals(merged, expected) ) { -// graph.printGraph(new File("graph.dot"), 0); -// merged.printGraph(new File("merged.dot"), 0); -// expected.printGraph(new File("expected.dot"), 0); -// } - Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); + try { + Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); + } catch (AssertionError e) { +// if ( ! SeqGraph.graphEquals(merged, expected) ) { +// graph.printGraph(new File("graph.dot"), 0); +// merged.printGraph(new File("merged.dot"), 0); +// expected.printGraph(new File("expected.dot"), 0); +// } + throw e; + } } // A -> ACT -> C [non-ref] diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java index ca38351cc..eab9dfc27 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -52,7 +52,6 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; public class SeqVertexUnitTest extends BaseTest { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java similarity index 98% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java index 52ab36064..77857c367 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; @@ -98,10 +98,10 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { min = Math.min(min, s.length()); } - final int actualPrefixLen = SharedVertexSequenceSplitter.compPrefixLen(bytes, min); + final int actualPrefixLen = org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Utils.compPrefixLen(bytes, min); Assert.assertEquals(actualPrefixLen, expectedPrefixLen, "Failed prefix test"); - final int actualSuffixLen = SharedVertexSequenceSplitter.compSuffixLen(bytes, min - actualPrefixLen); + final int actualSuffixLen = org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Utils.compSuffixLen(bytes, min - actualPrefixLen); Assert.assertEquals(actualSuffixLen, expectedSuffixLen, "Failed suffix test"); } From 197d149495901a08ba07e87d57ed1af656af8bce Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 26 Mar 2013 21:00:04 -0400 Subject: [PATCH 12/13] Increase the maxNumHaplotypesInPopulation to 25 -- A somewhat arbitrary increase, and will need some evaluation but necessary to get good results on the AFR integrationtest. --- .../haplotypecaller/HaplotypeCaller.java | 21 ++++++++++++++++--- .../LikelihoodCalculationEngine.java | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index ca105fe03..c379b34dc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -204,7 +204,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Advanced @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) - protected int maxNumHaplotypesInPopulation = 13; + protected int maxNumHaplotypesInPopulation = 25; @Advanced @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false) @@ -557,8 +557,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) - final List bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? - likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation ) : haplotypes ); + final List bestHaplotypes = selectBestHaplotypesForGenotyping(haplotypes, stratifiedReadMap); final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, bestHaplotypes, @@ -586,6 +585,22 @@ public class HaplotypeCaller extends ActiveRegionWalker implem return 1; // One active region was processed during this map call } + /** + * Select the best N haplotypes according to their likelihoods, if appropriate + * + * @param haplotypes a list of haplotypes to consider + * @param stratifiedReadMap a map from samples -> read likelihoods + * @return the list of haplotypes to genotype + */ + protected List selectBestHaplotypesForGenotyping(final List haplotypes, final Map stratifiedReadMap) { + // TODO -- skip this calculation if the list of haplotypes is of size 2 (as we'll always use 2 for genotyping) + if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + return haplotypes; + } else { + return likelihoodCalculationEngine.selectBestHaplotypesFromPooledLikelihoods(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation); + } + } + //--------------------------------------------------------------------------------------------------------------- // // reduce diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 51483c53f..5eaaba0dd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -231,7 +231,7 @@ public class LikelihoodCalculationEngine { @Requires({"haplotypes.size() > 0"}) @Ensures({"result.size() <= haplotypes.size()"}) - public List selectBestHaplotypes( final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation ) { + public List selectBestHaplotypesFromPooledLikelihoods(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { final int numHaplotypes = haplotypes.size(); final Set sampleKeySet = stratifiedReadMap.keySet(); From fde7d36926b1ace076d54a01086714c25ab704ed Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 26 Mar 2013 22:07:25 -0400 Subject: [PATCH 13/13] Updating md5s due to changes in assembly graph creation algorithms and default parameter --- ...omplexAndSymbolicVariantsIntegrationTest.java | 8 ++++---- .../HaplotypeCallerIntegrationTest.java | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 830152903..3aaffdeaa 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6dd29d6fec056419ab0fa03a7d43d85e"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f9fa4d3c88fd9c0f23c7a3ddd3d24a8c"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -75,7 +75,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa // TODO -- need a better symbolic allele test @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "8225fb59b9fcbe767a473c9eb8b21537"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "e746a38765298acd716194aee4d93554"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -87,12 +87,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "84616464aed68f4d9bc9e08472eff9c0"); + "e8ffbfae3c1af5be02631a31f386a431"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "e2d1023b846bfac31b4f7a3a4b90d931"); + "c3a98b19efa7cb36fe5f5f2ab893ef56"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 1b98b2239..c5614d405 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -69,12 +69,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "9859b136d05085b5ec0833035289106a"); + HCTest(CEUTRIO_BAM, "", "45856ad67bfe8d8bea45808d8258bcf1"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "27f660bf1c9a6ed7167d77022d401b73"); + HCTest(NA12878_BAM, "", "b6c93325f851ac358ea49260fb11b75c"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "e25fc2196401a16347e0c730dbcbe828"); + "4ca6b560d0569cdca400d3e50915e211"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "325d7d73e0bd86b6cb146b249eda959a"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5d06ec5502d3f157964bd7b275d6a0cb"); } @Test @@ -111,14 +111,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("53a50dae68f0175ca3088dea1d3bb881")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec97a0a65890169358842e765ff8dd15")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("d3bc6adde8cd9514ae5c49cd366d5de4")); executeTest("HCTestStructuralIndels: ", spec); } @@ -140,7 +140,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("5280f1a50ca27d8e435da0bd5b26ae93")); + Arrays.asList("4adb833ed8af20224b76bba61e2b0d93")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -148,7 +148,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("addceb63f5bfa9f11e15335d5bf641e9")); + Arrays.asList("1704b0901c86f8f597d931222d5c8dd8")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } }