HaplotypeCaller now uses SeqGraph instead of kmer graph to build haplotypes.

-- DeBruijnAssembler functions are no longer static.  This isn't the right way to unit test your code
-- An a HaplotypeCaller command line option to use low-quality bases in the assembly
-- Refactored DeBruijnGraph and associated libraries into base class
-- Refactored out BaseEdge, BaseGraph, and BaseVertex from DeBruijn equivalents.  These DeBruijn versions now inherit from these base classes.  Added some reasonable unit tests for the base and Debruijn edges and vertex classes.
-- SeqVertex: allows multiple vertices in the sequence graph to have the same sequence and yet be distinct
-- Further refactoring of DeBruijnAssembler in preparation for the full SeqGraph <-> DeBruijnGraph split
-- Moved generic methods in DeBruijnAssembler into BaseGraph
-- Created a simple SeqGraph that contains SeqVertex objects
-- Simple chain zipper for SeqGraph that reproduces the results for the mergeNode function on DeBruijnGraphs
-- A working version of the diamond remodeling algorithm in SeqGraph that converts graphs that look like A -> Xa, A -> Ya, Xa -> Z, Ya -> Z into A -> X -> a, A -Y -> a, a -> Z
-- Allow SeqGraph zip merging of vertices where the in vertex has multiple incoming edges or the out vertex has multiple outgoing edges
-- Fix all unit tests so they work with the new SeqGraph system.  All tests passed without modification.
-- Debugging makes it easier to tell which kmer graph contributes to a haplotype
-- Better docs and unit tests for BaseVertex, SeqVertex, BaseEdge, and KMerErrorCorrector
-- Remove unnecessary printing of cleaning info in BaseGraph
-- Turn off kmer graph creation in DeBruijnAssembler.java
-- Only print SeqGraphs when debugGraphTransformations is set to true
-- Rename DeBruijnGraphUnitTest to SeqGraphUnitTest.  Now builds DeBruijnGraph, converts to SeqGraph, uses SeqGraph.mergenodes and tests for equality.
-- Update KBestPathsUnitTest to use SeqGraphs not DebruijnGraphs
-- DebruijnVertex now longer takes kmer argument -- it's implicit that the kmer length is the sequence.length now
This commit is contained in:
Mark DePristo 2013-03-14 10:03:04 -04:00
parent 0f4328f6fe
commit 98c4cd060d
22 changed files with 1964 additions and 732 deletions

View File

@ -46,68 +46,94 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.jgrapht.graph.DefaultDirectedGraph;
import java.io.Serializable;
import java.util.Comparator;
/**
* Created by IntelliJ IDEA.
* simple edge class for connecting nodes in the graph
*
* Works equally well for all graph types (kmer or sequence)
*
* User: ebanks
* Date: Mar 23, 2011
*/
// simple edge class for connecting nodes in the graph
public class DeBruijnEdge {
public class BaseEdge {
private int multiplicity;
private boolean isRef;
public DeBruijnEdge() {
multiplicity = 1;
isRef = false;
}
/**
* Create a new BaseEdge with weight multiplicity and, if isRef == true, indicates a path through the reference
*
* @param isRef indicates whether this edge is a path through the reference
* @param multiplicity the number of observations of this edge
*/
public BaseEdge(final boolean isRef, final int multiplicity) {
if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0");
public DeBruijnEdge( final boolean isRef ) {
multiplicity = 1;
this.isRef = isRef;
}
public DeBruijnEdge( final boolean isRef, final int multiplicity ) {
this.multiplicity = multiplicity;
this.isRef = isRef;
}
/**
* Copy constructor
*
* @param toCopy
*/
public BaseEdge(final BaseEdge toCopy) {
this(toCopy.isRef(), toCopy.getMultiplicity());
}
/**
* Get the number of observations of paths connecting two vertices
* @return a positive integer >= 0
*/
public int getMultiplicity() {
return multiplicity;
}
/**
* Set the multiplicity of this edge to value
* @param value an integer >= 0
*/
public void setMultiplicity( final int value ) {
if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0");
multiplicity = value;
}
/**
* Does this edge indicate a path through the reference graph?
* @return true if so
*/
public boolean isRef() {
return isRef;
}
/**
* Indicate that this edge follows the reference sequence, or not
* @param isRef true if this is a reference edge
*/
public void setIsRef( final boolean isRef ) {
this.isRef = isRef;
}
// For use when comparing edges pulled from the same graph
public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge ) {
public <T extends BaseVertex> boolean equals( final BaseGraph<T> graph, final BaseEdge edge ) {
return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge)));
}
// For use when comparing edges across graphs!
public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge, final DeBruijnAssemblyGraph graph2 ) {
public <T extends BaseVertex> boolean equals( final BaseGraph<T> graph, final BaseEdge edge, final BaseGraph<T> graph2 ) {
return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge)));
}
public static class EdgeWeightComparator implements Comparator<DeBruijnEdge>, Serializable {
/**
* Sorts a collection of BaseEdges in decreasing order of weight, so that the most
* heavily weighted is at the start of the list
*/
public static class EdgeWeightComparator implements Comparator<BaseEdge>, Serializable {
@Override
public int compare(final DeBruijnEdge edge1, final DeBruijnEdge edge2) {
return edge1.multiplicity - edge2.multiplicity;
public int compare(final BaseEdge edge1, final BaseEdge edge2) {
return edge2.multiplicity - edge1.multiplicity;
}
}
}

View File

@ -49,13 +49,15 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.jgrapht.EdgeFactory;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.traverse.DepthFirstIterator;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.*;
/**
* Created with IntelliJ IDEA.
@ -63,44 +65,37 @@ import java.util.Arrays;
* Date: 2/6/13
*/
public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> {
private final static Logger logger = Logger.getLogger(DeBruijnAssemblyGraph.class);
public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, BaseEdge> {
protected final static Logger logger = Logger.getLogger(BaseGraph.class);
private final int kmerSize;
/**
* Construct a DeBruijnAssemblyGraph with kmerSize
* @param kmerSize
* Construct an empty BaseGraph
*/
public DeBruijnAssemblyGraph(final int kmerSize) {
super(DeBruijnEdge.class);
if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize);
this.kmerSize = kmerSize;
}
public static DeBruijnAssemblyGraph parse(final int kmerSize, final int multiplicity, final String ... reads) {
final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(kmerSize);
for ( final String read : reads ) {
final int kmersInSequence = read.length() - kmerSize + 1;
for (int i = 0; i < kmersInSequence - 1; i++) {
// get the kmers
final byte[] kmer1 = new byte[kmerSize];
System.arraycopy(read.getBytes(), i, kmer1, 0, kmerSize);
final byte[] kmer2 = new byte[kmerSize];
System.arraycopy(read.getBytes(), i+1, kmer2, 0, kmerSize);
graph.addKmersToGraph(kmer1, kmer2, false, multiplicity);
}
}
return graph;
public BaseGraph() {
this(11);
}
/**
* Test construct that makes DeBruijnAssemblyGraph assuming a kmerSize of 11
* Edge factory that creates non-reference multiplicity 1 edges
* @param <T> the new of our vertices
*/
protected DeBruijnAssemblyGraph() {
this(11);
private static class MyEdgeFactory<T extends BaseVertex> implements EdgeFactory<T, BaseEdge> {
@Override
public BaseEdge createEdge(T sourceVertex, T targetVertex) {
return new BaseEdge(false, 1);
}
}
/**
* Construct a DeBruijnGraph with kmerSize
* @param kmerSize
*/
public BaseGraph(final int kmerSize) {
super(new MyEdgeFactory<T>());
if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize);
this.kmerSize = kmerSize;
}
/**
@ -115,9 +110,9 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
* @param v the vertex to test
* @return true if this vertex is a reference node (meaning that it appears on the reference path in the graph)
*/
public boolean isReferenceNode( final DeBruijnVertex v ) {
public boolean isReferenceNode( final T v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
for( final DeBruijnEdge e : edgesOf(v) ) {
for( final BaseEdge e : edgesOf(v) ) {
if( e.isRef() ) { return true; }
}
return false;
@ -127,7 +122,7 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
* @param v the vertex to test
* @return true if this vertex is a source node (in degree == 0)
*/
public boolean isSource( final DeBruijnVertex v ) {
public boolean isSource( final T v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
return inDegreeOf(v) == 0;
}
@ -136,7 +131,7 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
* @param v the vertex to test
* @return true if this vertex is a sink node (out degree == 0)
*/
public boolean isSink( final DeBruijnVertex v ) {
public boolean isSink( final T v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
return outDegreeOf(v) == 0;
}
@ -147,18 +142,18 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
* @return non-null byte array
*/
@Ensures({"result != null"})
public byte[] getAdditionalSequence( final DeBruijnVertex v ) {
public byte[] getAdditionalSequence( final T v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to pull sequence from a null vertex."); }
return ( isSource(v) ? v.getSequence() : v.getSuffix() );
return v.getAdditionalSequence(isSource(v));
}
/**
* @param e the edge to test
* @return true if this edge is a reference source edge
*/
public boolean isRefSource( final DeBruijnEdge e ) {
public boolean isRefSource( final BaseEdge e ) {
if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
for( final DeBruijnEdge edgeToTest : incomingEdgesOf(getEdgeSource(e)) ) {
for( final BaseEdge edgeToTest : incomingEdgesOf(getEdgeSource(e)) ) {
if( edgeToTest.isRef() ) { return false; }
}
return true;
@ -168,9 +163,9 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
* @param v the vertex to test
* @return true if this vertex is a reference source
*/
public boolean isRefSource( final DeBruijnVertex v ) {
public boolean isRefSource( final T v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
for( final DeBruijnEdge edgeToTest : incomingEdgesOf(v) ) {
for( final BaseEdge edgeToTest : incomingEdgesOf(v) ) {
if( edgeToTest.isRef() ) { return false; }
}
return true;
@ -180,9 +175,9 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
* @param e the edge to test
* @return true if this edge is a reference sink edge
*/
public boolean isRefSink( final DeBruijnEdge e ) {
public boolean isRefSink( final BaseEdge e ) {
if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(getEdgeTarget(e)) ) {
for( final BaseEdge edgeToTest : outgoingEdgesOf(getEdgeTarget(e)) ) {
if( edgeToTest.isRef() ) { return false; }
}
return true;
@ -192,9 +187,9 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
* @param v the vertex to test
* @return true if this vertex is a reference sink
*/
public boolean isRefSink( final DeBruijnVertex v ) {
public boolean isRefSink( final T v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(v) ) {
for( final BaseEdge edgeToTest : outgoingEdgesOf(v) ) {
if( edgeToTest.isRef() ) { return false; }
}
return true;
@ -203,8 +198,8 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
/**
* @return the reference source vertex pulled from the graph, can be null if it doesn't exist in the graph
*/
public DeBruijnVertex getReferenceSourceVertex( ) {
for( final DeBruijnVertex v : vertexSet() ) {
public T getReferenceSourceVertex( ) {
for( final T v : vertexSet() ) {
if( isReferenceNode(v) && isRefSource(v) ) {
return v;
}
@ -215,8 +210,8 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
/**
* @return the reference sink vertex pulled from the graph, can be null if it doesn't exist in the graph
*/
public DeBruijnVertex getReferenceSinkVertex( ) {
for( final DeBruijnVertex v : vertexSet() ) {
public T getReferenceSinkVertex( ) {
for( final T v : vertexSet() ) {
if( isReferenceNode(v) && isRefSink(v) ) {
return v;
}
@ -229,9 +224,9 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
* @param v the current vertex, can be null
* @return the next reference vertex if it exists
*/
public DeBruijnVertex getNextReferenceVertex( final DeBruijnVertex v ) {
public T getNextReferenceVertex( final T v ) {
if( v == null ) { return null; }
for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(v) ) {
for( final BaseEdge edgeToTest : outgoingEdgesOf(v) ) {
if( edgeToTest.isRef() ) {
return getEdgeTarget(edgeToTest);
}
@ -244,9 +239,9 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
* @param v the current vertex, can be null
* @return the previous reference vertex if it exists
*/
public DeBruijnVertex getPrevReferenceVertex( final DeBruijnVertex v ) {
public T getPrevReferenceVertex( final T v ) {
if( v == null ) { return null; }
for( final DeBruijnEdge edgeToTest : incomingEdgesOf(v) ) {
for( final BaseEdge edgeToTest : incomingEdgesOf(v) ) {
if( isReferenceNode(getEdgeSource(edgeToTest)) ) {
return getEdgeSource(edgeToTest);
}
@ -260,8 +255,8 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
* @param toVertex to this vertex, can be null
* @return true if a reference path exists in the graph between the two vertices
*/
public boolean referencePathExists(final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex) {
DeBruijnVertex v = fromVertex;
public boolean referencePathExists(final T fromVertex, final T toVertex) {
T v = fromVertex;
if( v == null ) {
return false;
}
@ -286,12 +281,12 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
* @param includeStop should the ending vertex be included in the path
* @return byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example)
*/
public byte[] getReferenceBytes( final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex, final boolean includeStart, final boolean includeStop ) {
public byte[] getReferenceBytes( final T fromVertex, final T toVertex, final boolean includeStart, final boolean includeStop ) {
if( fromVertex == null ) { throw new IllegalArgumentException("Starting vertex in requested path cannot be null."); }
if( toVertex == null ) { throw new IllegalArgumentException("From vertex in requested path cannot be null."); }
byte[] bytes = null;
DeBruijnVertex v = fromVertex;
T v = fromVertex;
if( includeStart ) {
bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v));
}
@ -306,86 +301,41 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
return bytes;
}
/**
* Pull kmers out of the given long sequence and throw them on in the graph
* @param sequence byte array holding the sequence with which to build the assembly graph
* @param KMER_LENGTH the desired kmer length to use
* @param isRef if true the kmers added to the graph will have reference edges linking them
*/
public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) {
if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); }
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef, 1);
}
}
protected DeBruijnAssemblyGraph errorCorrect() {
final KMerErrorCorrector corrector = new KMerErrorCorrector(getKmerSize(), 1, 1, 5); // TODO -- should be static variables
for( final DeBruijnEdge e : edgeSet() ) {
for ( final byte[] kmer : Arrays.asList(getEdgeSource(e).getSequence(), getEdgeTarget(e).getSequence())) {
// TODO -- need a cleaner way to deal with the ref weight
corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity());
}
}
corrector.computeErrorCorrectionMap();
//logger.info(corrector);
final DeBruijnAssemblyGraph correctedGraph = new DeBruijnAssemblyGraph(getKmerSize());
for( final DeBruijnEdge e : edgeSet() ) {
final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence());
final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence());
if ( source != null && target != null ) {
correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity());
}
}
return correctedGraph;
}
/**
* Add edge to assembly graph connecting the two kmers
* @param kmer1 the source kmer for the edge
* @param kmer2 the target kmer for the edge
* @param isRef true if the added edge is a reference edge
* @return will return false if trying to add a reference edge which creates a cycle in the assembly graph
*/
public boolean addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) {
if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); }
final int numVertexBefore = vertexSet().size();
final DeBruijnVertex v1 = new DeBruijnVertex( kmer1, kmer1.length );
addVertex(v1);
final DeBruijnVertex v2 = new DeBruijnVertex( kmer2, kmer2.length );
addVertex(v2);
if( isRef && vertexSet().size() == numVertexBefore ) { return false; }
final DeBruijnEdge targetEdge = getEdge(v1, v2);
if ( targetEdge == null ) {
addEdge(v1, v2, new DeBruijnEdge( isRef, multiplicity ));
} else {
if( isRef ) {
targetEdge.setIsRef( true );
}
targetEdge.setMultiplicity(targetEdge.getMultiplicity() + multiplicity);
}
return true;
}
/**
* Convenience function to add multiple vertices to the graph at once
* @param vertices one or more vertices to add
*/
public void addVertices(final DeBruijnVertex ... vertices) {
for ( final DeBruijnVertex v : vertices )
public void addVertices(final T ... vertices) {
for ( final T v : vertices )
addVertex(v);
}
/**
* Get the set of vertices connected by outgoing edges of V
* @param v a non-null vertex
* @return a set of vertices connected by outgoing edges from v
*/
public Set<T> outgoingVerticesOf(final T v) {
final Set<T> s = new HashSet<T>();
for ( final BaseEdge e : outgoingEdgesOf(v) ) {
s.add(getEdgeTarget(e));
}
return s;
}
/**
* Get the set of vertices connected to v by incoming edges
* @param v a non-null vertex
* @return a set of vertices {X} connected X -> v
*/
public Set<T> incomingVerticesOf(final T v) {
final Set<T> s = new HashSet<T>();
for ( final BaseEdge e : incomingEdgesOf(v) ) {
s.add(getEdgeSource(e));
}
return s;
}
/**
* Print out the graph in the dot language for visualization
* @param destination File to write to
@ -403,11 +353,12 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
}
}
// TODO -- generalize to support both types of graphs. Need some kind of display string function
public void printGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor) {
if ( writeHeader )
graphWriter.println("digraph assemblyGraphs {");
for( final DeBruijnEdge edge : edgeSet() ) {
for( final BaseEdge edge : edgeSet() ) {
// if( edge.getMultiplicity() > PRUNE_FACTOR ) {
graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];");
// }
@ -417,11 +368,114 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex,
//if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); }
}
for( final DeBruijnVertex v : vertexSet() ) {
for( final T v : vertexSet() ) {
graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + "\",shape=box]");
}
if ( writeHeader )
graphWriter.println("}");
}
protected void cleanNonRefPaths() {
if( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) {
return;
}
// Remove non-ref edges connected before and after the reference path
final Set<BaseEdge> edgesToCheck = new HashSet<BaseEdge>();
edgesToCheck.addAll(incomingEdgesOf(getReferenceSourceVertex()));
while( !edgesToCheck.isEmpty() ) {
final BaseEdge e = edgesToCheck.iterator().next();
if( !e.isRef() ) {
edgesToCheck.addAll( incomingEdgesOf(getEdgeSource(e)) );
removeEdge(e);
}
edgesToCheck.remove(e);
}
edgesToCheck.addAll(outgoingEdgesOf(getReferenceSinkVertex()));
while( !edgesToCheck.isEmpty() ) {
final BaseEdge e = edgesToCheck.iterator().next();
if( !e.isRef() ) {
edgesToCheck.addAll( outgoingEdgesOf(getEdgeTarget(e)) );
removeEdge(e);
}
edgesToCheck.remove(e);
}
// Run through the graph and clean up singular orphaned nodes
final List<T> verticesToRemove = new LinkedList<T>();
for( final T v : vertexSet() ) {
if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) {
verticesToRemove.add(v);
}
}
removeAllVertices(verticesToRemove);
}
protected void pruneGraph( final int pruneFactor ) {
final List<BaseEdge> edgesToRemove = new ArrayList<BaseEdge>();
for( final BaseEdge e : edgeSet() ) {
if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
edgesToRemove.add(e);
}
}
removeAllEdges(edgesToRemove);
// Run through the graph and clean up singular orphaned nodes
final List<T> verticesToRemove = new ArrayList<T>();
for( final T v : vertexSet() ) {
if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) {
verticesToRemove.add(v);
}
}
removeAllVertices(verticesToRemove);
}
public void removeVerticesNotConnectedToRef() {
final HashSet<T> toRemove = new HashSet<T>(vertexSet());
final HashSet<T> visited = new HashSet<T>();
final LinkedList<T> toVisit = new LinkedList<T>();
final T refV = getReferenceSourceVertex();
if ( refV != null ) {
toVisit.add(refV);
while ( ! toVisit.isEmpty() ) {
final T v = toVisit.pop();
if ( ! visited.contains(v) ) {
toRemove.remove(v);
visited.add(v);
for ( final T prev : incomingVerticesOf(v) ) toVisit.add(prev);
for ( final T next : outgoingVerticesOf(v) ) toVisit.add(next);
}
}
// for ( final T remove : toRemove )
// logger.info("Cleaning up nodes not attached to any reference node: " + remove.toString());
removeAllVertices(toRemove);
}
}
public static <T extends BaseVertex> boolean graphEquals(final BaseGraph<T> g1, BaseGraph<T> g2) {
if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) {
return false;
}
for( BaseEdge e1 : g1.edgeSet() ) {
boolean found = false;
for( BaseEdge e2 : g2.edgeSet() ) {
if( e1.equals(g1, e2, g2) ) { found = true; break; }
}
if( !found ) { return false; }
}
for( BaseEdge e2 : g2.edgeSet() ) {
boolean found = false;
for( BaseEdge e1 : g1.edgeSet() ) {
if( e2.equals(g2, e1, g1) ) { found = true; break; }
}
if( !found ) { return false; }
}
return true;
}
}

View File

@ -0,0 +1,148 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import java.util.Arrays;
/**
* A graph vertex that holds some sequence information
*
* @author: depristo
* @since 03/2013
*/
public class BaseVertex {
final byte[] sequence;
/**
* Create a new sequence vertex with sequence
* @param sequence a non-null, non-empty sequence of bases contained in this vertex
*/
public BaseVertex(final byte[] sequence) {
if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null");
if ( sequence.length == 0 ) throw new IllegalArgumentException("Sequence cannot be empty");
// TODO -- should we really be cloning here?
this.sequence = sequence.clone();
}
/**
* Get the length of this sequence
* @return a positive integer >= 1
*/
public int length() {
return sequence.length;
}
/**
* For testing purposes only -- low performance
* @param sequence
*/
protected BaseVertex(final String sequence) {
this(sequence.getBytes());
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
BaseVertex that = (BaseVertex) o;
if (!Arrays.equals(sequence, that.sequence)) return false;
return true;
}
@Override
public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect
return Arrays.hashCode(sequence);
}
@Override
public String toString() {
return getSequenceString();
}
/**
* Get the sequence of bases contained in this vertex
*
* Do not modify these bytes in any way!
*
* @return a non-null pointer to the bases contained in this vertex
*/
@Ensures("result != null")
public byte[] getSequence() {
// TODO -- why is this cloning? It's likely extremely expensive
return sequence.clone();
}
/**
* Get a string representation of the bases in this vertex
* @return a non-null String
*/
@Ensures("result != null")
public String getSequenceString() {
return new String(sequence);
}
/**
* Get the sequence unique to this vertex
*
* This function may not return the entire sequence stored in the vertex, as kmer graphs
* really only provide 1 base of additional sequence (the last base of the kmer).
*
* The base implementation simply returns the sequence.
*
* @param source is this vertex a source vertex (i.e., no in nodes) in the graph
* @return a byte[] of the sequence added by this vertex to the overall sequence
*/
public byte[] getAdditionalSequence(final boolean source) {
return getSequence();
}
}

View File

@ -65,8 +65,6 @@ import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.*;
@ -81,7 +79,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11;
private static final byte MIN_QUALITY = (byte) 16;
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16;
private static final int GRAPH_KMER_STEP = 6;
// Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode
@ -91,22 +89,34 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0;
private final boolean debug;
private final int onlyBuildKmerGraphOfThisSite = -1; // 35;
private final boolean debugGraphTransformations;
private final PrintStream graphWriter;
private final List<DeBruijnAssemblyGraph> graphs = new ArrayList<DeBruijnAssemblyGraph>();
private final int minKmer;
private final int maxHaplotypesToConsider;
private final byte minBaseQualityToUseInAssembly;
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
private int PRUNE_FACTOR = 2;
public DeBruijnAssembler(final boolean debug, final boolean debugGraphTransformations, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) {
protected DeBruijnAssembler() {
this(false, -1, null, 11, 1000, DEFAULT_MIN_BASE_QUALITY_TO_USE);
}
public DeBruijnAssembler(final boolean debug,
final int debugGraphTransformations,
final PrintStream graphWriter,
final int minKmer,
final int maxHaplotypesToConsider,
final byte minBaseQualityToUseInAssembly) {
super();
this.debug = debug;
this.debugGraphTransformations = debugGraphTransformations;
this.debugGraphTransformations = debugGraphTransformations > 0;
this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations;
this.graphWriter = graphWriter;
this.minKmer = minKmer;
this.maxHaplotypesToConsider = maxHaplotypesToConsider;
this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
}
/**
@ -130,199 +140,73 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
this.PRUNE_FACTOR = PRUNE_FACTOR;
// create the graphs
createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
final List<SeqGraph> graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
// print the graphs if the appropriate debug option has been turned on
if( graphWriter != null ) {
printGraphs();
printGraphs(graphs);
}
// find the best paths in the graphs and return them as haplotypes
return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
}
@Requires({"reads != null", "refHaplotype != null"})
protected void createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
graphs.clear();
protected List<SeqGraph> createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
final List<SeqGraph> graphs = new LinkedList<SeqGraph>();
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
if( maxKmer < minKmer) { return; } // Reads are too small for assembly so don't try to create any assembly graphs
if( maxKmer < minKmer) {
// Reads are too small for assembly so don't try to create any assembly graphs
return Collections.emptyList();
}
// create the graph for each possible kmer
for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) {
if ( onlyBuildKmerGraphOfThisSite != -1 && kmer != onlyBuildKmerGraphOfThisSite )
if ( debugGraphTransformations && kmer > onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms)
continue;
if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug);
DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug);
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
// do a series of steps to clean up the raw assembly graph to make it analysis-ready
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR);
graph = graph.errorCorrect();
if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), PRUNE_FACTOR);
cleanNonRefPaths(graph);
mergeNodes(graph);
if ( debugGraphTransformations ) graph.printGraph(new File("merged.dot"), PRUNE_FACTOR);
pruneGraph(graph, PRUNE_FACTOR);
if ( debugGraphTransformations ) graph.printGraph(new File("pruned.dot"), PRUNE_FACTOR);
mergeNodes(graph);
if ( debugGraphTransformations ) graph.printGraph(new File("merged2.dot"), PRUNE_FACTOR);
if( graph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference
sanityCheckReferenceGraph(graph, refHaplotype);
graphs.add(graph);
graph.cleanNonRefPaths();
final SeqGraph seqGraph = toSeqGraph(graph);
if( seqGraph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference
sanityCheckReferenceGraph(seqGraph, refHaplotype);
graphs.add(seqGraph);
if ( debugGraphTransformations ) // we only want to use one graph size
break;
}
}
}
return graphs;
}
@Requires({"graph != null"})
protected static void mergeNodes( final DeBruijnAssemblyGraph graph ) {
boolean foundNodesToMerge = true;
while( foundNodesToMerge ) {
foundNodesToMerge = false;
for( final DeBruijnEdge e : graph.edgeSet() ) {
final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e);
final DeBruijnVertex incomingVertex = graph.getEdgeSource(e);
if( !outgoingVertex.equals(incomingVertex) && graph.outDegreeOf(incomingVertex) == 1 && graph.inDegreeOf(outgoingVertex) == 1 &&
graph.inDegreeOf(incomingVertex) <= 1 && graph.outDegreeOf(outgoingVertex) <= 1 && graph.isReferenceNode(incomingVertex) == graph.isReferenceNode(outgoingVertex) ) {
final Set<DeBruijnEdge> outEdges = graph.outgoingEdgesOf(outgoingVertex);
final Set<DeBruijnEdge> inEdges = graph.incomingEdgesOf(incomingVertex);
if( inEdges.size() == 1 && outEdges.size() == 1 ) {
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
} else if( inEdges.size() == 1 ) {
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
} else if( outEdges.size() == 1 ) {
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
}
final DeBruijnVertex addedVertex = new DeBruijnVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSuffix()), outgoingVertex.kmer );
graph.addVertex(addedVertex);
for( final DeBruijnEdge edge : outEdges ) {
graph.addEdge(addedVertex, graph.getEdgeTarget(edge), new DeBruijnEdge(edge.isRef(), edge.getMultiplicity()));
}
for( final DeBruijnEdge edge : inEdges ) {
graph.addEdge(graph.getEdgeSource(edge), addedVertex, new DeBruijnEdge(edge.isRef(), edge.getMultiplicity()));
}
graph.removeVertex( incomingVertex );
graph.removeVertex( outgoingVertex );
foundNodesToMerge = true;
break;
}
}
}
private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) {
final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph();
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), PRUNE_FACTOR);
seqGraph.pruneGraph(PRUNE_FACTOR);
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR);
seqGraph.mergeNodes();
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.preclean.dot"), PRUNE_FACTOR);
seqGraph.removeVerticesNotConnectedToRef();
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), PRUNE_FACTOR);
seqGraph.mergeBranchingNodes();
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.simplified.dot"), PRUNE_FACTOR);
seqGraph.mergeNodes();
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.6.simplified.merged.dot"), PRUNE_FACTOR);
return seqGraph;
}
//
// X -> ABC -> Y
// -> aBC -> Y
//
// becomes
//
// X -> A -> BCY
// -> a -> BCY
//
// @Requires({"graph != null"})
// protected static void simplifyMergedGraph(final DeBruijnAssemblyGraph graph) {
// boolean foundNodesToMerge = true;
// while( foundNodesToMerge ) {
// foundNodesToMerge = false;
//
// for( final DeBruijnVertex v : graph.vertexSet() ) {
// if ( isRootOfComplexDiamond(v) ) {
// foundNodesToMerge = simplifyComplexDiamond(graph, v);
// if ( foundNodesToMerge )
// break;
// }
// }
// }
// }
//
// private static boolean simplifyComplexDiamond(final DeBruijnAssemblyGraph graph, final DeBruijnVertex root) {
// final Set<DeBruijnEdge> outEdges = graph.outgoingEdgesOf(root);
// final DeBruijnVertex diamondBottom = graph.getEdge(graph.getEdgeTarget(outEdges.iterator().next());
// // all of the edges point to the same sink, so it's time to merge
// final byte[] commonSuffix = commonSuffixOfEdgeTargets(outEdges, targetSink);
// if ( commonSuffix != null ) {
// final DeBruijnVertex suffixVertex = new DeBruijnVertex(commonSuffix, graph.getKmerSize());
// graph.addVertex(suffixVertex);
// graph.addEdge(suffixVertex, targetSink);
//
// for( final DeBruijnEdge edge : outEdges ) {
// final DeBruijnVertex target = graph.getEdgeTarget(edge);
// final DeBruijnVertex prefix = target.withoutSuffix(commonSuffix);
// graph.addEdge(prefix, suffixVertex, new DeBruijnEdge(edge.isRef(), edge.getMultiplicity()));
// graph.removeVertex(graph.getEdgeTarget(edge));
// graph.removeAllEdges(root, target);
// graph.removeAllEdges(target, targetSink);
// }
//
// graph.removeAllEdges(outEdges);
// graph.removeVertex(targetSink);
//
// return true;
// } else {
// return false;
// }
// }
protected static void cleanNonRefPaths( final DeBruijnAssemblyGraph graph ) {
if( graph.getReferenceSourceVertex() == null || graph.getReferenceSinkVertex() == null ) {
return;
}
// Remove non-ref edges connected before and after the reference path
final Set<DeBruijnEdge> edgesToCheck = new HashSet<DeBruijnEdge>();
edgesToCheck.addAll(graph.incomingEdgesOf(graph.getReferenceSourceVertex()));
while( !edgesToCheck.isEmpty() ) {
final DeBruijnEdge e = edgesToCheck.iterator().next();
if( !e.isRef() ) {
edgesToCheck.addAll( graph.incomingEdgesOf(graph.getEdgeSource(e)) );
graph.removeEdge(e);
}
edgesToCheck.remove(e);
}
edgesToCheck.addAll(graph.outgoingEdgesOf(graph.getReferenceSinkVertex()));
while( !edgesToCheck.isEmpty() ) {
final DeBruijnEdge e = edgesToCheck.iterator().next();
if( !e.isRef() ) {
edgesToCheck.addAll( graph.outgoingEdgesOf(graph.getEdgeTarget(e)) );
graph.removeEdge(e);
}
edgesToCheck.remove(e);
}
// Run through the graph and clean up singular orphaned nodes
final List<DeBruijnVertex> verticesToRemove = new ArrayList<DeBruijnVertex>();
for( final DeBruijnVertex v : graph.vertexSet() ) {
if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) {
verticesToRemove.add(v);
}
}
graph.removeAllVertices(verticesToRemove);
}
protected static void pruneGraph( final DeBruijnAssemblyGraph graph, final int pruneFactor ) {
final List<DeBruijnEdge> edgesToRemove = new ArrayList<DeBruijnEdge>();
for( final DeBruijnEdge e : graph.edgeSet() ) {
if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
edgesToRemove.add(e);
}
}
graph.removeAllEdges(edgesToRemove);
// Run through the graph and clean up singular orphaned nodes
final List<DeBruijnVertex> verticesToRemove = new ArrayList<DeBruijnVertex>();
for( final DeBruijnVertex v : graph.vertexSet() ) {
if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) {
verticesToRemove.add(v);
}
}
graph.removeAllVertices(verticesToRemove);
}
protected static void sanityCheckReferenceGraph(final DeBruijnAssemblyGraph graph, final Haplotype refHaplotype) {
protected <T extends BaseVertex> void sanityCheckReferenceGraph(final BaseGraph<T> graph, final Haplotype refHaplotype) {
if( graph.getReferenceSourceVertex() == null ) {
throw new IllegalStateException("All reference graphs must have a reference source vertex.");
}
@ -338,9 +222,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
}
@Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"})
protected static DeBruijnAssemblyGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(KMER_LENGTH);
final DeBruijnGraph graph = new DeBruijnGraph(KMER_LENGTH);
// First pull kmers from the reference haplotype and add them to the graph
//logger.info("Adding reference sequence to graph " + refHaplotype.getBaseString());
@ -370,7 +254,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
// if the qualities of all the bases in the kmers are high enough
boolean badKmer = false;
for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) {
if( qualities[jjj] < MIN_QUALITY ) {
if( qualities[jjj] < minBaseQualityToUseInAssembly ) {
badKmer = true;
break;
}
@ -397,11 +281,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
return graph;
}
protected void printGraphs() {
protected void printGraphs(final List<SeqGraph> graphs) {
final int writeFirstGraphWithSizeSmallerThan = 50;
graphWriter.println("digraph assemblyGraphs {");
for( final DeBruijnAssemblyGraph graph : graphs ) {
for( final SeqGraph graph : graphs ) {
if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) {
logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize());
continue;
@ -418,7 +302,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
@Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
@Ensures({"result.contains(refHaplotype)"})
private List<Haplotype> findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
private List<Haplotype> findBestPaths( final List<SeqGraph> graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
// TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm
@ -440,8 +324,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
}
}
for( final DeBruijnAssemblyGraph graph : graphs ) {
for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) {
for( final SeqGraph graph : graphs ) {
for ( final KBestPaths.Path path : new KBestPaths<SeqVertex>().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) {
Haplotype h = new Haplotype( path.getBases() );
if( !returnHaplotypes.contains(h) ) {
final Cigar cigar = path.calculateCigar();
@ -466,6 +350,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
h.setScore(path.getScore());
returnHaplotypes.add(h);
if ( debug )
logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
// for GGA mode, add the desired allele into the haplotype if it isn't already present
if( !activeAllelesToGenotype.isEmpty() ) {
final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), refWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
@ -599,7 +486,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
* @return the left-aligned cigar
*/
@Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"})
protected static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
protected Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
final Cigar cigarToReturn = new Cigar();
Cigar cigarToAlign = new Cigar();
for (int i = 0; i < cigar.numCigarElements(); i++) {

View File

@ -0,0 +1,179 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
* A DeBruijn kmer graph
*
* User: rpoplin
* Date: 2/6/13
*/
public class DeBruijnGraph extends BaseGraph<DeBruijnVertex> {
/**
* Create an empty DeBruijnGraph with default kmer size
*/
public DeBruijnGraph() {
super();
}
/**
* Create an empty DeBruijnGraph with kmer size
* @param kmerSize kmer size, must be >= 1
*/
public DeBruijnGraph(int kmerSize) {
super(kmerSize);
}
/**
* Pull kmers out of the given long sequence and throw them on in the graph
* @param sequence byte array holding the sequence with which to build the assembly graph
* @param KMER_LENGTH the desired kmer length to use
* @param isRef if true the kmers added to the graph will have reference edges linking them
*/
public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) {
if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); }
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef, 1);
}
}
/**
* Error correct the kmers in this graph, returning a new graph built from those error corrected kmers
* @return a freshly allocated graph
*/
protected DeBruijnGraph errorCorrect() {
final KMerErrorCorrector corrector = new KMerErrorCorrector(getKmerSize(), 1, 1, 5); // TODO -- should be static variables
for( final BaseEdge e : edgeSet() ) {
for ( final byte[] kmer : Arrays.asList(getEdgeSource(e).getSequence(), getEdgeTarget(e).getSequence())) {
// TODO -- need a cleaner way to deal with the ref weight
corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity());
}
}
corrector.computeErrorCorrectionMap();
final DeBruijnGraph correctedGraph = new DeBruijnGraph(getKmerSize());
for( final BaseEdge e : edgeSet() ) {
final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence());
final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence());
if ( source != null && target != null ) {
correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity());
}
}
return correctedGraph;
}
/**
* Add edge to assembly graph connecting the two kmers
* @param kmer1 the source kmer for the edge
* @param kmer2 the target kmer for the edge
* @param isRef true if the added edge is a reference edge
* @return will return false if trying to add a reference edge which creates a cycle in the assembly graph
*/
public boolean addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) {
if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); }
final int numVertexBefore = vertexSet().size();
final DeBruijnVertex v1 = new DeBruijnVertex( kmer1 );
addVertex(v1);
final DeBruijnVertex v2 = new DeBruijnVertex( kmer2 );
addVertex(v2);
if( isRef && vertexSet().size() == numVertexBefore ) { return false; }
final BaseEdge targetEdge = getEdge(v1, v2);
if ( targetEdge == null ) {
addEdge(v1, v2, new BaseEdge( isRef, multiplicity ));
} else {
if( isRef ) {
targetEdge.setIsRef( true );
}
targetEdge.setMultiplicity(targetEdge.getMultiplicity() + multiplicity);
}
return true;
}
/**
* Convert this kmer graph to a simple sequence graph.
*
* Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer
* graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence
*
* @return a newly allocated SequenceGraph
*/
@Ensures({"result != null"})
protected SeqGraph convertToSequenceGraph() {
final SeqGraph seqGraph = new SeqGraph(getKmerSize());
final Map<DeBruijnVertex, SeqVertex> vertexMap = new HashMap<DeBruijnVertex, SeqVertex>();
// create all of the equivalent seq graph vertices
for ( final DeBruijnVertex dv : vertexSet() ) {
final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv)));
vertexMap.put(dv, sv);
seqGraph.addVertex(sv);
}
// walk through the nodes and connect them to their equivalent seq vertices
for( final BaseEdge e : edgeSet() ) {
final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e));
final SeqVertex seqInV = vertexMap.get(getEdgeSource(e));
seqGraph.addEdge(seqInV, seqOutV, e);
}
return seqGraph;
}
}

View File

@ -52,59 +52,50 @@ import com.google.java.contract.Invariant;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* simple node class for storing kmer sequences
*
* User: ebanks
* Date: Mar 23, 2011
*/
// simple node class for storing kmer sequences
@Invariant("kmer > 0")
public class DeBruijnVertex {
protected final byte[] sequence;
public final int kmer;
public DeBruijnVertex( final byte[] sequence, final int kmer ) {
this.sequence = sequence.clone();
this.kmer = kmer;
}
protected DeBruijnVertex( final String sequence, final int kmer ) {
this(sequence.getBytes(), kmer);
public class DeBruijnVertex extends BaseVertex {
public DeBruijnVertex( final byte[] sequence ) {
super(sequence);
}
/**
* For testing purposes only
* @param sequence
*/
protected DeBruijnVertex( final String sequence ) {
this(sequence.getBytes(), sequence.length());
this(sequence.getBytes());
}
/**
* Get the kmer size for this DeBruijnVertex
* @return integer >= 1
*/
@Ensures("result >= 1")
public int getKmer() {
return kmer;
return sequence.length;
}
@Override
public boolean equals( Object v ) {
return v instanceof DeBruijnVertex && Arrays.equals(sequence, ((DeBruijnVertex) v).sequence);
}
@Override
public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect
return Arrays.hashCode(sequence);
}
public String toString() {
return new String(sequence);
}
/**
* Get the string representation of the suffix of this DeBruijnVertex
* @return a non-null non-empty string
*/
@Ensures({"result != null", "result.length() >= 1"})
public String getSuffixString() {
return new String(getSuffix());
}
@Ensures("result != null")
public byte[] getSequence() {
return sequence.clone();
// TODO this could be replaced with byte as the suffix is guarenteed to be exactly 1 base
public byte[] getSuffix() {
return Arrays.copyOfRange( sequence, getKmer() - 1, sequence.length );
}
@Ensures("result != null")
public byte[] getSuffix() {
return Arrays.copyOfRange( sequence, kmer - 1, sequence.length );
@Override
public byte[] getAdditionalSequence(boolean source) {
return source ? super.getAdditionalSequence(source) : getSuffix();
}
}

View File

@ -284,8 +284,11 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false)
protected boolean DEBUG;
@Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler", required = false)
protected boolean debugGraphTransformations = false;
@Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false)
protected int debugGraphTransformations = -1;
@Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false)
protected boolean useLowQualityBasesForAssembly = false;
// the UG engines
private UnifiedGenotyperEngine UG_engine = null;
@ -389,7 +392,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
}
assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, maxHaplotypesToConsider );
final byte minBaseQualityToUseInAssembly = useLowQualityBasesForAssembly ? (byte)1 : DeBruijnAssembler.DEFAULT_MIN_BASE_QUALITY_TO_USE;
assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, maxHaplotypesToConsider, minBaseQualityToUseInAssembly );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS );
@ -610,7 +614,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
for( final GATKSAMRecord myRead : finalizedReadList ) {
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
GATKSAMRecord clippedRead = useLowQualityBasesForAssembly ? postAdapterRead : ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't

View File

@ -52,13 +52,8 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.Serializable;
import java.util.*;
@ -70,28 +65,27 @@ import java.util.*;
*/
// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph.
// This is different from most graph traversals because we want to test paths from any source node to any sink node.
public class KBestPaths {
public class KBestPaths<T extends BaseVertex> {
// static access only
protected KBestPaths() { }
public KBestPaths() { }
private static int MAX_PATHS_TO_HOLD = 100;
protected static class MyInt { public int val = 0; }
// class to keep track of paths
protected static class Path {
protected static class Path<T extends BaseVertex> {
// the last vertex seen in the path
private final DeBruijnVertex lastVertex;
private final T lastVertex;
// the list of edges comprising the path
private final List<DeBruijnEdge> edges;
private final List<BaseEdge> edges;
// the scores for the path
private final int totalScore;
// the graph from which this path originated
private final DeBruijnAssemblyGraph graph;
private final BaseGraph<T> graph;
// used in the bubble state machine to apply Smith-Waterman to the bubble sequence
// these values were chosen via optimization against the NA12878 knowledge base
@ -101,19 +95,19 @@ public class KBestPaths {
private static final double SW_GAP_EXTEND = -1.1;
private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes();
public Path( final DeBruijnVertex initialVertex, final DeBruijnAssemblyGraph graph ) {
public Path( final T initialVertex, final BaseGraph<T> graph ) {
lastVertex = initialVertex;
edges = new ArrayList<DeBruijnEdge>(0);
edges = new ArrayList<BaseEdge>(0);
totalScore = 0;
this.graph = graph;
}
public Path( final Path p, final DeBruijnEdge edge ) {
public Path( final Path<T> p, final BaseEdge edge ) {
if( !p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); }
graph = p.graph;
lastVertex = p.graph.getEdgeTarget(edge);
edges = new ArrayList<DeBruijnEdge>(p.edges);
edges = new ArrayList<BaseEdge>(p.edges);
edges.add(edge);
totalScore = p.totalScore + edge.getMultiplicity();
}
@ -123,10 +117,10 @@ public class KBestPaths {
* @param edge the given edge to test
* @return true if the edge is found in this path
*/
public boolean containsEdge( final DeBruijnEdge edge ) {
public boolean containsEdge( final BaseEdge edge ) {
if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); }
for( final DeBruijnEdge e : edges ) {
for( final BaseEdge e : edges ) {
if( e.equals(graph, edge) ) {
return true;
}
@ -140,11 +134,11 @@ public class KBestPaths {
* @param edge the given edge to test
* @return number of times this edge appears in the path
*/
public int numInPath( final DeBruijnEdge edge ) {
public int numInPath( final BaseEdge edge ) {
if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); }
int numInPath = 0;
for( final DeBruijnEdge e : edges ) {
for( final BaseEdge e : edges ) {
if( e.equals(graph, edge) ) {
numInPath++;
}
@ -153,22 +147,11 @@ public class KBestPaths {
return numInPath;
}
/**
* Does this path contain a reference edge?
* @return true if the path contains a reference edge
*/
public boolean containsRefEdge() {
for( final DeBruijnEdge e : edges ) {
if( e.isRef() ) { return true; }
}
return false;
}
public List<DeBruijnEdge> getEdges() { return edges; }
public List<BaseEdge> getEdges() { return edges; }
public int getScore() { return totalScore; }
public DeBruijnVertex getLastVertexInPath() { return lastVertex; }
public T getLastVertexInPath() { return lastVertex; }
/**
* The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes
@ -179,7 +162,7 @@ public class KBestPaths {
if( edges.size() == 0 ) { return graph.getAdditionalSequence(lastVertex); }
byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edges.get(0)));
for( final DeBruijnEdge e : edges ) {
for( final BaseEdge e : edges ) {
bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e)));
}
return bases;
@ -201,9 +184,9 @@ public class KBestPaths {
}
// reset the bubble state machine
final BubbleStateMachine bsm = new BubbleStateMachine(cigar);
final BubbleStateMachine<T> bsm = new BubbleStateMachine<T>(cigar);
for( final DeBruijnEdge e : edges ) {
for( final BaseEdge e : edges ) {
if( e.equals(graph, edges.get(0)) ) {
advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null );
}
@ -231,7 +214,7 @@ public class KBestPaths {
* @param e the edge which generated this node in the path
*/
@Requires({"bsm != null", "graph != null", "node != null"})
private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final DeBruijnVertex node, final DeBruijnEdge e ) {
private void advanceBubbleStateMachine( final BubbleStateMachine<T> bsm, final T node, final BaseEdge e ) {
if( graph.isReferenceNode( node ) ) {
if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else
if( e !=null && !e.isRef() ) {
@ -283,7 +266,7 @@ public class KBestPaths {
*/
@Requires({"graph != null"})
@Ensures({"result != null"})
private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex ) {
private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final T fromVertex, final T toVertex ) {
final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null);
final Cigar returnCigar = new Cigar();
@ -328,10 +311,10 @@ public class KBestPaths {
}
// class to keep track of the bubble state machine
protected static class BubbleStateMachine {
protected static class BubbleStateMachine<T extends BaseVertex> {
public boolean inBubble = false;
public byte[] bubbleBytes = null;
public DeBruijnVertex lastSeenReferenceNode = null;
public T lastSeenReferenceNode = null;
public Cigar cigar = null;
public BubbleStateMachine( final Cigar initialCigar ) {
@ -358,14 +341,14 @@ public class KBestPaths {
* @return a list with at most k top-scoring paths from the graph
*/
@Ensures({"result != null", "result.size() <= k"})
public static List<Path> getKBestPaths( final DeBruijnAssemblyGraph graph, final int k ) {
public List<Path> getKBestPaths( final BaseGraph<T> graph, final int k ) {
if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); }
if( k > MAX_PATHS_TO_HOLD/2 ) { throw new IllegalArgumentException("Asked for more paths than internal parameters allow for."); }
final ArrayList<Path> bestPaths = new ArrayList<Path>();
// run a DFS for best paths
for( final DeBruijnVertex v : graph.vertexSet() ) {
for( final T v : graph.vertexSet() ) {
if( graph.inDegreeOf(v) == 0 ) {
findBestPaths(new Path(v, graph), bestPaths);
}
@ -376,31 +359,28 @@ public class KBestPaths {
return bestPaths.subList(0, Math.min(k, bestPaths.size()));
}
private static void findBestPaths( final Path path, final List<Path> bestPaths ) {
private void findBestPaths( final Path path, final List<Path> bestPaths ) {
findBestPaths(path, bestPaths, new MyInt());
}
private static void findBestPaths( final Path path, final List<Path> bestPaths, final MyInt n ) {
private void findBestPaths( final Path path, final List<Path> bestPaths, final MyInt n ) {
// did we hit the end of a path?
if ( allOutgoingEdgesHaveBeenVisited(path) ) {
if( path.containsRefEdge() ) {
if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) {
// clean out some low scoring paths
Collections.sort(bestPaths, new PathComparatorTotalScore() );
for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20
}
bestPaths.add(path);
if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) {
// clean out some low scoring paths
Collections.sort(bestPaths, new PathComparatorTotalScore() );
for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20
}
bestPaths.add(path);
} else if( n.val > 10000) {
// do nothing, just return
} else {
// recursively run DFS
final ArrayList<DeBruijnEdge> edgeArrayList = new ArrayList<DeBruijnEdge>();
final ArrayList<BaseEdge> edgeArrayList = new ArrayList<BaseEdge>();
edgeArrayList.addAll(path.graph.outgoingEdgesOf(path.lastVertex));
Collections.sort(edgeArrayList, new DeBruijnEdge.EdgeWeightComparator());
Collections.reverse(edgeArrayList);
for ( final DeBruijnEdge edge : edgeArrayList ) {
Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator());
for ( final BaseEdge edge : edgeArrayList ) {
// make sure the edge is not already in the path
if ( path.containsEdge(edge) )
continue;
@ -416,8 +396,8 @@ public class KBestPaths {
* @param path the path to test
* @return true if all the outgoing edges at the end of this path have already been visited
*/
private static boolean allOutgoingEdgesHaveBeenVisited( final Path path ) {
for( final DeBruijnEdge edge : path.graph.outgoingEdgesOf(path.lastVertex) ) {
private boolean allOutgoingEdgesHaveBeenVisited( final Path<T> path ) {
for( final BaseEdge edge : path.graph.outgoingEdgesOf(path.lastVertex) ) {
if( !path.containsEdge(edge) ) { // TODO -- investigate allowing numInPath < 2 to allow cycles
return false;
}

View File

@ -226,28 +226,16 @@ public class KMerErrorCorrector {
@Override
public String toString() {
final StringBuilder b = new StringBuilder("KMerErrorCorrector{");
for ( Map.Entry<String, String> toCorrect : rawToErrorCorrectedMap.entrySet() ) {
final boolean correcting = ! toCorrect.getKey().equals(toCorrect.getValue());
if ( correcting )
b.append(String.format("%n\t%s / %d -> %s / %d [correcting? %b]",
toCorrect.getKey(), getCounts(toCorrect.getKey()),
toCorrect.getValue(), getCounts(toCorrect.getValue()),
correcting));
if ( rawToErrorCorrectedMap == null ) {
b.append("counting ").append(countsByKMer.size()).append(" distinct kmers");
} else {
for ( Map.Entry<String, String> toCorrect : rawToErrorCorrectedMap.entrySet() ) {
final boolean correcting = ! toCorrect.getKey().equals(toCorrect.getValue());
if ( correcting )
b.append(String.format("%n\tCorrecting %s -> %s", toCorrect.getKey(), toCorrect.getValue()));
}
}
b.append("\n}");
return b.toString();
}
/**
* Get a simple count estimate for printing for kmer
* @param kmer the kmer
* @return an integer count for kmer
*/
private int getCounts(final String kmer) {
if ( kmer == null ) return 0;
final Integer count = countsByKMer == null ? -1 : countsByKMer.get(kmer);
if ( count == null )
throw new IllegalArgumentException("kmer not found in counts -- bug " + kmer);
return count;
}
}

View File

@ -0,0 +1,280 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import java.util.*;
/**
* A graph that contains base sequence at each node
*
* @author: depristo
* @since 03/2013
*/
public class SeqGraph extends BaseGraph<SeqVertex> {
/**
* Construct an empty SeqGraph
*/
public SeqGraph() {
super();
}
/**
* Construct an empty SeqGraph where we'll add nodes based on a kmer size of kmer
*
* The kmer size is purely information. It is useful when converting a Debruijn graph -> SeqGraph
* for us to track the kmer used to make the transformation.
*
* @param kmer kmer
*/
public SeqGraph(final int kmer) {
super(kmer);
}
protected void mergeNodes() {
zipLinearChains();
}
protected void zipLinearChains() {
boolean foundNodesToMerge = true;
while( foundNodesToMerge ) {
foundNodesToMerge = false;
for( final BaseEdge e : edgeSet() ) {
final SeqVertex outgoingVertex = getEdgeTarget(e);
final SeqVertex incomingVertex = getEdgeSource(e);
if( !outgoingVertex.equals(incomingVertex)
&& outDegreeOf(incomingVertex) == 1 && inDegreeOf(outgoingVertex) == 1
&& isReferenceNode(incomingVertex) == isReferenceNode(outgoingVertex) ) {
final Set<BaseEdge> outEdges = outgoingEdgesOf(outgoingVertex);
final Set<BaseEdge> inEdges = incomingEdgesOf(incomingVertex);
if( inEdges.size() == 1 && outEdges.size() == 1 ) {
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
} else if( inEdges.size() == 1 ) {
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
} else if( outEdges.size() == 1 ) {
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
}
final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) );
addVertex(addedVertex);
for( final BaseEdge edge : outEdges ) {
addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge.isRef(), edge.getMultiplicity()));
}
for( final BaseEdge edge : inEdges ) {
addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge.isRef(), edge.getMultiplicity()));
}
removeVertex(incomingVertex);
removeVertex(outgoingVertex);
foundNodesToMerge = true;
break;
}
}
}
}
//
// X -> ABC -> Y
// -> aBC -> Y
//
// becomes
//
// X -> A -> BCY
// -> a -> BCY
//
public void mergeBranchingNodes() {
boolean foundNodesToMerge = true;
while( foundNodesToMerge ) {
foundNodesToMerge = false;
for( final SeqVertex v : vertexSet() ) {
foundNodesToMerge = simplifyDiamond(v);
if ( foundNodesToMerge )
break;
}
}
}
/**
* A simple structure that looks like:
*
* v
* / | \ \
* m1 m2 m3 ... mn
* \ | / /
* b
*
* @param v
* @return
*/
protected boolean isRootOfDiamond(final SeqVertex v) {
final Set<BaseEdge> ve = outgoingEdgesOf(v);
if ( ve.size() <= 1 )
return false;
SeqVertex bottom = null;
for ( final BaseEdge e : ve ) {
final SeqVertex mi = getEdgeTarget(e);
// all nodes must have at least 1 connection
if ( outDegreeOf(mi) < 1 )
return false;
// can only have 1 incoming node, the root vertex
if ( inDegreeOf(mi) != 1 )
return false;
for ( final SeqVertex mt : outgoingVerticesOf(mi) ) {
if ( bottom == null )
bottom = mt;
else if ( ! bottom.equals(mt) )
return false;
}
}
return true;
}
private byte[] commonSuffixOfEdgeTargets(final Set<SeqVertex> middleVertices) {
final String[] kmers = new String[middleVertices.size()];
int i = 0;
for ( final SeqVertex v : middleVertices ) {
kmers[i++] = (StringUtils.reverse(v.getSequenceString()));
}
final String commonPrefix = StringUtils.getCommonPrefix(kmers);
return commonPrefix.equals("") ? null : StringUtils.reverse(commonPrefix).getBytes();
}
private SeqVertex getDiamondBottom(final SeqVertex top) {
final BaseEdge topEdge = outgoingEdgesOf(top).iterator().next();
final SeqVertex middle = getEdgeTarget(topEdge);
final BaseEdge middleEdge = outgoingEdgesOf(middle).iterator().next();
return getEdgeTarget(middleEdge);
}
final Set<SeqVertex> getMiddleVertices(final SeqVertex top) {
final Set<SeqVertex> middles = new HashSet<SeqVertex>();
for ( final BaseEdge topToMiddle : outgoingEdgesOf(top) ) {
middles.add(getEdgeTarget(topToMiddle));
}
return middles;
}
private boolean simplifyDiamond(final SeqVertex top) {
if ( ! isRootOfDiamond(top) )
return false;
final SeqVertex diamondBottom = getDiamondBottom(top);
final Set<SeqVertex> middleVertices = getMiddleVertices(top);
final List<SeqVertex> verticesToRemove = new LinkedList<SeqVertex>();
final List<BaseEdge> edgesToRemove = new LinkedList<BaseEdge>();
// all of the edges point to the same sink, so it's time to merge
final byte[] commonSuffix = commonSuffixOfEdgeTargets(middleVertices);
if ( commonSuffix != null ) {
boolean newBottomEdgeIsRef = false;
int newBottomEdgeMultiplicity = 0;
final SeqVertex newBottomV = new SeqVertex(commonSuffix);
addVertex(newBottomV);
for ( final SeqVertex middle : middleVertices ) {
boolean missingNodeEdgeIsRef = false;
int missingNodeMultiplicity = 0;
final SeqVertex withoutSuffix = middle.withoutSuffix(commonSuffix);
if ( withoutSuffix != null ) // this node is a deletion
addVertex(withoutSuffix);
// update all edges from top -> middle to be top -> without suffix
for( final BaseEdge topToMiddleEdge : getAllEdges(top, middle) ) {
edgesToRemove.add(topToMiddleEdge);
missingNodeMultiplicity += topToMiddleEdge.getMultiplicity();
missingNodeEdgeIsRef = missingNodeEdgeIsRef || topToMiddleEdge.isRef();
if ( withoutSuffix != null ) // this node is a deletion
addEdge(top, withoutSuffix, new BaseEdge(topToMiddleEdge.isRef(), topToMiddleEdge.getMultiplicity()));
}
// reattached prefix to the new bottom V by updating all edges from middleV -> bottom
for ( final BaseEdge middleToBottomE : getAllEdges(middle, diamondBottom) ) {
missingNodeMultiplicity += middleToBottomE.getMultiplicity();
missingNodeEdgeIsRef = missingNodeEdgeIsRef || middleToBottomE.isRef();
if ( withoutSuffix != null ) // this node is a deletion
addEdge(withoutSuffix, newBottomV, new BaseEdge(middleToBottomE.isRef(), middleToBottomE.getMultiplicity()));
edgesToRemove.add(middleToBottomE);
// update the info for the new bottom edge
newBottomEdgeIsRef = newBottomEdgeIsRef || middleToBottomE.isRef();
newBottomEdgeMultiplicity += middleToBottomE.getMultiplicity();
}
if ( withoutSuffix == null ) // add an edge from top to new bottom
addEdge(top, newBottomV, new BaseEdge(missingNodeEdgeIsRef, missingNodeMultiplicity));
verticesToRemove.add(middle);
}
addEdge(newBottomV, diamondBottom, new BaseEdge(newBottomEdgeIsRef, newBottomEdgeMultiplicity));
removeAllEdges(edgesToRemove);
removeAllVertices(verticesToRemove);
return true;
} else {
return false;
}
}
}

View File

@ -0,0 +1,153 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.Utils;
import java.util.Arrays;
/**
* A graph vertex containing a sequence of bases and a unique ID that
* allows multiple distinct nodes in the graph to have the same sequence.
*
* This is essential when thinking about representing the actual sequence of a haplotype
* in a graph. There can be many parts of the sequence that have the same sequence, but
* are distinct elements in the graph because they have a different position in the graph. For example:
*
* A -> C -> G -> A -> T
*
* The two As are not the same, because they occur with different connections. In a kmer graph equals()
* is based on the sequence itself, as each distinct kmer can only be represented once. But the transformation
* of the kmer graph into a graph of base sequences, without their kmer prefixes, means that nodes that
* where once unique including their prefix can become equal after shedding the prefix. So we need to
* use some mechanism -- here a unique ID per node -- to separate nodes that have the same sequence
* but are distinct elements of the graph.
*
* @author: depristo
* @since 03/2013
*/
public class SeqVertex extends BaseVertex {
private static int idCounter = 0;
public final int id;
/**
* Create a new SeqVertex with sequence and the next available id
* @param sequence our base sequence
*/
public SeqVertex(final byte[] sequence) {
super(sequence);
this.id = idCounter++;
}
/**
* Create a new SeqVertex having bases of sequence.getBytes()
* @param sequence the string representation of our bases
*/
public SeqVertex(final String sequence) {
super(sequence);
this.id = idCounter++;
}
/**
* Create a copy of toCopy
* @param toCopy a SeqVertex to copy into this newly allocated one
*/
public SeqVertex(final SeqVertex toCopy) {
super(toCopy.sequence);
this.id = toCopy.id;
}
/**
* Get the unique ID for this SeqVertex
* @return a positive integer >= 0
*/
public int getId() {
return id;
}
@Override
public String toString() {
return "SeqVertex_id_" + id + "_seq_" + getSequenceString();
}
/**
* Two SeqVertex are equal only if their ids are equal
* @param o
* @return
*/
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
SeqVertex seqVertex = (SeqVertex) o;
if (id != seqVertex.id) return false;
// note that we don't test for super equality here because the ids are unique
//if (!super.equals(o)) return false;
return true;
}
@Override
public int hashCode() {
return id;
}
/**
* Return a new SeqVertex derived from this one but not including the suffix bases
*
* @param suffix the suffix bases to remove from this vertex
* @return a newly allocated SeqVertex with appropriate prefix, or null if suffix removes all bases from this node
*/
@Requires("Utils.endsWith(sequence, suffix)")
public SeqVertex withoutSuffix(final byte[] suffix) {
final int prefixSize = sequence.length - suffix.length;
return prefixSize > 0 ? new SeqVertex(Arrays.copyOf(sequence, prefixSize)) : null;
}
}

View File

@ -0,0 +1,105 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class BaseEdgeUnitTest extends BaseTest {
@DataProvider(name = "EdgeCreationData")
public Object[][] makeMyDataProvider() {
List<Object[]> tests = new ArrayList<Object[]>();
// this functionality can be adapted to provide input data for whatever you might want in your data
for ( final int multiplicity : Arrays.asList(1, 2, 3) ) {
for ( final boolean isRef : Arrays.asList(true, false) ) {
tests.add(new Object[]{isRef, multiplicity});
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "EdgeCreationData")
public void testBasic(final boolean isRef, final int mult) {
final BaseEdge e = new BaseEdge(isRef, mult);
Assert.assertEquals(e.isRef(), isRef);
Assert.assertEquals(e.getMultiplicity(), mult);
e.setIsRef(!isRef);
Assert.assertEquals(e.isRef(), !isRef);
e.setMultiplicity(mult + 1);
Assert.assertEquals(e.getMultiplicity(), mult + 1);
final BaseEdge copy = new BaseEdge(e);
Assert.assertEquals(copy.isRef(), e.isRef());
Assert.assertEquals(copy.getMultiplicity(), e.getMultiplicity());
}
@Test
public void testEdgeWeightComparator() {
final BaseEdge e10 = new BaseEdge(false, 10);
final BaseEdge e5 = new BaseEdge(true, 5);
final BaseEdge e2 = new BaseEdge(false, 2);
final BaseEdge e1 = new BaseEdge(false, 1);
final List<BaseEdge> edges = new ArrayList<BaseEdge>(Arrays.asList(e1, e2, e5, e10));
Collections.sort(edges, new BaseEdge.EdgeWeightComparator());
Assert.assertEquals(edges.get(0), e10);
Assert.assertEquals(edges.get(1), e5);
Assert.assertEquals(edges.get(2), e2);
Assert.assertEquals(edges.get(3), e1);
}
}

View File

@ -0,0 +1,192 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
import scala.actors.threadpool.Arrays;
import java.io.File;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
* Created with IntelliJ IDEA.
* User: depristo
* Date: 3/15/13
* Time: 3:36 PM
* To change this template use File | Settings | File Templates.
*/
public class BaseGraphUnitTest extends BaseTest {
SeqGraph graph;
SeqVertex v1, v2, v3, v4, v5;
@BeforeMethod
public void setUp() throws Exception {
graph = new SeqGraph();
v1 = new SeqVertex("A");
v2 = new SeqVertex("C");
v3 = new SeqVertex("C");
v4 = new SeqVertex("C");
v5 = new SeqVertex("C");
graph.addVertices(v1, v2, v3, v4, v5);
graph.addEdge(v1, v2);
graph.addEdge(v2, v4);
graph.addEdge(v3, v2);
graph.addEdge(v2, v3);
graph.addEdge(v4, v5);
}
@Test
public void testIncomingAndOutgoingVertices() throws Exception {
assertVertexSetEquals(graph.outgoingVerticesOf(v1), v2);
assertVertexSetEquals(graph.incomingVerticesOf(v1));
assertVertexSetEquals(graph.outgoingVerticesOf(v2), v3, v4);
assertVertexSetEquals(graph.incomingVerticesOf(v2), v1, v3);
assertVertexSetEquals(graph.outgoingVerticesOf(v3), v2);
assertVertexSetEquals(graph.incomingVerticesOf(v3), v2);
assertVertexSetEquals(graph.outgoingVerticesOf(v4), v5);
assertVertexSetEquals(graph.incomingVerticesOf(v4), v2);
assertVertexSetEquals(graph.outgoingVerticesOf(v5));
assertVertexSetEquals(graph.incomingVerticesOf(v5), v4);
}
@Test
public void testPrintEmptyGraph() throws Exception {
final File tmp = File.createTempFile("tmp", "dot");
tmp.deleteOnExit();
new SeqGraph().printGraph(tmp, 10);
new DeBruijnGraph().printGraph(tmp, 10);
}
@Test
public void testComplexGraph() throws Exception {
final File tmp = File.createTempFile("tmp", "dot");
tmp.deleteOnExit();
graph.printGraph(tmp, 10);
}
private void assertVertexSetEquals(final Set<SeqVertex> actual, final SeqVertex ... expected) {
final Set<SeqVertex> expectedSet = expected == null ? Collections.<SeqVertex>emptySet() : new HashSet<SeqVertex>(Arrays.asList(expected));
Assert.assertEquals(actual, expectedSet);
}
@Test(enabled = true)
public void testPruneGraph() {
DeBruijnGraph graph = new DeBruijnGraph();
DeBruijnGraph expectedGraph = new DeBruijnGraph();
DeBruijnVertex v = new DeBruijnVertex("ATGG");
DeBruijnVertex v2 = new DeBruijnVertex("ATGGA");
DeBruijnVertex v3 = new DeBruijnVertex("ATGGT");
DeBruijnVertex v4 = new DeBruijnVertex("ATGGG");
DeBruijnVertex v5 = new DeBruijnVertex("ATGGC");
DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC");
graph.addVertex(v);
graph.addVertex(v2);
graph.addVertex(v3);
graph.addVertex(v4);
graph.addVertex(v5);
graph.addVertex(v6);
graph.addEdge(v, v2, new BaseEdge(false, 1));
graph.addEdge(v2, v3, new BaseEdge(false, 3));
graph.addEdge(v3, v4, new BaseEdge(false, 5));
graph.addEdge(v4, v5, new BaseEdge(false, 3));
graph.addEdge(v5, v6, new BaseEdge(false, 2));
expectedGraph.addVertex(v2);
expectedGraph.addVertex(v3);
expectedGraph.addVertex(v4);
expectedGraph.addVertex(v5);
expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3));
expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5));
expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3));
graph.pruneGraph(2);
Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
graph = new DeBruijnGraph();
expectedGraph = new DeBruijnGraph();
graph.addVertex(v);
graph.addVertex(v2);
graph.addVertex(v3);
graph.addVertex(v4);
graph.addVertex(v5);
graph.addVertex(v6);
graph.addEdge(v, v2, new BaseEdge(true, 1));
graph.addEdge(v2, v3, new BaseEdge(false, 3));
graph.addEdge(v3, v4, new BaseEdge(false, 5));
graph.addEdge(v4, v5, new BaseEdge(false, 3));
expectedGraph.addVertex(v);
expectedGraph.addVertex(v2);
expectedGraph.addVertex(v3);
expectedGraph.addVertex(v4);
expectedGraph.addVertex(v5);
expectedGraph.addEdge(v, v2, new BaseEdge(true, 1));
expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3));
expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5));
expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3));
graph.pruneGraph(2);
Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
}
}

View File

@ -0,0 +1,91 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.Test;
public class BaseVertexUnitTest extends BaseTest {
@Test
public void testBasic() {
final byte[] bases = "ACT".getBytes();
final BaseVertex v = new BaseVertex(bases);
Assert.assertEquals(v.getSequence(), bases);
Assert.assertEquals(v.getAdditionalSequence(false), bases);
Assert.assertEquals(v.getAdditionalSequence(true), bases);
Assert.assertEquals(v.getSequenceString(), new String(bases));
Assert.assertEquals(v.toString(), v.getSequenceString());
Assert.assertEquals(v.length(), bases.length);
}
@Test(expectedExceptions = IllegalArgumentException.class)
public void testCreationNull() {
new BaseVertex((byte[])null);
}
@Test(expectedExceptions = IllegalArgumentException.class)
public void testCreationEmptySeq() {
new BaseVertex(new byte[0]);
}
@Test
public void testEqualsAndHashCode() {
final BaseVertex v1 = new BaseVertex("ACT".getBytes());
final BaseVertex v1_eq = new BaseVertex("ACT".getBytes());
final BaseVertex v2 = new BaseVertex("ACG".getBytes());
Assert.assertEquals(v1, v1);
Assert.assertEquals(v1.hashCode(), v1.hashCode());
Assert.assertEquals(v1, v1_eq);
Assert.assertEquals(v1.hashCode(), v1_eq.hashCode());
Assert.assertFalse(v1.equals(v2));
Assert.assertFalse(v2.equals(v1));
Assert.assertFalse(v2.hashCode() == v1.hashCode());
Assert.assertFalse(v2.equals(v1));
}
}

View File

@ -69,211 +69,12 @@ import java.util.*;
public class DeBruijnAssemblerUnitTest extends BaseTest {
private final static boolean DEBUG = true;
private class MergeNodesWithNoVariationTestProvider extends TestDataProvider {
public byte[] sequence;
public int KMER_LENGTH;
public MergeNodesWithNoVariationTestProvider(String seq, int kmer) {
super(MergeNodesWithNoVariationTestProvider.class, String.format("Merge nodes with no variation test. kmer = %d, seq = %s", kmer, seq));
sequence = seq.getBytes();
KMER_LENGTH = kmer;
}
public DeBruijnAssemblyGraph expectedGraph() {
DeBruijnVertex v = new DeBruijnVertex(sequence, KMER_LENGTH);
DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
graph.addVertex(v);
return graph;
}
public DeBruijnAssemblyGraph calcGraph() {
DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for (int i = 0; i < kmersInSequence - 1; i++) {
// get the kmers
final byte[] kmer1 = new byte[KMER_LENGTH];
System.arraycopy(sequence, i, kmer1, 0, KMER_LENGTH);
final byte[] kmer2 = new byte[KMER_LENGTH];
System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH);
graph.addKmersToGraph(kmer1, kmer2, false, 1);
}
DeBruijnAssembler.mergeNodes(graph);
return graph;
}
}
@DataProvider(name = "MergeNodesWithNoVariationTestProvider")
public Object[][] makeMergeNodesWithNoVariationTests() {
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 3);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 4);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 5);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 6);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 7);
new MergeNodesWithNoVariationTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6);
new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66);
new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76);
return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class);
}
@Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = !DEBUG)
public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) {
logger.warn(String.format("Test: %s", cfg.toString()));
Assert.assertTrue(graphEquals(cfg.calcGraph(), cfg.expectedGraph()));
}
// @DataProvider(name = "SimpleMergeOperationsData")
// public Object[][] makeSimpleMergeOperationsData() {
// List<Object[]> tests = new ArrayList<Object[]>();
//
// {
// DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
// DeBruijnVertex v1 = new DeBruijnVertex("AT");
// DeBruijnVertex v2 = new DeBruijnVertex("TC");
// DeBruijnVertex v3 = new DeBruijnVertex("CT");
// DeBruijnVertex v4 = new DeBruijnVertex("TG");
// DeBruijnVertex v5 = new DeBruijnVertex("AG");
// DeBruijnVertex v6 = new DeBruijnVertex("GG");
// DeBruijnVertex v7 = new DeBruijnVertex("GA");
// DeBruijnVertex v8 = new DeBruijnVertex("AA");
//
// graph.addVertices(v1, v2, v3, v4, v5, v6, v7, v8);
// graph.addEdge(v1, v2, new DeBruijnEdge(false, 2));
// graph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
// graph.addEdge(v2, v4, new DeBruijnEdge(false, 5));
// graph.addEdge(v3, v5, new DeBruijnEdge(false, 3));
// graph.addEdge(v4, v6, new DeBruijnEdge(false, 3));
// graph.addEdge(v5, v7, new DeBruijnEdge(false, 2));
// graph.addEdge(v6, v7, new DeBruijnEdge(false, 6));
// graph.addEdge(v7, v8, new DeBruijnEdge(false, 2));
//
// graph.printGraph(new File("unittest.dot"), 1);
//
// DeBruijnAssemblyGraph expected = new DeBruijnAssemblyGraph();
// DeBruijnVertex e1 = new DeBruijnVertex("ATC");
// DeBruijnVertex e2 = new DeBruijnVertex("T");
// DeBruijnVertex e3 = new DeBruijnVertex("G");
// DeBruijnVertex e4 = new DeBruijnVertex("GAA");
//
// expected.addVertices(e1,e2,e3,e4);
// expected.addEdge(e1, e2, new DeBruijnEdge(false, 3));
// expected.addEdge(e1, e3, new DeBruijnEdge(false, 5));
// expected.addEdge(e2, e4, new DeBruijnEdge(false, 2));
// expected.addEdge(e3, e4, new DeBruijnEdge(false, 6));
//
// expected.printGraph(new File("expected.dot"), 1);
//
// tests.add(new Object[]{graph.clone(), expected});
// }
//
// return tests.toArray(new Object[][]{});
// }
//
// @Test(dataProvider = "SimpleMergeOperationsData", enabled = true)
// public void testSimpleMergeOperations(final DeBruijnAssemblyGraph unmergedGraph, final DeBruijnAssemblyGraph expectedGraph) throws Exception {
// final DeBruijnAssemblyGraph mergedGraph = (DeBruijnAssemblyGraph)unmergedGraph.clone();
// DeBruijnAssembler.mergeNodes(mergedGraph);
// mergedGraph.printGraph(new File("merged.dot"), 1);
// DeBruijnAssembler.simplifyMergedGraph(mergedGraph);
// mergedGraph.printGraph(new File("reduced.dot"), 1);
// Assert.assertTrue(graphEquals(mergedGraph, expectedGraph));
// }
@Test(enabled = !DEBUG)
public void testPruneGraph() {
DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
DeBruijnAssemblyGraph expectedGraph = new DeBruijnAssemblyGraph();
DeBruijnVertex v = new DeBruijnVertex("ATGG".getBytes(), 1);
DeBruijnVertex v2 = new DeBruijnVertex("ATGGA".getBytes(), 1);
DeBruijnVertex v3 = new DeBruijnVertex("ATGGT".getBytes(), 1);
DeBruijnVertex v4 = new DeBruijnVertex("ATGGG".getBytes(), 1);
DeBruijnVertex v5 = new DeBruijnVertex("ATGGC".getBytes(), 1);
DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC".getBytes(), 1);
graph.addVertex(v);
graph.addVertex(v2);
graph.addVertex(v3);
graph.addVertex(v4);
graph.addVertex(v5);
graph.addVertex(v6);
graph.addEdge(v, v2, new DeBruijnEdge(false, 1));
graph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
graph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
graph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
graph.addEdge(v5, v6, new DeBruijnEdge(false, 2));
expectedGraph.addVertex(v2);
expectedGraph.addVertex(v3);
expectedGraph.addVertex(v4);
expectedGraph.addVertex(v5);
expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
DeBruijnAssembler.pruneGraph(graph, 2);
Assert.assertTrue(graphEquals(graph, expectedGraph));
graph = new DeBruijnAssemblyGraph();
expectedGraph = new DeBruijnAssemblyGraph();
graph.addVertex(v);
graph.addVertex(v2);
graph.addVertex(v3);
graph.addVertex(v4);
graph.addVertex(v5);
graph.addVertex(v6);
graph.addEdge(v, v2, new DeBruijnEdge(true, 1));
graph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
graph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
graph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
expectedGraph.addVertex(v);
expectedGraph.addVertex(v2);
expectedGraph.addVertex(v3);
expectedGraph.addVertex(v4);
expectedGraph.addVertex(v5);
expectedGraph.addEdge(v, v2, new DeBruijnEdge(true, 1));
expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
DeBruijnAssembler.pruneGraph(graph, 2);
Assert.assertTrue(graphEquals(graph, expectedGraph));
}
private boolean graphEquals(DeBruijnAssemblyGraph g1, DeBruijnAssemblyGraph g2) {
if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) {
return false;
}
for( DeBruijnEdge e1 : g1.edgeSet() ) {
boolean found = false;
for( DeBruijnEdge e2 : g2.edgeSet() ) {
if( e1.equals(g1, e2, g2) ) { found = true; break; }
}
if( !found ) { return false; }
}
for( DeBruijnEdge e2 : g2.edgeSet() ) {
boolean found = false;
for( DeBruijnEdge e1 : g1.edgeSet() ) {
if( e2.equals(g2, e1, g1) ) { found = true; break; }
}
if( !found ) { return false; }
}
return true;
}
@Test(enabled = !DEBUG)
public void testReferenceCycleGraph() {
String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC";
String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC";
final DeBruijnAssemblyGraph g1 = DeBruijnAssembler.createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true), false);
final DeBruijnAssemblyGraph g2 = DeBruijnAssembler.createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true), false);
final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true), false);
final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true), false);
Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation.");
Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation.");
@ -313,7 +114,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
String theRef = preRefString + refString + Utils.dupString(indelString1, refIndel1) + refString + Utils.dupString(indelString2, refIndel2) + refString + postRefString;
String theRead = refString + Utils.dupString(indelString1, refIndel1 + indelOp1 * indelSize1) + refString + Utils.dupString(indelString2, refIndel2 + indelOp2 * indelSize2) + refString;
Cigar calculatedCigar = DeBruijnAssembler.leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0);
Cigar calculatedCigar = new DeBruijnAssembler().leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0);
Assert.assertEquals(AlignmentUtils.consolidateCigar(calculatedCigar).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar strings do not match!");
}
}

View File

@ -75,7 +75,7 @@ public class DeBruijnAssemblyGraphUnitTest {
}
public byte[] calculatedReferenceBytes() {
DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
DeBruijnGraph graph = new DeBruijnGraph();
graph.addSequenceToGraph(refSequence, KMER_LENGTH, true);
if( altSequence.length > 0 ) {
graph.addSequenceToGraph(altSequence, KMER_LENGTH, false);

View File

@ -0,0 +1,69 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.BaseTest;
import org.testng.annotations.Test;
import org.testng.Assert;
public class DeBruijnVertexUnitTest extends BaseTest {
@Test
public void testBasic() {
final byte[] bases = "ACT".getBytes();
final DeBruijnVertex v = new DeBruijnVertex(bases);
Assert.assertEquals(v.getSequence(), bases);
Assert.assertEquals(v.getSequenceString(), new String(bases));
Assert.assertEquals(v.length(), bases.length);
Assert.assertEquals(v.getSuffix().length, 1);
Assert.assertEquals(v.getSuffix()[0], (byte)'T');
Assert.assertEquals(v.getSuffixString(), "T");
Assert.assertEquals(v.getAdditionalSequence(true), bases);
Assert.assertEquals(v.getAdditionalSequence(false).length, 1);
Assert.assertEquals(v.getAdditionalSequence(false)[0], (byte)'T');
}
}

View File

@ -49,14 +49,13 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -79,58 +78,105 @@ public class KBestPathsUnitTest {
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "BasicBubbleDataProvider")
@Test(dataProvider = "BasicBubbleDataProvider", enabled = true)
public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) {
// Construct the assembly graph
DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
final int KMER_LENGTH = 3;
SeqGraph graph = new SeqGraph(3);
final String preRef = "ATGG";
final String postRef = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "GGGGC";
final String postRef = "GGGGC";
DeBruijnVertex v = new DeBruijnVertex(preRef.getBytes(), KMER_LENGTH);
DeBruijnVertex v2Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'A', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH);
DeBruijnVertex v2Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'A', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH);
DeBruijnVertex v3 = new DeBruijnVertex(postRef.getBytes(), KMER_LENGTH);
SeqVertex v = new SeqVertex(preRef);
SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength));
SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T");
SeqVertex v3 = new SeqVertex(postRef);
graph.addVertex(v);
graph.addVertex(v2Ref);
graph.addVertex(v2Alt);
graph.addVertex(v3);
graph.addEdge(v, v2Ref, new DeBruijnEdge(true, 10));
graph.addEdge(v2Ref, v3, new DeBruijnEdge(true, 10));
graph.addEdge(v, v2Alt, new DeBruijnEdge(false, 5));
graph.addEdge(v2Alt, v3, new DeBruijnEdge(false, 5));
graph.addEdge(v, v2Ref, new BaseEdge(true, 10));
graph.addEdge(v2Ref, v3, new BaseEdge(true, 10));
graph.addEdge(v, v2Alt, new BaseEdge(false, 5));
graph.addEdge(v2Alt, v3, new BaseEdge(false, 5));
graph.printGraph(new File("test.dot"), 10);
// Construct the test path
KBestPaths.Path path = new KBestPaths.Path(v, graph);
path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt));
path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3));
KBestPaths.Path<SeqVertex> path = new KBestPaths.Path<SeqVertex>(v, graph);
path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(v, v2Alt));
path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(v2Alt, v3));
// Construct the actual cigar string implied by the test path
Cigar expectedCigar = new Cigar();
expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M));
if( refBubbleLength > altBubbleLength ) {
expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M));
expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M));
} else if ( refBubbleLength < altBubbleLength ) {
expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M));
expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I));
} else {
expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
}
expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M));
expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M));
Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch");
}
// TODO -- test block substitution because it doesn't look like it's correct now
// @Test(dataProvider = "BasicBubbleDataProvider")
// public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) {
// // Construct the assembly graph
// final int KMER_LENGTH = 3;
// SeqGraph graph = new SeqGraph(KMER_LENGTH);
// final String preRef = "ATGG";
// final String postRef = "GGGGC";
//
// SeqVertex v = new SeqVertex(preRef);
// SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength));
// SeqVertex v2Alt = new SeqVertex(Utils.dupString('T', altBubbleLength));
// SeqVertex v3 = new SeqVertex(postRef);
//
// graph.addVertex(v);
// graph.addVertex(v2Ref);
// graph.addVertex(v2Alt);
// graph.addVertex(v3);
// graph.addEdge(v, v2Ref, new BaseEdge(true, 10));
// graph.addEdge(v2Ref, v3, new BaseEdge(true, 10));
// graph.addEdge(v, v2Alt, new BaseEdge(false, 5));
// graph.addEdge(v2Alt, v3, new BaseEdge(false, 5));
//
// graph.printGraph(new File("test.dot"), 10);
//
// // Construct the test path
// KBestPaths.Path<SeqVertex> path = new KBestPaths.Path<SeqVertex>(v, graph);
// path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(v, v2Alt));
// path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(v2Alt, v3));
//
// // Construct the actual cigar string implied by the test path
// Cigar expectedCigar = new Cigar();
// expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M));
// if( refBubbleLength > altBubbleLength ) {
// expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M));
// expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
// } else if ( refBubbleLength < altBubbleLength ) {
// expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I));
// expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
// } else {
// expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
// }
// expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M));
//
// Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch");
// }
@DataProvider(name = "TripleBubbleDataProvider")
public Object[][] makeTripleBubbleDataProvider() {
List<Object[]> tests = new ArrayList<Object[]>();
for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) {
for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) {
for ( final boolean offRefBeginning : Arrays.asList(false) ) {
for ( final boolean offRefEnding : Arrays.asList(true, false) ) {
for ( final boolean offRefEnding : Arrays.asList(true, false) ) {
for ( final boolean offRefBeginning : Arrays.asList(false) ) {
tests.add(new Object[]{refBubbleLength, altBubbleLength, offRefBeginning, offRefEnding});
}
}
@ -139,30 +185,29 @@ public class KBestPathsUnitTest {
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "TripleBubbleDataProvider")
@Test(dataProvider = "TripleBubbleDataProvider", enabled = true)
public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) {
// Construct the assembly graph
DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
final int KMER_LENGTH = 3;
SeqGraph graph = new SeqGraph();
final String preAltOption = "ATCGATCGATCGATCGATCG";
final String postAltOption = "CCCC";
final String preRef = "ATGG";
final String postRef = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "GGCCG";
final String midRef1 = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "TTCCT";
final String midRef2 = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "CCCAAAAAAAAAAAA";
final String postRef = "GGCCG";
final String midRef1 = "TTCCT";
final String midRef2 = "CCCAAAAAAAAAAAA";
DeBruijnVertex preV = new DeBruijnVertex(preAltOption.getBytes(), KMER_LENGTH);
DeBruijnVertex v = new DeBruijnVertex(preRef.getBytes(), KMER_LENGTH);
DeBruijnVertex v2Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'A', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH);
DeBruijnVertex v2Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'A', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH);
DeBruijnVertex v4Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'C', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH);
DeBruijnVertex v4Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'C', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH);
DeBruijnVertex v6Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'G', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH);
DeBruijnVertex v6Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'G', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH);
DeBruijnVertex v3 = new DeBruijnVertex(midRef1.getBytes(), KMER_LENGTH);
DeBruijnVertex v5 = new DeBruijnVertex(midRef2.getBytes(), KMER_LENGTH);
DeBruijnVertex v7 = new DeBruijnVertex(postRef.getBytes(), KMER_LENGTH);
DeBruijnVertex postV = new DeBruijnVertex(postAltOption.getBytes(), KMER_LENGTH);
SeqVertex preV = new SeqVertex(preAltOption);
SeqVertex v = new SeqVertex(preRef);
SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength));
SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T");
SeqVertex v4Ref = new SeqVertex(Utils.dupString('C', refBubbleLength));
SeqVertex v4Alt = new SeqVertex(Utils.dupString('C', altBubbleLength-1) + "T");
SeqVertex v6Ref = new SeqVertex(Utils.dupString('G', refBubbleLength));
SeqVertex v6Alt = new SeqVertex(Utils.dupString('G', altBubbleLength-1) + "T");
SeqVertex v3 = new SeqVertex(midRef1);
SeqVertex v5 = new SeqVertex(midRef2);
SeqVertex v7 = new SeqVertex(postRef);
SeqVertex postV = new SeqVertex(postAltOption);
graph.addVertex(preV);
graph.addVertex(v);
@ -176,34 +221,36 @@ public class KBestPathsUnitTest {
graph.addVertex(v6Alt);
graph.addVertex(v7);
graph.addVertex(postV);
graph.addEdge(preV, v, new DeBruijnEdge(false, 1));
graph.addEdge(v, v2Ref, new DeBruijnEdge(true, 10));
graph.addEdge(v2Ref, v3, new DeBruijnEdge(true, 10));
graph.addEdge(v, v2Alt, new DeBruijnEdge(false, 5));
graph.addEdge(v2Alt, v3, new DeBruijnEdge(false, 5));
graph.addEdge(v3, v4Ref, new DeBruijnEdge(true, 10));
graph.addEdge(v4Ref, v5, new DeBruijnEdge(true, 10));
graph.addEdge(v3, v4Alt, new DeBruijnEdge(false, 5));
graph.addEdge(v4Alt, v5, new DeBruijnEdge(false, 5));
graph.addEdge(v5, v6Ref, new DeBruijnEdge(true, 11));
graph.addEdge(v6Ref, v7, new DeBruijnEdge(true, 11));
graph.addEdge(v5, v6Alt, new DeBruijnEdge(false, 55));
graph.addEdge(v6Alt, v7, new DeBruijnEdge(false, 55));
graph.addEdge(v7, postV, new DeBruijnEdge(false, 1));
graph.addEdge(preV, v, new BaseEdge(false, 1));
graph.addEdge(v, v2Ref, new BaseEdge(true, 10));
graph.addEdge(v2Ref, v3, new BaseEdge(true, 10));
graph.addEdge(v, v2Alt, new BaseEdge(false, 5));
graph.addEdge(v2Alt, v3, new BaseEdge(false, 5));
graph.addEdge(v3, v4Ref, new BaseEdge(true, 10));
graph.addEdge(v4Ref, v5, new BaseEdge(true, 10));
graph.addEdge(v3, v4Alt, new BaseEdge(false, 5));
graph.addEdge(v4Alt, v5, new BaseEdge(false, 5));
graph.addEdge(v5, v6Ref, new BaseEdge(true, 11));
graph.addEdge(v6Ref, v7, new BaseEdge(true, 11));
graph.addEdge(v5, v6Alt, new BaseEdge(false, 55));
graph.addEdge(v6Alt, v7, new BaseEdge(false, 55));
graph.addEdge(v7, postV, new BaseEdge(false, 1));
graph.printGraph(new File("test.debruijn.dot"), 10);
// Construct the test path
KBestPaths.Path path = new KBestPaths.Path( (offRefBeginning ? preV : v), graph);
KBestPaths.Path<SeqVertex> path = new KBestPaths.Path<SeqVertex>( (offRefBeginning ? preV : v), graph);
if( offRefBeginning ) {
path = new KBestPaths.Path(path, graph.getEdge(preV, v));
path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(preV, v));
}
path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt));
path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3));
path = new KBestPaths.Path(path, graph.getEdge(v3, v4Ref));
path = new KBestPaths.Path(path, graph.getEdge(v4Ref, v5));
path = new KBestPaths.Path(path, graph.getEdge(v5, v6Alt));
path = new KBestPaths.Path(path, graph.getEdge(v6Alt, v7));
path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(v, v2Alt));
path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(v2Alt, v3));
path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(v3, v4Ref));
path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(v4Ref, v5));
path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(v5, v6Alt));
path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(v6Alt, v7));
if( offRefEnding ) {
path = new KBestPaths.Path(path, graph.getEdge(v7,postV));
path = new KBestPaths.Path<SeqVertex>(path, graph.getEdge(v7,postV));
}
// Construct the actual cigar string implied by the test path
@ -211,7 +258,7 @@ public class KBestPathsUnitTest {
if( offRefBeginning ) {
expectedCigar.add(new CigarElement(preAltOption.length(), CigarOperator.I));
}
expectedCigar.add(new CigarElement(preRef.length() - (KMER_LENGTH - 1), CigarOperator.M));
expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M));
// first bubble
if( refBubbleLength > altBubbleLength ) {
expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
@ -222,10 +269,10 @@ public class KBestPathsUnitTest {
} else {
expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
}
expectedCigar.add(new CigarElement(midRef1.length() - (KMER_LENGTH - 1), CigarOperator.M));
expectedCigar.add(new CigarElement(midRef1.length(), CigarOperator.M));
// second bubble is ref path
expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
expectedCigar.add(new CigarElement(midRef2.length() - (KMER_LENGTH - 1), CigarOperator.M));
expectedCigar.add(new CigarElement(midRef2.length(), CigarOperator.M));
// third bubble
if( refBubbleLength > altBubbleLength ) {
expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
@ -236,9 +283,9 @@ public class KBestPathsUnitTest {
} else {
expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
}
expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M));
expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M));
if( offRefEnding ) {
expectedCigar.add(new CigarElement(postAltOption.length() - (KMER_LENGTH - 1), CigarOperator.I));
expectedCigar.add(new CigarElement(postAltOption.length(), CigarOperator.I));
}
Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch");

View File

@ -55,6 +55,8 @@ public class KMerErrorCorrectorUnitTest extends BaseTest {
public void testMyData() {
final KMerErrorCorrector corrector = new KMerErrorCorrector(3, 1, 2, 2);
Assert.assertNotNull(corrector.toString());
corrector.addKmers(
"ATG", "ATG", "ATG", "ATG",
"ACC", "ACC", "ACC",
@ -66,13 +68,20 @@ public class KMerErrorCorrectorUnitTest extends BaseTest {
"NNC" // => ACC [because of min count won't go to NNA]
);
Assert.assertEquals(corrector.getErrorCorrectedKmer("ATG"), "ATG");
Assert.assertEquals(corrector.getErrorCorrectedKmer("ACC"), "ACC");
Assert.assertEquals(corrector.getErrorCorrectedKmer("AAA"), "AAA");
Assert.assertEquals(corrector.getErrorCorrectedKmer("CTG"), "ATG");
Assert.assertEquals(corrector.getErrorCorrectedKmer("NNA"), "AAA");
Assert.assertEquals(corrector.getErrorCorrectedKmer("CCC"), "ACC");
Assert.assertEquals(corrector.getErrorCorrectedKmer("NNN"), null);
Assert.assertEquals(corrector.getErrorCorrectedKmer("NNC"), "ACC");
testCorrection(corrector, "ATG", "ATG");
testCorrection(corrector, "ACC", "ACC");
testCorrection(corrector, "AAA", "AAA");
testCorrection(corrector, "CTG", "ATG");
testCorrection(corrector, "NNA", "AAA");
testCorrection(corrector, "CCC", "ACC");
testCorrection(corrector, "NNN", null);
testCorrection(corrector, "NNC", "ACC");
Assert.assertNotNull(corrector.toString());
}
private void testCorrection(final KMerErrorCorrector corrector, final String in, final String out) {
Assert.assertEquals(corrector.getErrorCorrectedKmer(in), out);
Assert.assertEquals(corrector.getErrorCorrectedKmer(in.getBytes()), out == null ? null : out.getBytes());
}
}

View File

@ -0,0 +1,106 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
public class SeqGraphUnitTest extends BaseTest {
private class MergeNodesWithNoVariationTestProvider extends TestDataProvider {
public byte[] sequence;
public int KMER_LENGTH;
public MergeNodesWithNoVariationTestProvider(String seq, int kmer) {
super(MergeNodesWithNoVariationTestProvider.class, String.format("Merge nodes with no variation test. kmer = %d, seq = %s", kmer, seq));
sequence = seq.getBytes();
KMER_LENGTH = kmer;
}
public SeqGraph calcGraph() {
final DeBruijnGraph deBruijnGraph = new DeBruijnGraph();
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for (int i = 0; i < kmersInSequence - 1; i++) {
// get the kmers
final byte[] kmer1 = new byte[KMER_LENGTH];
System.arraycopy(sequence, i, kmer1, 0, KMER_LENGTH);
final byte[] kmer2 = new byte[KMER_LENGTH];
System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH);
deBruijnGraph.addKmersToGraph(kmer1, kmer2, false, 1);
}
final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph();
seqGraph.mergeNodes();
return seqGraph;
}
}
@DataProvider(name = "MergeNodesWithNoVariationTestProvider")
public Object[][] makeMergeNodesWithNoVariationTests() {
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 3);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 4);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 5);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 6);
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 7);
new MergeNodesWithNoVariationTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6);
new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66);
new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76);
return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class);
}
@Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = true)
public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) {
logger.warn(String.format("Test: %s", cfg.toString()));
final SeqGraph actual = cfg.calcGraph();
Assert.assertEquals(actual.vertexSet().size(), 1);
final SeqVertex actualV = actual.vertexSet().iterator().next();
Assert.assertEquals(actualV.getSequence(), cfg.sequence);
}
}

View File

@ -0,0 +1,109 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class SeqVertexUnitTest extends BaseTest {
@Test
public void testBasic() {
final byte[] bases = "ACT".getBytes();
final SeqVertex v1 = new SeqVertex(bases);
final SeqVertex v2 = new SeqVertex(bases);
Assert.assertTrue(v1.getId() >= 0);
Assert.assertTrue(v2.getId() >= 0);
Assert.assertTrue(v2.getId() > v1.getId());
}
@Test
public void testEqualsAndHashCode() {
final byte[] bases = "ACT".getBytes();
final SeqVertex v1 = new SeqVertex(bases);
final SeqVertex v1_neq = new SeqVertex(bases);
final SeqVertex v1_eq = new SeqVertex(v1);
Assert.assertEquals(v1, v1);
Assert.assertEquals(v1.hashCode(), v1.hashCode());
Assert.assertEquals(v1, v1_eq);
Assert.assertEquals(v1.hashCode(), v1_eq.hashCode());
Assert.assertFalse(v1.equals(v1_neq));
Assert.assertFalse(v1_neq.equals(v1));
Assert.assertFalse(v1_neq.hashCode() == v1.hashCode());
}
@DataProvider(name = "WithoutSuffixData")
public Object[][] makeWithoutSuffixData() {
List<Object[]> tests = new ArrayList<Object[]>();
final String bases = "ACGTACGTACGT";
final int l = bases.length();
for ( int suffixLength = 0; suffixLength <= l; suffixLength++ ) {
final int suffixStart = l - suffixLength;
final String prefix = suffixLength == l ? null : bases.substring(0, suffixStart);
final String suffix = suffixStart == l ? "" : bases.substring(suffixStart, l);
tests.add(new Object[]{bases, suffix, prefix});
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "WithoutSuffixData")
public void testWithoutSuffix(final String bases, final String suffix, final String expected) {
final SeqVertex basesSV = new SeqVertex(bases);
if ( expected == null )
Assert.assertNull(basesSV.withoutSuffix(suffix.getBytes()), "Failed for bases " + bases + " with suffix " + suffix + " != " + expected);
else
Assert.assertEquals(basesSV.withoutSuffix(suffix.getBytes()).getSequenceString(), expected, "Failed for bases " + bases + " with suffix " + suffix + " != " + expected);
}
}

View File

@ -795,4 +795,17 @@ public class Utils {
while (md5String.length() < 32) md5String = "0" + md5String; // pad to length 32
return md5String;
}
/**
* Does big end with the exact sequence of bytes in suffix?
*
* @param big a non-null byte[] to test if it a prefix + suffix
* @param suffix a non-null byte[] to test if it's a suffix of big
* @return true if big is proper byte[] composed of some prefix + suffix
*/
public static boolean endsWith(final byte[] big, final byte[] suffix) {
if ( big == null ) throw new IllegalArgumentException("big cannot be null");
if ( suffix == null ) throw new IllegalArgumentException("suffix cannot be null");
return new String(big).endsWith(new String(suffix));
}
}