Merge pull request #559 from broadinstitute/vrr_assembly_graph_edge_info_revise

Improved the criteria for selecting the best haplotypes from the assembly graph.
This commit is contained in:
Ryan Poplin 2014-03-17 09:27:19 -04:00
commit c1b4390691
27 changed files with 802 additions and 173 deletions

View File

@ -213,16 +213,18 @@ public abstract class LocalAssemblyEngine {
final Map<SeqGraph,AssemblyResult> assemblyResultByGraph, final AssemblyResultSet assemblyResultSet) {
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<>();
returnHaplotypes.add( refHaplotype );
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final ArrayList<KBestHaplotypeFinder> finders = new ArrayList<>(graphs.size());
for( final SeqGraph graph : graphs ) {
final SeqVertex source = graph.getReferenceSourceVertex();
final SeqVertex sink = graph.getReferenceSinkVertex();
if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);
final KBestHaplotypeFinder haplotypeFinder = new KBestHaplotypeFinder(graph,source,sink);
finders.add(haplotypeFinder);
final Iterator<KBestHaplotype> bestHaplotypes = haplotypeFinder.iterator(numBestHaplotypesPerGraph);
while (bestHaplotypes.hasNext()) {
final KBestHaplotype kBestHaplotype = bestHaplotypes.next();
final Haplotype h = kBestHaplotype.haplotype();
@ -256,9 +258,19 @@ public abstract class LocalAssemblyEngine {
}
}
if ( returnHaplotypes.size() < returnHaplotypes.size() )
logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
// Make sure that the ref haplotype is amongst the return haplotypes and calculate its score as
// the first returned by any finder.
if (!returnHaplotypes.contains(refHaplotype)) {
double refScore = Double.NaN;
for (final KBestHaplotypeFinder finder : finders) {
final double candidate = finder.score(refHaplotype);
if (Double.isNaN(candidate)) continue;
refScore = candidate;
break;
}
refHaplotype.setScore(refScore);
returnHaplotypes.add(refHaplotype);
}
if( debug ) {
if( returnHaplotypes.size() > 1 ) {

View File

@ -45,21 +45,21 @@
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import java.util.ArrayList;
import java.util.Collection;
import java.util.PriorityQueue;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.*;
/**
* K-best sub-haplotype finder that selects the best solutions out of a collection of sub-haplotype finders.
*
* @author Valentin Ruano-Rubio &lt;valentin@broadinstitute.org&gt;
*/
class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
class AggregatedSubHaplotypeFinder<F extends KBestSubHaplotypeFinder> implements KBestSubHaplotypeFinder {
/**
* Collection of subFinders that provided the actual solutions.
*/
private final Collection<? extends KBestSubHaplotypeFinder> subFinders;
protected final Collection<F> subFinders;
/**
* Flag indicating whether the sub-finders have been processed or not.
@ -89,17 +89,53 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
* Creates a new aggregated sub-haplotype finder given its sub-finders.
* @param finders set of sub-finders.
*/
public AggregatedSubHaplotypeFinder(final Collection<? extends KBestSubHaplotypeFinder> finders) {
public AggregatedSubHaplotypeFinder(final Collection<F> finders) {
if (finders == null) throw new IllegalArgumentException("finder collection cannot be null");
this.subFinders = finders;
}
@Override
public String id() {
final StringBuilder resultBuilder = new StringBuilder();
for (final KBestSubHaplotypeFinder subFinder : subFinders)
resultBuilder.append(subFinder.id());
return resultBuilder.toString();
}
@Override
public String label() {
return "&lt;OR&gt;";
}
@Override
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
final int subFinderCount = subFinders.size();
final String edgeCost = String.format("%.2f",-Math.log10((double) subFinderCount));
final Set<Pair<? extends KBestSubHaplotypeFinder,String>> result = new LinkedHashSet<>(subFinderCount);
for (final KBestSubHaplotypeFinder subFinder : subFinders)
result.add(new Pair<>(subFinder,edgeCost));
return result;
}
@Override
public int getCount() {
processSubFindersIfNeeded();
return count;
}
@Override
public double score(final byte[] bases, final int offset, final int length) {
    // Argument validation per the KBestSubHaplotypeFinder contract.
    if (bases == null) {
        throw new IllegalArgumentException("bases cannot be null");
    }
    if (offset < 0) {
        throw new IllegalArgumentException("the offset cannot be negative");
    }
    if (length < 0) {
        throw new IllegalArgumentException("the length cannot be negative");
    }
    if (offset + length > bases.length) {
        throw new IllegalArgumentException("the offset and length go beyond the array size");
    }
    // Delegate to each sub-finder in turn; the first one that recognizes the
    // query sequence provides the score. NaN means no sub-finder matched.
    double result = Double.NaN;
    for (final KBestSubHaplotypeFinder delegate : subFinders) {
        final double candidate = delegate.score(bases, offset, length);
        if (!Double.isNaN(candidate)) {
            result = candidate;
            break;
        }
    }
    return result;
}
private void processSubFindersIfNeeded() {
if (processedSubFinders) return;
@ -144,6 +180,11 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
return rankedSubHaplotype.get(k);
}
@Override
public boolean isReference() {
return false;
}
/**
* Custom implementation of {@link KBestHaplotype} to encapsulate sub-finder results.
*/
@ -167,7 +208,7 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
}
@Override
public int score() {
public double score() {
return result.score();
}

View File

@ -52,6 +52,7 @@ import com.google.java.contract.Requires;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.jgrapht.EdgeFactory;
import org.jgrapht.alg.CycleDetector;
import org.jgrapht.graph.DefaultDirectedGraph;
import java.io.File;
@ -146,6 +147,39 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
return set;
}
/**
 * Convert this kmer graph to a simple sequence graph.
 *
 * Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer
 * graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence.
 *
 * @return a newly allocated SequenceGraph
 */
public SeqGraph convertToSequenceGraph() {
    final SeqGraph result = new SeqGraph(kmerSize);
    final Map<V, SeqVertex> kmerToSeqVertex = new HashMap<>();

    // First pass: mirror every kmer vertex as a sequence vertex, keeping the mapping
    // so edges can be reconnected afterwards. Source vertices carry their full sequence.
    for ( final V kmerVertex : vertexSet() ) {
        final SeqVertex seqVertex = new SeqVertex(kmerVertex.getAdditionalSequence(isSource(kmerVertex)));
        seqVertex.setAdditionalInfo(kmerVertex.additionalInfo());
        kmerToSeqVertex.put(kmerVertex, seqVertex);
        result.addVertex(seqVertex);
    }

    // Second pass: replicate the edge structure between the mirrored vertices,
    // preserving reference membership and multiplicity.
    for ( final E edge : edgeSet() ) {
        final SeqVertex from = kmerToSeqVertex.get(getEdgeSource(edge));
        final SeqVertex to = kmerToSeqVertex.get(getEdgeTarget(edge));
        result.addEdge(from, to, new BaseEdge(edge.isRef(), edge.getMultiplicity()));
    }

    return result;
}
/**
* Pull out the additional sequence implied by traversing this node in the graph
* @param v the vertex from which to pull out the additional base sequence
@ -712,4 +746,13 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
if (!containsVertex(vertex)) return false;
return true;
}
/**
 * Checks for the presence of directed cycles in the graph.
 *
 * @return {@code true} if the graph has cycles, {@code false} otherwise.
 */
public boolean hasCycles() {
    final CycleDetector<V, E> detector = new CycleDetector<>(this);
    return detector.detectCycles();
}
}

View File

@ -122,7 +122,7 @@ public class CommonSuffixSplitter {
} else {
incomingTarget = prefixV;
graph.addVertex(prefixV);
graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 0));
graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 1));
edgesToRemove.add(out);
}

View File

@ -45,6 +45,11 @@
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.Collections;
import java.util.Set;
/**
* Represents a trivial k-best sub haplotype finder with no solutions.
*
@ -65,6 +70,21 @@ final class DeadEndKBestSubHaplotypeFinder implements KBestSubHaplotypeFinder {
protected DeadEndKBestSubHaplotypeFinder() {
}
@Override
public String id() {
return "<DEAD>";
}
@Override
public String label() {
return "&lt;DEAD&gt;";
}
@Override
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
return Collections.emptySet();
}
@Override
public int getCount() {
return 0;
@ -77,4 +97,18 @@ final class DeadEndKBestSubHaplotypeFinder implements KBestSubHaplotypeFinder {
else
throw new IllegalArgumentException("k cannot be equal or greater to the haplotype count");
}
@Override
public boolean isReference() {
return false;
}
/**
 * Always reports no score: a dead-end finder reaches no sink, so no haplotype
 * sequence is accessible through it.
 */
@Override
public double score(final byte[] bases, final int offset, final int length) {
    // Validate arguments per the KBestSubHaplotypeFinder contract even though
    // the result is unconditionally "no score".
    if (bases == null) throw new IllegalArgumentException("bases cannot be null");
    if (offset < 0) throw new IllegalArgumentException("the offset cannot be negative");
    if (length < 0) throw new IllegalArgumentException("the length cannot be negative");
    if (offset + length > bases.length) throw new IllegalArgumentException("the offset and length go beyond the array size");
    return Double.NaN;
}
}

View File

@ -45,6 +45,12 @@
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.Collections;
import java.util.Set;
/**
* Trivial k-best sub-haplotype finder where the source and sink vertex are the same one.
*
@ -67,6 +73,21 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder {
singleHaplotypePath = new MyBestHaplotypePath(graph,vertex);
}
@Override
public String id() {
return "v" + singleHaplotypePath.head().getId();
}
@Override
public String label() {
return singleHaplotypePath.head().getSequenceString();
}
@Override
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
return Collections.emptySet();
}
@Override
public int getCount() {
return 1;
@ -81,6 +102,24 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder {
return singleHaplotypePath;
}
@Override
public boolean isReference() {
return singleHaplotypePath.isReference();
}
/**
 * Scores a query sequence against this single-vertex path: a match scores 0
 * (log10 scale), any other query has no score ({@link Double#NaN}).
 */
@Override
public double score(final byte[] bases, final int offset, final int length) {
    // Validate the query range per the KBestSubHaplotypeFinder contract.
    if (bases == null) throw new IllegalArgumentException("bases cannot be null");
    if (offset < 0) throw new IllegalArgumentException("the offset cannot be negative");
    if (length < 0) throw new IllegalArgumentException("the length cannot be negative");
    if (offset + length > bases.length) throw new IllegalArgumentException("the offset and length go beyond the array size");
    final byte[] vertexBases = singleHaplotypePath.head().getSequence();
    // The query can only match if it is exactly the vertex's own sequence.
    if (length != vertexBases.length)
        return Double.NaN;
    else
        return Utils.equalRange(bases, offset, vertexBases, 0, length)? 0 : Double.NaN;
}
/**
* Custom extension of {@link KBestHaplotype} that implements the single solution behaviour.
*/
@ -120,7 +159,7 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder {
}
@Override
public int score() {
public double score() {
return 0;
}

View File

@ -68,7 +68,7 @@ public abstract class KBestHaplotype implements Comparable<KBestHaplotype> {
*
* @return 0 or greater.
*/
public abstract int score();
public abstract double score();
/**
* Indicates whether this result is the reference haplotype.
@ -122,6 +122,8 @@ public abstract class KBestHaplotype implements Comparable<KBestHaplotype> {
public Haplotype haplotype() {
if (haplotype != null) return haplotype;
haplotype = new Haplotype(bases(),isReference());
if (score() > 0)
throw new IllegalStateException("score cannot be greater than 0: " + score());
haplotype.setScore(score());
return haplotype;
}
@ -152,7 +154,35 @@ public abstract class KBestHaplotype implements Comparable<KBestHaplotype> {
*/
public int compareTo(final KBestHaplotype other) {
if (other == null) throw new IllegalArgumentException("the other object cannot be null");
return - 1 * (score() - other.score());
return - Double.compare(score(), other.score());
}
@Override
public int hashCode() {
return haplotype().hashCode();
}
@Override
public boolean equals(final Object other) {
return other == null ? false: (other instanceof KBestHaplotype ? equals((KBestHaplotype)other) : false);
}
@Override
public String toString() {
return haplotype().toString() + " Score = " + score();
}
/**
* Checks whether both solutions are equal.
* <p>
* Both solutions are considered equal when the underlying haplotypes are equal. The path on the respective
graph might differ though.
* </p>
*
* @return {@code true} iff both haplotypes are the same (considering the ref state).
*/
protected boolean equals(final KBestHaplotype other) {
return haplotype().equals(other.haplotype(),false);
}
/**

View File

@ -45,8 +45,13 @@
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.jgrapht.alg.CycleDetector;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.*;
/**
@ -233,7 +238,7 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
}
@Override
public KBestHaplotype get(int index) {
public KBestHaplotype get(final int index) {
if (index < 0 || index >= size())
throw new IndexOutOfBoundsException();
return topFinder.getKBest(index);
@ -305,28 +310,28 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
/**
* Creates a finder from a vertex.
*
* @param source the source vertex for the finder.
* @param vertex the source vertex for the finder.
*
* @return never {@code null}, perhaps a finder that returns no haplotypes though.
*/
protected KBestSubHaplotypeFinder createVertexFinder(final SeqVertex source) {
KBestSubHaplotypeFinder node = finderByVertex.get(source);
if (node == null) {
if (sinks.contains(source))
node = new EmptyPathHaplotypeFinderNode(graph,source);
protected KBestSubHaplotypeFinder createVertexFinder(final SeqVertex vertex) {
KBestSubHaplotypeFinder finder = finderByVertex.get(vertex);
if (finder == null) {
if (sinks.contains(vertex))
finder = new EmptyPathHaplotypeFinderNode(graph,vertex);
else {
final Set<BaseEdge> outgoingEdges = graph.outgoingEdgesOf(source);
final Set<BaseEdge> outgoingEdges = graph.outgoingEdgesOf(vertex);
if (outgoingEdges.isEmpty())
node = DeadEndKBestSubHaplotypeFinder.INSTANCE;
finder = DeadEndKBestSubHaplotypeFinder.INSTANCE;
else {
final Map<BaseEdge,KBestSubHaplotypeFinder> undeadChildren = createChildrenFinders(outgoingEdges);
node = undeadChildren.isEmpty() ? DeadEndKBestSubHaplotypeFinder.INSTANCE :
new RecursiveSubHaplotypeFinder(source,undeadChildren);
finder = undeadChildren.isEmpty() ? DeadEndKBestSubHaplotypeFinder.INSTANCE :
new RecursiveSubHaplotypeFinder(graph,vertex,undeadChildren);
}
}
finderByVertex.put(source, node);
finderByVertex.put(vertex, finder);
}
return node;
return finder;
}
/**
@ -340,7 +345,7 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
* @return never {@code null}, perhaps an empty map if there is no children with valid paths to any sink for this
* finder.
*/
private Map<BaseEdge, KBestSubHaplotypeFinder> createChildrenFinders(Set<BaseEdge> baseEdges) {
private Map<BaseEdge, KBestSubHaplotypeFinder> createChildrenFinders(final Set<BaseEdge> baseEdges) {
final Map<BaseEdge,KBestSubHaplotypeFinder> result = new LinkedHashMap<>(baseEdges.size());
for (final BaseEdge edge : baseEdges) {
final KBestSubHaplotypeFinder targetFinder = createVertexFinder(graph.getEdgeTarget(edge));
@ -349,4 +354,156 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
}
return result;
}
/**
 * Print a DOT representation of search graph.
 *
 * @param out character stream printer where to print the DOT representation to.
 *
 * @throws IllegalArgumentException if {@code out} is {@code null}.
 */
public void printDOT(final PrintWriter out) {
    if (out == null)
        throw new IllegalArgumentException("the out writer cannot be null");
    out.println("digraph {");
    out.println(" rankdir = LR");
    out.println(" node [shape=box, margin=0.01]");
    out.println(" subgraph cluster_dummy { style = invis; x [label=\"\",shape=none,margin=0] }");
    // Reference-path finders are collected into their own cluster so they stand
    // out visually; all other finders are written directly to the output.
    final StringBuilder refCluster = new StringBuilder(1000);
    refCluster.append(" subgraph cluster_ref {\n");
    refCluster.append(" node [penwidth=2]\n");
    for (final KBestSubHaplotypeFinder vertexFinder : finderByVertex.values()) {
        final String nodeLine = String.format(" %s [label=<%s>]", vertexFinder.id(), vertexFinder.label());
        if (vertexFinder.isReference())
            refCluster.append(" ").append(nodeLine).append('\n');
        else
            out.println(nodeLine);
    }
    refCluster.append(" }");
    out.println(refCluster.toString());
    // Emit one labeled edge per finder -> sub-finder relationship.
    for (final KBestSubHaplotypeFinder vertexFinder : finderByVertex.values()) {
        for (final Pair<? extends KBestSubHaplotypeFinder, String> labeledSub : vertexFinder.subFinderLabels()) {
            out.println(String.format(" %s -> %s [label=%s]", vertexFinder.id(), labeledSub.getFirst().id(), labeledSub.getSecond()));
        }
    }
    out.println("}");
}
/**
 * Print a DOT representation of search graph.
 *
 * @param file file where to print the DOT representation to.
 *
 * @throws IllegalArgumentException if {@code file} is {@code null}.
 * @throws FileNotFoundException if {@code file} cannot be created or written.
 * @throws IllegalStateException if there was some trouble when writing the DOT representation.
 */
public void printDOT(final File file) throws FileNotFoundException {
    if (file == null)
        throw new IllegalArgumentException("the output file cannot be null");
    // try-with-resources guarantees the writer is closed on every path; the
    // previous version leaked the stream when checkError() triggered the
    // IllegalStateException before reaching close().
    try (final PrintWriter out = new PrintWriter(file)) {
        printDOT(out);
        if (out.checkError())
            throw new IllegalStateException("error occurred while writing k-best haplotype search graph into file '"
                    + file.getAbsolutePath() + "'");
    }
}
/**
 * Print a DOT representation of search graph.
 *
 * @param fileName name of the file where to print the DOT representation to.
 *
 * @throws IllegalArgumentException if {@code fileName} is {@code null}.
 * @throws FileNotFoundException if the file named {@code fileName} cannot be created or written.
 * @throws IllegalStateException if there was some trouble when writing the DOT representation.
 */
@SuppressWarnings("unused") // Available for debugging purposes.
public void printDOTFile(final String fileName) throws FileNotFoundException {
    printDOT(new File(fileName));
}
/**
 * Get the score of a given sequence of bases.
 *
 * @param bases the base sequence.
 *
 * @throws IllegalArgumentException if {@code bases} is {@code null}.
 *
 * @return {@link Double#NaN} if there is no score for the sequence, i.e. there is no such a haplotype accessible
 *   through this finder.
 */
public double score(final byte[] bases) {
    // Validate here so callers get a clear IllegalArgumentException (matching the
    // other score(...) overloads' convention) instead of an NPE from bases.length.
    if (bases == null)
        throw new IllegalArgumentException("bases cannot be null");
    return topFinder.score(bases,0,bases.length);
}
/**
 * Get the score of a given haplotype.
 *
 * @param haplotype the haplotype.
 *
 * @return {@link Double#NaN} if there is no score for the sequence, i.e. there is no such a haplotype accessible
 *   through this finder.
 */
public double score(final Haplotype haplotype) {
    // Delegates to the byte[]-based overload on the haplotype's base sequence.
    return score(haplotype.getBases());
}
/**
 * Returns a unique list of haplotypes solutions.
 * <p>
 * The result will not contain more than one haplotype with the same base sequence. The solution of the best
 * score is returned.
 * </p>
 * <p>
 * This makes sense when there are more than one possible path through the graph to create the same haplotype.
 * </p>
 * <p>
 * The resulting list is sorted by the score with more likely haplotype search results first.
 * </p>
 *
 * @param maxSize maximum number of unique results to return.
 *
 * @throws IllegalArgumentException if {@code maxSize} is negative.
 *
 * @return never {@code null}, perhaps an empty list.
 */
public List<KBestHaplotype> unique(final int maxSize) {
    if (maxSize < 0) throw new IllegalArgumentException("maxSize cannot be negative");
    final int requiredCapacity = Math.min(maxSize,size());
    final Set<Haplotype> haplotypes = new HashSet<>(requiredCapacity);
    final List<KBestHaplotype> result = new ArrayList<>(requiredCapacity);
    // Iteration order is best-score-first, so the first occurrence of each distinct
    // haplotype sequence is its best-scoring solution.
    for (final KBestHaplotype kbh : this) {
        // BUGFIX: the previous version tracked the count in a local that was never
        // incremented, so the maxSize cap was never enforced (and for maxSize == 0
        // one element was added before breaking). Check the cap before adding.
        if (result.size() >= maxSize) break;
        if (haplotypes.add(kbh.haplotype())) {
            result.add(kbh);
        }
    }
    return result;
}
/**
 * Returns a unique list of haplotypes solutions.
 *
 * <p>
 * The result will not contain more than one haplotype with the same base sequence. The solution of the best
 * score is returned.
 * </p>
 * <p>
 * This makes sense when there are more than one possible path through the graph to create the same haplotype.
 * </p>
 * <p>
 * The resulting list is sorted by the score with more likely haplotype search results first.
 * </p>
 *
 * @return never {@code null}, perhaps an empty list.
 */
public List<KBestHaplotype> unique() {
    // Equivalent to unique(maxSize) with no effective cap on the result size.
    return unique(size());
}
}

View File

@ -45,6 +45,10 @@
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.Set;
/**
* Common interface for K-Best sub-haplotype finders.
*
@ -52,6 +56,29 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
*/
interface KBestSubHaplotypeFinder {
/**
* Return an unique id for this sub-haplotype finder to be used when outputting diagrams.
*
* @return never {@code null}.
*/
public String id();
/**
* Returns a label with human readable representation of this finder.
*
* <p>This is used when generating a diagram to illustrate the search space and costs</p>
*
* @return never {@code null}.
*/
public String label();
/**
* Returns the set of subfinder from this finder together with a label for the connection with the current finder.
*
* <p>The label is used when generating a diagram to illustrate the search space and costs</p>
*/
public Set<Pair<? extends KBestSubHaplotypeFinder,String>> subFinderLabels();
/**
* Returns the total number of possible sub-haplotypes.
* @return 0 or greater.
@ -67,5 +94,22 @@ interface KBestSubHaplotypeFinder {
*
* @return never {@code null}.
*/
public abstract KBestHaplotype getKBest(int k);
public KBestHaplotype getKBest(int k);
/**
* Checks whether the top vertex for this finder is a reference haplotype vertex.
*
* @return {@code true} iff the top vertex for this finder is a reference vertex.
*/
public boolean isReference();
/**
* Calculate the score of a sequence of bases.
*
* @param bases array containing the query base sequence.
* @param offset first position of the query base sequence in {@code bases} .
* @param length length of the query base sequence.
* @return {@link Double#NaN} if there is no score for this sequence, otherwise a valid score value.
*/
public double score(byte[] bases, int offset, int length);
}

View File

@ -49,20 +49,24 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import java.util.PriorityQueue;
/**
* edge class for connecting nodes in the graph that tracks some per-sample information
*
* Edge class for connecting nodes in the graph that tracks some per-sample information.
* <p>
* This class extends BaseEdge with the additional functionality of tracking the maximum
* multiplicity seen within any single sample. The workflow for using this class is:
*
* MultiSampleEdge e = new MultiSampleEdge(ref, 1)
* e.incMultiplicity(1) // total is 2, per sample is 2, max per sample is 1
* e.getPruningMultiplicity() // = 1
* e.flushSingleSampleMultiplicity() // total is 2, per sample is 0, max per sample is 2
* e.getPruningMultiplicity() // = 2
* e.incMultiplicity(3) // total is 5, per sample is 3, max per sample is 2
* e.getPruningMultiplicity() // = 2
* e.flushSingleSampleMultiplicity() // total is 5, per sample is 0, max per sample is 3
* e.getPruningMultiplicity() // = 3
* </p>
* <pre>
* {@code
* MultiSampleEdge e = new MultiSampleEdge(ref, 1)
* e.incMultiplicity(1) // total is 2, per sample is 2, max per sample is 1
* e.getPruningMultiplicity() // = 1
* e.flushSingleSampleMultiplicity() // total is 2, per sample is 0, max per sample is 2
* e.getPruningMultiplicity() // = 2
* e.incMultiplicity(3) // total is 5, per sample is 3, max per sample is 2
* e.getPruningMultiplicity() // = 2
* e.flushSingleSampleMultiplicity() // total is 5, per sample is 0, max per sample is 3
* e.getPruningMultiplicity() // = 3
* }
* </pre>
*/
public class MultiSampleEdge extends BaseEdge {
private int currentSingleSampleMultiplicity;

View File

@ -45,9 +45,10 @@
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.*;
/**
* General recursive sub-haplotype finder.
@ -67,7 +68,11 @@ import java.util.Map;
*
* @author Valentin Ruano-Rubio &lt;valentin@broadinstitute.org&gt;
*/
class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder<RecursiveSubHaplotypeFinder.EdgeSubHaplotypeFinder> {
private final SeqVertex vertex;
private final boolean isReference;
/**
* Creates a recursive sub-haplotype finder give the target graph, first vertex and all possible outgoing edges
@ -80,20 +85,83 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
* @param vertex first vertex for all sub-haplotype solutions provided by this finder
* @param children map from outgoing edge to the corresponding sub-sub-haplotype finder.
*/
public RecursiveSubHaplotypeFinder(final SeqVertex vertex,
public RecursiveSubHaplotypeFinder(final SeqGraph graph, final SeqVertex vertex,
final Map<BaseEdge, KBestSubHaplotypeFinder> children) {
super(createChildFinderCollection(vertex, children));
this.vertex = vertex;
this.isReference = graph.isReferenceNode(vertex);
}
private static Collection<EdgeSubHaplotypeFinder> createChildFinderCollection(final SeqVertex vertex, final Map<BaseEdge,KBestSubHaplotypeFinder> finders) {
/**
* Wraps the descendant vertices finders in order to take advantage of the {@link AggregatedSubHaplotypeFinder}
* common code.
* <p>
* Automatically calibrates the edge score (cost) so that it takes into account the total across all edges.
* </p>
*
* @param vertex the parent vertex.
* @param finders the child vertices indexed by the connecting edge.
* @return never {@code null} but potentially an empty collection if there is child returning some sub-haplotype
* solution.
*/
private static Collection<EdgeSubHaplotypeFinder> createChildFinderCollection(final SeqVertex vertex,
final Map<BaseEdge,KBestSubHaplotypeFinder> finders) {
if (finders == null) throw new IllegalArgumentException("the edge to child map cannot be null");
final Collection<EdgeSubHaplotypeFinder> result = new ArrayList<>(finders.size());
for (final Map.Entry<BaseEdge,KBestSubHaplotypeFinder> e : finders.entrySet())
result.add(new EdgeSubHaplotypeFinder(vertex,e.getKey(), e.getValue()));
final ArrayList<EdgeSubHaplotypeFinder> result = new ArrayList<>(finders.size());
for (final Map.Entry<BaseEdge,KBestSubHaplotypeFinder> e : finders.entrySet()) {
final EdgeSubHaplotypeFinder subFinder = new EdgeSubHaplotypeFinder(vertex,e.getKey(), e.getValue());
if (subFinder.getCount() == 0) continue;
result.add(subFinder);
}
if (result.size() == 0)
return Collections.emptySet();
else if (result.size() == 1) // no calibration needed, by default edgeScore is 0.
return Collections.singleton(result.get(0));
else {
double totalEdgeMultiplicityAcrossEdges = 0;
for (final EdgeSubHaplotypeFinder finder : result)
totalEdgeMultiplicityAcrossEdges += Math.max(0.5,finder.edge.getMultiplicity());
final double log10TotalEdgeMultiplicityAcrossEdges = Math.log10(totalEdgeMultiplicityAcrossEdges);
for (final EdgeSubHaplotypeFinder finder : result)
finder.calibrateEdgeScore(log10TotalEdgeMultiplicityAcrossEdges);
return result;
}
}
@Override
public boolean isReference() {
return isReference;
}
@Override
public String label() {
return vertex.getSequenceString();
}
@Override
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
final Set<Pair<? extends KBestSubHaplotypeFinder,String>> result = new LinkedHashSet<>(subFinders.size());
for (final EdgeSubHaplotypeFinder subFinder : subFinders)
result.add(new Pair<>(subFinder,simplifyZeros(String.format("%.4f", subFinder.edgeScore))));
return result;
}
private static class EdgeSubHaplotypeFinder implements KBestSubHaplotypeFinder {
/**
* Removes zeros decimal positions from edge-labels.
*
* @param edgeLabel the original label to reformat.
* @return never {@code null}, the reformatted label.
*/
private String simplifyZeros(final String edgeLabel) {
if (edgeLabel.equals("0.000") || edgeLabel.equals("-0.000") )
return "0.";
int i = edgeLabel.length() - 1;
while (edgeLabel.charAt(i) == '0')
i--;
return (i == edgeLabel.length() - 1) ? edgeLabel : edgeLabel.substring(0,i);
}
protected static class EdgeSubHaplotypeFinder implements KBestSubHaplotypeFinder {
private final KBestSubHaplotypeFinder childFinder;
@ -101,10 +169,32 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
private final BaseEdge edge;
private double edgeScore = 0;
private EdgeSubHaplotypeFinder(final SeqVertex vertex, final BaseEdge edge, final KBestSubHaplotypeFinder childFinder) {
this.childFinder = childFinder;
this.edge = edge;
this.vertex = vertex;
this.edgeScore = 0;
}
private void calibrateEdgeScore(final double log10TotalMultiplicityAcrossOutgoingEdges) {
edgeScore = Math.log10(Math.max(edge.getMultiplicity(),0.5)) - log10TotalMultiplicityAcrossOutgoingEdges;
}
@Override
public String id() {
return childFinder.id();
}
@Override
public String label() {
return childFinder.label();
}
@Override
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
return childFinder.subFinderLabels();
}
@Override
@ -114,8 +204,31 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
@Override
public KBestHaplotype getKBest(int k) {
return new ChildKBestSubHaplotype(vertex,edge,childFinder.getKBest(k));
return new ChildKBestSubHaplotype(vertex,edge,childFinder.getKBest(k),edgeScore);
}
@Override
public boolean isReference() {
return childFinder.isReference();
}
@Override
public double score(final byte[] bases, final int offset, final int length) {
if (length == 0)
return 0;
final byte[] vertexSequence = vertex.getSequence();
if (length < vertexSequence.length) // query is not long enough to have any score.
return Double.NaN;
else if (!Utils.equalRange(vertexSequence,0,bases,offset,vertexSequence.length))
return Double.NaN;
else
return edgeScore + childFinder.score(bases,offset + vertexSequence.length,length - vertexSequence.length);
}
}
@Override
public String id() {
return "v" + vertex.getId();
}
/**
@ -129,13 +242,14 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
*/
private static class ChildKBestSubHaplotype extends KBestHaplotype {
private final int score;
private final double score;
private final KBestHaplotype child;
private final SeqVertex vertex;
private final boolean isReference;
public ChildKBestSubHaplotype(final SeqVertex vertex, final BaseEdge edge, final KBestHaplotype child) {
this.score = edge.getMultiplicity() + child.score();
public ChildKBestSubHaplotype(final SeqVertex vertex, final BaseEdge edge, final KBestHaplotype child, final double edgeScore) {
this.score = edgeScore + child.score();
this.vertex = vertex;
this.child = child;
this.isReference = edge.isRef() && child.isReference();
@ -147,7 +261,7 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
}
@Override
public int score() {
public double score() {
return score;
}

View File

@ -291,16 +291,9 @@ public class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
final SeqVertex addedVertex = mergeLinearChainVertices(linearChain);
addVertex(addedVertex);
final Set<BaseEdge> inEdges = incomingEdgesOf(first);
final Set<BaseEdge> outEdges = outgoingEdgesOf(last);
final int nEdges = inEdges.size() + outEdges.size();
int sharedWeightAmongEdges = nEdges == 0 ? 0 : sumEdgeWeightAlongChain(linearChain) / nEdges;
final BaseEdge inc = new BaseEdge(false, sharedWeightAmongEdges); // template to make .add function call easy
// update the incoming and outgoing edges to point to the new vertex
for( final BaseEdge edge : outEdges ) { addEdge(addedVertex, getEdgeTarget(edge), edge.copy().add(inc)); }
for( final BaseEdge edge : inEdges ) { addEdge(getEdgeSource(edge), addedVertex, edge.copy().add(inc)); }
for( final BaseEdge edge : outgoingEdgesOf(last) ) { addEdge(addedVertex, getEdgeTarget(edge), edge.copy()); }
for( final BaseEdge edge : incomingEdgesOf(first) ) { addEdge(getEdgeSource(edge), addedVertex, edge.copy()); }
removeAllVertices(linearChain);
return true;
@ -313,29 +306,6 @@ public class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
return new SeqVertex( seqsCat );
}
/**
* Get the sum of the edge weights on a linear chain of at least 2 elements
*
* @param chain a linear chain of vertices with at least 2 vertices
* @return the sum of the multiplicities along all edges connecting vertices within the chain
*/
@Requires({"chain != null", "chain.size() >= 2"})
private int sumEdgeWeightAlongChain(final LinkedList<SeqVertex> chain) {
int sum = 0;
SeqVertex prev = null;
for ( final SeqVertex v : chain ) {
if ( prev != null ) {
final BaseEdge e = getEdge(prev, v);
if ( e == null ) throw new IllegalStateException("Something wrong with the linear chain, got a null edge between " + prev + " and " + v);
sum += e.getMultiplicity();
}
prev = v;
}
return sum;
}
/**
* Base class for transformation operations that need to iterate over proposed vertices, where
* each proposed vertex is a seed vertex for a potential transformation.

View File

@ -247,12 +247,12 @@ public class SharedVertexSequenceSplitter {
if ( needPrefixNode ) {
outer.addVertex(prefixV);
if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 0));
if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 1));
}
if ( needSuffixNode ) {
outer.addVertex(suffixV);
if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 0));
if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 1));
}
if ( topForConnect != null ) {

View File

@ -52,7 +52,6 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.jgrapht.alg.CycleDetector;
import java.io.File;
import java.util.*;
@ -88,8 +87,7 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
/**
*
*/
final boolean debugGraphTransformations;
private final boolean debugGraphTransformations;
final byte minBaseQualityToUseInAssembly;
protected boolean increaseCountsBackwards = true;
@ -319,13 +317,6 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
removeAllVertices(verticesToRemove);
}
/**
* @return true if the graph has cycles, false otherwise
*/
public boolean hasCycles() {
return new CycleDetector<>(this).detectCycles();
}
/**
* Does the graph not have enough complexity? We define low complexity as a situation where the number
* of non-unique kmers is more than 20% of the total number of kmers.
@ -419,39 +410,10 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
return counter.getKmersWithCountsAtLeast(2);
}
/**
* Convert this kmer graph to a simple sequence graph.
*
* Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer
* graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence
*
* @return a newly allocated SequenceGraph
*/
// TODO -- should override base class method
@Override
public SeqGraph convertToSequenceGraph() {
buildGraphIfNecessary();
final SeqGraph seqGraph = new SeqGraph(kmerSize);
final Map<MultiDeBruijnVertex, SeqVertex> vertexMap = new HashMap<>();
// create all of the equivalent seq graph vertices
for ( final MultiDeBruijnVertex dv : vertexSet() ) {
final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv)));
sv.setAdditionalInfo(dv.additionalInfo());
vertexMap.put(dv, sv);
seqGraph.addVertex(sv);
}
// walk through the nodes and connect them to their equivalent seq vertices
for( final MultiSampleEdge e : edgeSet() ) {
final SeqVertex seqInV = vertexMap.get(getEdgeSource(e));
final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e));
//logger.info("Adding edge " + seqInV + " -> " + seqOutV);
seqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity()));
}
return seqGraph;
return super.convertToSequenceGraph();
}
private void increaseCountsInMatchedKmers(final SequenceForKmers seqForKmers,
@ -749,15 +711,15 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
}
private static String pathElementId(final String element) {
final int parentesysPos = element.indexOf('(');
final int openBracketPosition = element.indexOf('(');
if (parentesysPos == -1)
if (openBracketPosition == -1)
return null;
final int closeParentesysPos = element.lastIndexOf(')');
if (closeParentesysPos == -1)
final int closeBracketPosition = element.lastIndexOf(')');
if (closeBracketPosition == -1)
throw new IllegalArgumentException("non-closed id parantesys found in element: " + element);
final String result = element.substring(parentesysPos + 1,closeParentesysPos).trim();
final String result = element.substring(openBracketPosition + 1,closeBracketPosition).trim();
if (result.isEmpty())
throw new IllegalArgumentException("empty id found in element: " + element);
return result;

View File

@ -94,7 +94,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"f50e0b35e2240b19b1b8b6dfa0cf9796");
"5ac3bfe1da1d411b52a98ef3debbd318");
}
private void HCTestComplexConsensusMode(String bam, String args, String md5) {
@ -106,7 +106,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleConsensusModeComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337",
"21e521d51b826450d348e5201684ffe4");
"61972c7c0d378e756f3b4d99aed9d0cf");
}
}

View File

@ -68,8 +68,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "50323a284788c8220c9226037c7003b5"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7c16aa8e35de9f418533efac3bae6551"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "7e1e193d70187774f9740d475e0f1cc1"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "96fea2caf0a40df3feb268e8b14da670"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "19efc8020f31d1b68d80c50df0629e50"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "d926d653500a970280ad7828d9ee2b84"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "83ddc16e4f0900429b2da30e582994aa"});

View File

@ -227,7 +227,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("0998be22d7af4372247f5a0338f9446b"));
Arrays.asList("7c3254ead383e2b9a51b242f6de2a5b2"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("1aeed297a3cb41940d83eac499a2ce07"));
Arrays.asList("eda8f91091fe462205d687ec49fc61e7"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -276,7 +276,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestAggressivePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("f426f4c2986e1dea8f3f55951ef8e013"));
Arrays.asList("73c52372a1a80f052ea2b728ee17bf22"));
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
}
@ -284,7 +284,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestConservativePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("dcb38cb9280f2c3059a09d323db1c633"));
Arrays.asList("4e10d49b8af23d5ef3a28cb702d10a4b"));
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}
@ -298,4 +298,25 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
spec.disableShadowBCF();
executeTest("testGraphBasedNoSuchEdgeBugFix", spec);
}
@Test
public void testLackSensitivityDueToBadHaplotypeSelectionFix() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ",
b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list",
HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("e2e6647f7c96e91aeead7301017dc800"));
spec.disableShadowBCF();
executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec);
}
@Test
public void testBadLikelihoodsDueToBadHaplotypeSelectionFix() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ",
hg19RefereneWithChrPrefixInChromosomeNames, privateTestDir + "bad-likelihoods.bam", privateTestDir + "bad-likelihoods.interval_list",
HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cbda30145523bf05e0413157f1a00b3e"));
spec.disableShadowBCF();
executeTest("testBadLikelihoodsDueToBadHaplotypeSelectionFix", spec);
}
}

View File

@ -47,7 +47,6 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@ -61,7 +60,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
List<Object[]> tests = new ArrayList<>();
for ( final int nct : Arrays.asList(1, 2, 4) ) {
tests.add(new Object[]{nct, "1f463bf3a06c401006858bc446ecea54"});
tests.add(new Object[]{nct, "fd9324a574f9204f7308fc1af422fdcc"});
}
return tests.toArray(new Object[][]{});

View File

@ -52,10 +52,11 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
import java.util.*;
public class CommonSuffixMergerUnitTest extends BaseTest {
private final static boolean PRINT_GRAPHS = true;
private final static boolean PRINT_GRAPHS = false;
@DataProvider(name = "CompleteCycleData")
public Object[][] makeCompleteCycleData() {
@ -134,11 +135,35 @@ public class CommonSuffixMergerUnitTest extends BaseTest {
return toUse.toArray(new Object[][]{});
}
/**
* Compares KBestHaplotype solutions, first by the haplotype base sequence and the by their score.
*/
private static final Comparator<KBestHaplotype> KBESTHAPLOTYPE_COMPARATOR = new Comparator<KBestHaplotype>() {
/**
* Compares KBestHaplotype solutions, first by the haplotype base sequence and the by their score.
*
* @return {@inheritDoc}
*/
@Override
public int compare(final KBestHaplotype o1,final KBestHaplotype o2) {
final int baseCmp = o1.haplotype().getBaseString().compareTo(o2.haplotype().getBaseString());
if (baseCmp != 0)
return baseCmp;
return - Double.compare(o1.score(),o2.score());
}
};
public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) {
final KBestHaplotypeFinder originalKBestHaplotypes = new KBestHaplotypeFinder(original,original.getSources(),original.getSinks());
final KBestHaplotypeFinder actualKBestHaplotypes = new KBestHaplotypeFinder(actual,actual.getSources(),actual.getSinks());
final List<KBestHaplotype> sortedOriginalKBestHaplotypes = new ArrayList<>(originalKBestHaplotypes);
Collections.sort(sortedOriginalKBestHaplotypes, KBESTHAPLOTYPE_COMPARATOR);
final List<KBestHaplotype> sortedActualKBestHaplotypes = new ArrayList<>(actualKBestHaplotypes);
Collections.sort(sortedActualKBestHaplotypes, KBESTHAPLOTYPE_COMPARATOR);
try {
final Set<String> haplotypes = new HashSet<String>();
final List<KBestHaplotype> originalKBestHaplotypes = new KBestHaplotypeFinder(original,original.getSources(),original.getSinks());
final List<KBestHaplotype> actualKBestHaplotypes = new KBestHaplotypeFinder(actual,actual.getSources(),actual.getSinks());
for (final KBestHaplotype kbh : originalKBestHaplotypes)
haplotypes.add(new String(kbh.bases()));
@ -148,14 +173,16 @@ public class CommonSuffixMergerUnitTest extends BaseTest {
Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
}
if ( actualKBestHaplotypes.size() == originalKBestHaplotypes.size() ) {
for ( int i = 0; i < originalKBestHaplotypes.size(); i++ ) {
Assert.assertTrue(actualKBestHaplotypes.get(i).haplotype().getBaseString().equals(originalKBestHaplotypes.get(i).haplotype().getBaseString()), "Paths not equal " + actualKBestHaplotypes.get(i).haplotype() + " vs. original " + originalKBestHaplotypes.get(i).haplotype());
}
}
Assert.assertEquals(sortedActualKBestHaplotypes,sortedOriginalKBestHaplotypes);
} catch ( AssertionError e ) {
if ( PRINT_GRAPHS ) original.printGraph(new File(String.format("%s.original.dot", name, actual.vertexSet().size())), 0);
if ( PRINT_GRAPHS ) actual.printGraph(new File(String.format("%s.actual.dot", name, actual.vertexSet().size())), 0);
try {
if ( PRINT_GRAPHS ) originalKBestHaplotypes.printDOTFile(String.format("%s.original.finder.dot",name));
if ( PRINT_GRAPHS ) actualKBestHaplotypes.printDOTFile(String.format("%s.actual.finder.dot",name));
} catch (IOException e2) {
// do nothing.
}
throw e;
}
}

View File

@ -65,7 +65,6 @@ import java.util.*;
* User: rpoplin
* Date: 1/31/13
*/
public class KBestHaplotypeFinderUnitTest extends BaseTest {
@DataProvider(name = "BasicPathFindingData")
@ -113,11 +112,11 @@ public class KBestHaplotypeFinderUnitTest extends BaseTest {
final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * nEndNodes;
Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths");
int lastScore = Integer.MAX_VALUE;
double lastScore = 0;
for ( final KBestHaplotype kbh : paths ) {
final Path<SeqVertex,BaseEdge> path = kbh.path();
Assert.assertTrue(path.getScore() <= lastScore, "Paths out of order. Path " + path + " has score above previous " + lastScore);
lastScore = path.getScore();
Assert.assertTrue(kbh.score() <= lastScore, "Paths out of order. Path " + path + " has score " + path.getScore() + " above previous " + lastScore);
lastScore = kbh.score();
}
// get the best path, and make sure it's the same as our optimal path overall

View File

@ -47,6 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.testng.Assert;
@ -226,28 +227,34 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest {
}
final Set<String> haplotypes = new HashSet<>();
final List<KBestHaplotype> originalPaths = new KBestHaplotypeFinder((SeqGraph) graph.clone(),graph.getSources(),graph.getSinks());
final KBestHaplotypeFinder originalPaths = new KBestHaplotypeFinder((SeqGraph) graph.clone(),graph.getSources(),graph.getSinks());
for ( final KBestHaplotype path : originalPaths )
haplotypes.add(new String(path.bases()));
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v);
splitter.split();
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".original.dot"), 0);
if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".split.dot"), 0);
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".original.dot"), 0);
if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".split.dot"), 0);
splitter.updateGraph(top, bot);
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0);
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".updated.dot"), 0);
final List<KBestHaplotype> splitPaths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks());
final KBestHaplotypeFinder splitPaths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks());
for ( final KBestHaplotype path : splitPaths ) {
final String h = new String(path.bases());
Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
}
if ( splitPaths.size() == originalPaths.size() ) {
for ( int i = 0; i < originalPaths.size(); i++ ) {
Assert.assertTrue(splitPaths.get(i).path().equalScoreAndSequence(originalPaths.get(i).path()), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i));
}
}
final List<byte[]> sortedOriginalPaths = new ArrayList<>(originalPaths.size());
for (final KBestHaplotype kbh : originalPaths.unique())
sortedOriginalPaths.add(kbh.bases());
Collections.sort(sortedOriginalPaths, BaseUtils.BASES_COMPARATOR);
final List<byte[]> sortedSplitPaths = new ArrayList<>(splitPaths.size());
for (final KBestHaplotype kbh : splitPaths.unique())
sortedSplitPaths.add(kbh.bases());
Collections.sort(sortedSplitPaths, BaseUtils.BASES_COMPARATOR);
Assert.assertEquals(sortedSplitPaths,sortedOriginalPaths,Utils.join("_", strings) + "_" + hasTop + "_" + hasBot);
}
@DataProvider(name = "MeetsMinSequenceData")

View File

@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.Arrays;
import java.util.Comparator;
/**
* BaseUtils contains some basic utilities for manipulating nucleotides.
@ -589,4 +590,26 @@ public class BaseUtils {
throw new ReviewedStingException("base must be A, C, G or T. " + (char) base + " is not a valid base.");
}
}
/**
* Lexicographical sorting of base arrays {@link Comparator}.
*/
public static final Comparator<byte[]> BASES_COMPARATOR = new Comparator<byte[]> (){
@Override
public int compare(final byte[] o1,final byte[] o2) {
final int minLength = Math.min(o1.length,o2.length);
for (int i = 0; i < minLength; i++) {
final int cmp = Byte.compare(o1[i],o2[i]);
if (cmp != 0) return cmp;
}
if (o1.length == o2.length)
return 0;
else if (o1.length == minLength)
return -1;
else
return 1;
}
};
}

View File

@ -852,4 +852,34 @@ public class Utils {
return lst;
}
/**
* Compares sections from to byte arrays to verify whether they contain the same values.
*
* @param left first array to compare.
* @param leftOffset first position of the first array to compare.
* @param right second array to compare.
* @param rightOffset first position of the second array to compare.
* @param length number of positions to compare.
*
* @throws IllegalArgumentException if <ul>
* <li>either {@code left} or {@code right} is {@code null} or</li>
* <li>any off the offset or length combine point outside any of the two arrays</li>
* </ul>
* @return {@code true} iff {@code length} is 0 or all the bytes in both ranges are the same two-by-two.
*/
public static boolean equalRange(final byte[] left, final int leftOffset, byte[] right, final int rightOffset, final int length) {
if (left == null) throw new IllegalArgumentException("left cannot be null");
if (right == null) throw new IllegalArgumentException("right cannot be null");
if (length < 0) throw new IllegalArgumentException("the length cannot be negative");
if (leftOffset < 0) throw new IllegalArgumentException("left offset cannot be negative");
if (leftOffset + length > left.length) throw new IllegalArgumentException("length goes beyond end of left array");
if (rightOffset < 0) throw new IllegalArgumentException("right offset cannot be negative");
if (rightOffset + length > right.length) throw new IllegalArgumentException("length goes beyond end of right array");
for (int i = 0; i < length; i++)
if (left[leftOffset + i] != right[rightOffset + i])
return false;
return true;
}
}

View File

@ -49,7 +49,7 @@ public class Haplotype extends Allele {
private EventMap eventMap = null;
private Cigar cigar;
private int alignmentStartHapwrtRef;
private double score = 0;
private double score = Double.NaN;
/**
* Main constructor
@ -301,7 +301,7 @@ public class Haplotype extends Allele {
* @return a double, where higher values are better
*/
public double getScore() {
return this.isReference() ? Double.MAX_VALUE : score;
return score;
}
/**
@ -312,7 +312,7 @@ public class Haplotype extends Allele {
* @param score a double, where higher values are better
*/
public void setScore(double score) {
this.score = this.isReference() ? Double.MAX_VALUE : score;
this.score = score;
}
/**

View File

@ -91,6 +91,7 @@ public abstract class BaseTest {
//public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta";
public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta";
public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta";
public static final String hg19RefereneWithChrPrefixInChromosomeNames = "/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta";
public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/";
public static final String validationDataLocation = GATKDataLocation + "Validation_Data/";
public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/";

View File

@ -26,9 +26,16 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.testng.Assert;
import org.testng.annotations.Test;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Random;
public class BaseUtilsUnitTest extends BaseTest {
@ -123,4 +130,50 @@ public class BaseUtilsUnitTest extends BaseTest {
Assert.assertTrue(rcObs.equals(rcExp));
}
@Test(dataProvider="baseComparatorData")
public void testBaseComparator(final Collection<byte[]> basesToSort) {
final ArrayList<byte[]> sorted = new ArrayList<>(basesToSort);
Collections.sort(sorted, BaseUtils.BASES_COMPARATOR);
for (int i = 0; i < sorted.size(); i++) {
Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(i)),0);
final String iString = new String(sorted.get(i));
for (int j = i; j < sorted.size(); j++) {
final String jString = new String(sorted.get(j));
if (iString.compareTo(jString) == 0)
Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)),0);
else
Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) * iString.compareTo(jString) > 0);
Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) <= 0);
}
}
}
@DataProvider(name="baseComparatorData")
public Object[][] baseComparatorData() {
final int testCount = 10;
final int testSizeAverage = 10;
final int testSizeDeviation = 10;
final int haplotypeSizeAverage = 100;
final int haplotypeSizeDeviation = 100;
final Object[][] result = new Object[testCount][];
GenomeAnalysisEngine.resetRandomGenerator();
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
for (int i = 0; i < testCount; i++) {
final int size = (int) Math.max(0,rnd.nextDouble() * testSizeDeviation + testSizeAverage);
final ArrayList<byte[]> bases = new ArrayList<>(size);
for (int j = 0; j < size; j++) {
final int jSize = (int) Math.max(0,rnd.nextDouble() * haplotypeSizeDeviation + haplotypeSizeAverage);
final byte[] b = new byte[jSize];
for (int k = 0; k < jSize; k++)
b[k] = BaseUtils.baseIndexToSimpleBase(rnd.nextInt(4));
bases.add(b);
}
result[i] = new Object[] { bases };
}
return result;
}
}

View File

@ -236,4 +236,23 @@ public class UtilsUnitTest extends BaseTest {
public void testTrim(final String s, final int frontTrim, final int backTrim) {
Assert.assertEquals(s.length() - frontTrim - backTrim, Utils.trimArray(s.getBytes(), frontTrim, backTrim).length);
}
@Test(dataProvider = "equalRange", enabled = true)
public void testEqualRange(final byte[] array1, final byte[] array2, final int offset1, final int offset2, final int length, final boolean expected) {
Assert.assertEquals(Utils.equalRange(array1,offset1,array2,offset2,length),expected);
Assert.assertTrue(Utils.equalRange(array1,offset1,array1,offset1,length));
Assert.assertTrue(Utils.equalRange(array2,offset2,array2,offset2,length));
}
@DataProvider(name = "equalRangeData")
public Object[][] equalRangeData() {
return new Object[][] {
new Object[] { new byte[0] , new byte[0], 0, 0, 0, true},
new Object[] { "ABCF".getBytes(), "BC".getBytes(), 1,0,2, true },
new Object[] { "ABCF".getBytes(), "".getBytes(), 1,0,0, true },
new Object[] { "ABCF".getBytes(), "ACBF".getBytes(), 0,0, 4, false}
};
}
}