Improved criteria to select best haplotypes out from the assembly graph.
Currently the best haplotypes are those that accumulate the largest ABSOLUTE edge *multiplicity* sum across their path in the assembly graph. The edge *multiplicity* is equal to the number of reads that expand through that edge, i.e. have a kmer that uniquely maps to some vertex up-stream from the edge and whose following base calls extend across that edge to vertices downstream from it. Although it is obvious that higher multiplicities correlate with haplotype probability, this criterion falls short in some regards, of which the most relevant is: as it is evaluated on the condensed seq-graph (as opposed to the uncompressed read-threading graph) it is biased toward haplotypes that have more short-sequence vertices ( -> ATGC -> CA -> has a worse score than -> A -> T -> G -> C -> C -> A ->). This is partly a result of how we modify the edge multiplicities when we merge vertices from a linear chain. This pull-request addresses the problem by changing to a new scoring schema based on likelihood estimates: each haplotype's likelihood can be calculated as the product of the likelihoods of "taking" its edges in the assembly graph. The likelihood of "taking" an edge in the assembly graph is calculated as its multiplicity divided by the sum of the multiplicities of all edges that share the same source vertex. This pull-request addresses the following stories: https://www.pivotaltracker.com/story/show/66691418 https://www.pivotaltracker.com/story/show/64319760 Change Summary: 1. Changed to the new scoring schema. 2. Added graph DOT printing code to KBestHaplotypeFinder in order to diagnose scoring. 3. Graph transformations have been modified in order to generate no 0-multiplicity edges. (Nevertheless the schema above should work with 0-multiplicity edges assuming that they are in fact treated as 0.5.)
This commit is contained in:
parent
a0c252f084
commit
2e964c59b4
|
|
@ -213,16 +213,18 @@ public abstract class LocalAssemblyEngine {
|
|||
final Map<SeqGraph,AssemblyResult> assemblyResultByGraph, final AssemblyResultSet assemblyResultSet) {
|
||||
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
|
||||
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<>();
|
||||
returnHaplotypes.add( refHaplotype );
|
||||
|
||||
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
|
||||
final ArrayList<KBestHaplotypeFinder> finders = new ArrayList<>(graphs.size());
|
||||
|
||||
for( final SeqGraph graph : graphs ) {
|
||||
final SeqVertex source = graph.getReferenceSourceVertex();
|
||||
final SeqVertex sink = graph.getReferenceSinkVertex();
|
||||
if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);
|
||||
final KBestHaplotypeFinder haplotypeFinder = new KBestHaplotypeFinder(graph,source,sink);
|
||||
finders.add(haplotypeFinder);
|
||||
final Iterator<KBestHaplotype> bestHaplotypes = haplotypeFinder.iterator(numBestHaplotypesPerGraph);
|
||||
|
||||
while (bestHaplotypes.hasNext()) {
|
||||
final KBestHaplotype kBestHaplotype = bestHaplotypes.next();
|
||||
final Haplotype h = kBestHaplotype.haplotype();
|
||||
|
|
@ -256,9 +258,19 @@ public abstract class LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
if ( returnHaplotypes.size() < returnHaplotypes.size() )
|
||||
logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
|
||||
// Make sure that the ref haplotype is amongst the return haplotypes and calculate its score as
|
||||
// the first returned by any finder.
|
||||
if (!returnHaplotypes.contains(refHaplotype)) {
|
||||
double refScore = Double.NaN;
|
||||
for (final KBestHaplotypeFinder finder : finders) {
|
||||
final double candidate = finder.score(refHaplotype);
|
||||
if (Double.isNaN(candidate)) continue;
|
||||
refScore = candidate;
|
||||
break;
|
||||
}
|
||||
refHaplotype.setScore(refScore);
|
||||
returnHaplotypes.add(refHaplotype);
|
||||
}
|
||||
|
||||
if( debug ) {
|
||||
if( returnHaplotypes.size() > 1 ) {
|
||||
|
|
|
|||
|
|
@ -45,21 +45,21 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.PriorityQueue;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* K-best sub-haplotype finder that selects the best solutions out of a collection of sub-haplotype finders.
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
||||
class AggregatedSubHaplotypeFinder<F extends KBestSubHaplotypeFinder> implements KBestSubHaplotypeFinder {
|
||||
|
||||
/**
|
||||
* Collection of subFinders that provided the actual solutions.
|
||||
*/
|
||||
private final Collection<? extends KBestSubHaplotypeFinder> subFinders;
|
||||
protected final Collection<F> subFinders;
|
||||
|
||||
/**
|
||||
* Flag indicating whether the sub-finders have been processed or not.
|
||||
|
|
@ -89,17 +89,53 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
|||
* Creates a new aggregated sub-haplotype finder given its sub-finders.
|
||||
* @param finders set of sub-finders.
|
||||
*/
|
||||
public AggregatedSubHaplotypeFinder(final Collection<? extends KBestSubHaplotypeFinder> finders) {
|
||||
public AggregatedSubHaplotypeFinder(final Collection<F> finders) {
|
||||
if (finders == null) throw new IllegalArgumentException("finder collection cannot be null");
|
||||
this.subFinders = finders;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String id() {
|
||||
final StringBuilder resultBuilder = new StringBuilder();
|
||||
for (final KBestSubHaplotypeFinder subFinder : subFinders)
|
||||
resultBuilder.append(subFinder.id());
|
||||
return resultBuilder.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String label() {
|
||||
return "<OR>";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
|
||||
final int subFinderCount = subFinders.size();
|
||||
final String edgeCost = String.format("%.2f",-Math.log10((double) subFinderCount));
|
||||
final Set<Pair<? extends KBestSubHaplotypeFinder,String>> result = new LinkedHashSet<>(subFinderCount);
|
||||
for (final KBestSubHaplotypeFinder subFinder : subFinders)
|
||||
result.add(new Pair<>(subFinder,edgeCost));
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount() {
|
||||
processSubFindersIfNeeded();
|
||||
return count;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double score(final byte[] bases, final int offset, final int length) {
|
||||
if (bases == null) throw new IllegalArgumentException("bases cannot be null");
|
||||
if (offset < 0) throw new IllegalArgumentException("the offset cannot be negative");
|
||||
if (length < 0) throw new IllegalArgumentException("the length cannot be negative");
|
||||
if (offset + length > bases.length) throw new IllegalArgumentException("the offset and length go beyond the array size");
|
||||
for (final KBestSubHaplotypeFinder subFinder : subFinders) {
|
||||
final double score = subFinder.score(bases,offset,length);
|
||||
if (!Double.isNaN(score)) return score;
|
||||
}
|
||||
return Double.NaN;
|
||||
}
|
||||
|
||||
private void processSubFindersIfNeeded() {
|
||||
if (processedSubFinders) return;
|
||||
|
||||
|
|
@ -144,6 +180,11 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
|||
return rankedSubHaplotype.get(k);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isReference() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Custom implementation of {@link KBestHaplotype} to encapsulate sub-finder results.
|
||||
*/
|
||||
|
|
@ -167,7 +208,7 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
|||
}
|
||||
|
||||
@Override
|
||||
public int score() {
|
||||
public double score() {
|
||||
return result.score();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ import com.google.java.contract.Requires;
|
|||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.jgrapht.EdgeFactory;
|
||||
import org.jgrapht.alg.CycleDetector;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -146,6 +147,39 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
return set;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert this kmer graph to a simple sequence graph.
|
||||
*
|
||||
* Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer
|
||||
* graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence
|
||||
*
|
||||
* @return a newly allocated SequenceGraph
|
||||
*/
|
||||
public SeqGraph convertToSequenceGraph() {
|
||||
|
||||
final SeqGraph seqGraph = new SeqGraph(kmerSize);
|
||||
final Map<V, SeqVertex> vertexMap = new HashMap<>();
|
||||
|
||||
|
||||
// create all of the equivalent seq graph vertices
|
||||
for ( final V dv : vertexSet() ) {
|
||||
final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv)));
|
||||
sv.setAdditionalInfo(dv.additionalInfo());
|
||||
vertexMap.put(dv, sv);
|
||||
seqGraph.addVertex(sv);
|
||||
}
|
||||
|
||||
// walk through the nodes and connect them to their equivalent seq vertices
|
||||
for( final E e : edgeSet() ) {
|
||||
final SeqVertex seqInV = vertexMap.get(getEdgeSource(e));
|
||||
final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e));
|
||||
//logger.info("Adding edge " + seqInV + " -> " + seqOutV);
|
||||
seqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity()));
|
||||
}
|
||||
|
||||
return seqGraph;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pull out the additional sequence implied by traversing this node in the graph
|
||||
* @param v the vertex from which to pull out the additional base sequence
|
||||
|
|
@ -712,4 +746,13 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
if (!containsVertex(vertex)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks for the presence of directed cycles in the graph.
|
||||
*
|
||||
* @return {@code true} if the graph has cycles, {@code false} otherwise.
|
||||
*/
|
||||
public boolean hasCycles() {
|
||||
return new CycleDetector<>(this).detectCycles();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -122,7 +122,7 @@ public class CommonSuffixSplitter {
|
|||
} else {
|
||||
incomingTarget = prefixV;
|
||||
graph.addVertex(prefixV);
|
||||
graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 0));
|
||||
graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 1));
|
||||
edgesToRemove.add(out);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -45,6 +45,11 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Represents a trivial k-best sub haplotype finder with no solutions.
|
||||
*
|
||||
|
|
@ -65,6 +70,21 @@ final class DeadEndKBestSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
|||
protected DeadEndKBestSubHaplotypeFinder() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String id() {
|
||||
return "<DEAD>";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String label() {
|
||||
return "<DEAD>";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
|
||||
return Collections.emptySet();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount() {
|
||||
return 0;
|
||||
|
|
@ -77,4 +97,18 @@ final class DeadEndKBestSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
|||
else
|
||||
throw new IllegalArgumentException("k cannot be equal or greater to the haplotype count");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isReference() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double score(final byte[] bases, final int offset, final int length) {
|
||||
if (bases == null) throw new IllegalArgumentException("bases cannot be null");
|
||||
if (offset < 0) throw new IllegalArgumentException("the offset cannot be negative");
|
||||
if (length < 0) throw new IllegalArgumentException("the length cannot be negative");
|
||||
if (offset + length > bases.length) throw new IllegalArgumentException("the offset and length go beyond the array size");
|
||||
return Double.NaN;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,6 +45,12 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Trivial k-best sub-haplotype finder where the source and sink vertex are the same one.
|
||||
*
|
||||
|
|
@ -67,6 +73,21 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder {
|
|||
singleHaplotypePath = new MyBestHaplotypePath(graph,vertex);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String id() {
|
||||
return "v" + singleHaplotypePath.head().getId();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String label() {
|
||||
return singleHaplotypePath.head().getSequenceString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
|
||||
return Collections.emptySet();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount() {
|
||||
return 1;
|
||||
|
|
@ -81,6 +102,24 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder {
|
|||
return singleHaplotypePath;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isReference() {
|
||||
return singleHaplotypePath.isReference();
|
||||
}
|
||||
|
||||
@Override
|
||||
public double score(final byte[] bases, final int offset, final int length) {
|
||||
if (bases == null) throw new IllegalArgumentException("bases cannot be null");
|
||||
if (offset < 0) throw new IllegalArgumentException("the offset cannot be negative");
|
||||
if (length < 0) throw new IllegalArgumentException("the length cannot be negative");
|
||||
if (offset + length > bases.length) throw new IllegalArgumentException("the offset and length go beyond the array size");
|
||||
final byte[] vertexBases = singleHaplotypePath.head().getSequence();
|
||||
if (length != vertexBases.length)
|
||||
return Double.NaN;
|
||||
else
|
||||
return Utils.equalRange(bases, offset, vertexBases, 0, length)? 0 : Double.NaN;
|
||||
}
|
||||
|
||||
/**
|
||||
* Custom extension of {@link KBestHaplotype} that implements the single solution behaviour.
|
||||
*/
|
||||
|
|
@ -120,7 +159,7 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder {
|
|||
}
|
||||
|
||||
@Override
|
||||
public int score() {
|
||||
public double score() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -68,7 +68,7 @@ public abstract class KBestHaplotype implements Comparable<KBestHaplotype> {
|
|||
*
|
||||
* @return 0 or greater.
|
||||
*/
|
||||
public abstract int score();
|
||||
public abstract double score();
|
||||
|
||||
/**
|
||||
* Indicates whether this result is the reference haplotype.
|
||||
|
|
@ -122,6 +122,8 @@ public abstract class KBestHaplotype implements Comparable<KBestHaplotype> {
|
|||
public Haplotype haplotype() {
|
||||
if (haplotype != null) return haplotype;
|
||||
haplotype = new Haplotype(bases(),isReference());
|
||||
if (score() > 0)
|
||||
throw new IllegalStateException("score cannot be greater than 0: " + score());
|
||||
haplotype.setScore(score());
|
||||
return haplotype;
|
||||
}
|
||||
|
|
@ -152,7 +154,35 @@ public abstract class KBestHaplotype implements Comparable<KBestHaplotype> {
|
|||
*/
|
||||
public int compareTo(final KBestHaplotype other) {
|
||||
if (other == null) throw new IllegalArgumentException("the other object cannot be null");
|
||||
return - 1 * (score() - other.score());
|
||||
return - Double.compare(score(), other.score());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return haplotype().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
return other == null ? false: (other instanceof KBestHaplotype ? equals((KBestHaplotype)other) : false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return haplotype().toString() + " Score = " + score();
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether both solutions are equal.
|
||||
* <p>
|
||||
* Both solutions are considered equal when the underlying haplotypes are equal. The path on the respective
|
||||
 * graph might differ though.
|
||||
* </p>
|
||||
*
|
||||
* @return {@code true} iff both haplotypes are the same (considering the ref state).
|
||||
*/
|
||||
protected boolean equals(final KBestHaplotype other) {
|
||||
return haplotype().equals(other.haplotype(),false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -45,8 +45,13 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.jgrapht.alg.CycleDetector;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -233,7 +238,7 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
|
|||
}
|
||||
|
||||
@Override
|
||||
public KBestHaplotype get(int index) {
|
||||
public KBestHaplotype get(final int index) {
|
||||
if (index < 0 || index >= size())
|
||||
throw new IndexOutOfBoundsException();
|
||||
return topFinder.getKBest(index);
|
||||
|
|
@ -305,28 +310,28 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
|
|||
/**
|
||||
* Creates a finder from a vertex.
|
||||
*
|
||||
* @param source the source vertex for the finder.
|
||||
* @param vertex the source vertex for the finder.
|
||||
*
|
||||
* @return never {@code null}, perhaps a finder that returns no haplotypes though.
|
||||
*/
|
||||
protected KBestSubHaplotypeFinder createVertexFinder(final SeqVertex source) {
|
||||
KBestSubHaplotypeFinder node = finderByVertex.get(source);
|
||||
if (node == null) {
|
||||
if (sinks.contains(source))
|
||||
node = new EmptyPathHaplotypeFinderNode(graph,source);
|
||||
protected KBestSubHaplotypeFinder createVertexFinder(final SeqVertex vertex) {
|
||||
KBestSubHaplotypeFinder finder = finderByVertex.get(vertex);
|
||||
if (finder == null) {
|
||||
if (sinks.contains(vertex))
|
||||
finder = new EmptyPathHaplotypeFinderNode(graph,vertex);
|
||||
else {
|
||||
final Set<BaseEdge> outgoingEdges = graph.outgoingEdgesOf(source);
|
||||
final Set<BaseEdge> outgoingEdges = graph.outgoingEdgesOf(vertex);
|
||||
if (outgoingEdges.isEmpty())
|
||||
node = DeadEndKBestSubHaplotypeFinder.INSTANCE;
|
||||
finder = DeadEndKBestSubHaplotypeFinder.INSTANCE;
|
||||
else {
|
||||
final Map<BaseEdge,KBestSubHaplotypeFinder> undeadChildren = createChildrenFinders(outgoingEdges);
|
||||
node = undeadChildren.isEmpty() ? DeadEndKBestSubHaplotypeFinder.INSTANCE :
|
||||
new RecursiveSubHaplotypeFinder(source,undeadChildren);
|
||||
finder = undeadChildren.isEmpty() ? DeadEndKBestSubHaplotypeFinder.INSTANCE :
|
||||
new RecursiveSubHaplotypeFinder(graph,vertex,undeadChildren);
|
||||
}
|
||||
}
|
||||
finderByVertex.put(source, node);
|
||||
finderByVertex.put(vertex, finder);
|
||||
}
|
||||
return node;
|
||||
return finder;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -340,7 +345,7 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
|
|||
* @return never {@code null}, perhaps an empty map if there is no children with valid paths to any sink for this
|
||||
* finder.
|
||||
*/
|
||||
private Map<BaseEdge, KBestSubHaplotypeFinder> createChildrenFinders(Set<BaseEdge> baseEdges) {
|
||||
private Map<BaseEdge, KBestSubHaplotypeFinder> createChildrenFinders(final Set<BaseEdge> baseEdges) {
|
||||
final Map<BaseEdge,KBestSubHaplotypeFinder> result = new LinkedHashMap<>(baseEdges.size());
|
||||
for (final BaseEdge edge : baseEdges) {
|
||||
final KBestSubHaplotypeFinder targetFinder = createVertexFinder(graph.getEdgeTarget(edge));
|
||||
|
|
@ -349,4 +354,156 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print a DOT representation of search graph.
|
||||
*
|
||||
* @param out character stream printer where to print the DOT representation to.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code out} is {@code null}.
|
||||
*/
|
||||
public void printDOT(final PrintWriter out) {
|
||||
if (out == null)
|
||||
throw new IllegalArgumentException("the out writer cannot be null");
|
||||
out.println("digraph {");
|
||||
out.println(" rankdir = LR");
|
||||
out.println(" node [shape=box, margin=0.01]");
|
||||
out.println(" subgraph cluster_dummy { style = invis; x [label=\"\",shape=none,margin=0] }");
|
||||
final StringBuilder referenceCluster = new StringBuilder(1000);
|
||||
|
||||
referenceCluster.append(" subgraph cluster_ref {\n");
|
||||
referenceCluster.append(" node [penwidth=2]\n");
|
||||
for (final KBestSubHaplotypeFinder finder : finderByVertex.values() ) {
|
||||
final String id = finder.id();
|
||||
final String line = String.format(" %s [label=<%s>]",id,finder.label());
|
||||
if (finder.isReference())
|
||||
referenceCluster.append(" ").append(line).append('\n');
|
||||
else
|
||||
out.println(line);
|
||||
}
|
||||
referenceCluster.append(" }");
|
||||
out.println(referenceCluster.toString());
|
||||
|
||||
for (final KBestSubHaplotypeFinder finder : finderByVertex.values())
|
||||
for (final Pair<? extends KBestSubHaplotypeFinder,String> subFinderLabel : finder.subFinderLabels()) {
|
||||
final KBestSubHaplotypeFinder subFinder = subFinderLabel.getFirst();
|
||||
|
||||
final String edgeLabel = subFinderLabel.getSecond();
|
||||
out.println(String.format(" %s -> %s [label=%s]",finder.id(),subFinder.id(),edgeLabel));
|
||||
}
|
||||
out.println("}");
|
||||
}
|
||||
|
||||
/**
|
||||
* Print a DOT representation of search graph.
|
||||
*
|
||||
* @param file file where to print the DOT representation to.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code file} is {@code null}.
|
||||
* @throws FileNotFoundException if {@code file} cannot be created or written.
|
||||
* @throws IllegalStateException if there was some trouble when writing the DOT representation.
|
||||
*/
|
||||
public void printDOT(final File file) throws FileNotFoundException {
|
||||
if (file == null)
|
||||
throw new IllegalArgumentException("the output file cannot be null");
|
||||
final PrintWriter out = new PrintWriter(file);
|
||||
printDOT(out);
|
||||
if (out.checkError())
|
||||
throw new IllegalStateException("error occurred while writing k-best haplotype search graph into file '"
|
||||
+ file.getAbsolutePath() + "'");
|
||||
out.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Print a DOT representation of search graph.
|
||||
*
|
||||
* @param fileName name of the file where to print the DOT representation to.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code fileName} is {@code null}.
|
||||
* @throws FileNotFoundException if no file named {@code fileName} cannot be created or written.
|
||||
* @throws IllegalStateException if there was some trouble when writing the DOT representation.
|
||||
*/
|
||||
@SuppressWarnings("unused") // Available for debugging purposes.
|
||||
public void printDOTFile(final String fileName) throws FileNotFoundException {
|
||||
printDOT(new File(fileName));
|
||||
}
|
||||
|
||||
/**
|
||||
 * Get the score of a given sequence of bases
|
||||
*
|
||||
* @param bases the base sequence.
|
||||
*
|
||||
 * @return {@link Double#NaN} if there is no score for the sequence, i.e. there is no such haplotype accessible
|
||||
 * through this finder.
|
||||
*/
|
||||
public double score(final byte[] bases) {
|
||||
return topFinder.score(bases,0,bases.length);
|
||||
}
|
||||
|
||||
/**
|
||||
 * Get the score of a given sequence of bases
|
||||
*
|
||||
* @param haplotype the haplotype.
|
||||
*
|
||||
 * @return {@link Double#NaN} if there is no score for the sequence, i.e. there is no such haplotype accessible
|
||||
 * through this finder.
|
||||
*/
|
||||
public double score(final Haplotype haplotype) {
|
||||
return score(haplotype.getBases());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a unique list of haplotypes solutions.
|
||||
* <p>
|
||||
* The result will not contain more than one haplotype with the same base sequence. The solution of the best
|
||||
* score is returned.
|
||||
* </p>
|
||||
* <p>
|
||||
* This makes sense when there are more than one possible path through the graph to create the same haplotype.
|
||||
* </p>
|
||||
* <p>
|
||||
* The resulting list is sorted by the score with more likely haplotype search results first.
|
||||
* </p>
|
||||
*
|
||||
* @param maxSize maximum number of unique results to return.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code maxSize} is negative.
|
||||
*
|
||||
* @return never {@code null}, perhaps an empty list.
|
||||
*/
|
||||
public List<KBestHaplotype> unique(final int maxSize) {
|
||||
if (maxSize < 0) throw new IllegalArgumentException("maxSize cannot be negative");
|
||||
final int requiredCapacity = Math.min(maxSize,size());
|
||||
final Set<Haplotype> haplotypes = new HashSet<>(requiredCapacity);
|
||||
int resultSize = 0;
|
||||
final List<KBestHaplotype> result = new ArrayList<>(requiredCapacity);
|
||||
for (final KBestHaplotype kbh : this) {
|
||||
if (haplotypes.add(kbh.haplotype())) {
|
||||
result.add(kbh);
|
||||
if (resultSize == maxSize) break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a unique list of haplotypes solutions.
|
||||
*
|
||||
* <p>
|
||||
* The result will not contain more than one haplotype with the same base sequence. The solution of the best
|
||||
* score is returned.
|
||||
* </p>
|
||||
* <p>
|
||||
* This makes sense when there are more than one possible path through the graph to create the same haplotype.
|
||||
* </p>
|
||||
* <p>
|
||||
* The resulting list is sorted by the score with more likely haplotype search results first.
|
||||
* </p>
|
||||
*
|
||||
* @return never {@code null}, perhaps an empty list.
|
||||
*/
|
||||
public List<KBestHaplotype> unique() {
|
||||
return unique(size());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,6 +45,10 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Common interface for K-Best sub-haplotype finders.
|
||||
*
|
||||
|
|
@ -52,6 +56,29 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
|||
*/
|
||||
interface KBestSubHaplotypeFinder {
|
||||
|
||||
/**
|
||||
 * Return a unique id for this sub-haplotype finder to be used when outputting diagrams.
|
||||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public String id();
|
||||
|
||||
/**
|
||||
* Returns a label with human readable representation of this finder.
|
||||
*
|
||||
* <p>This is used when generating a diagram to illustrate the search space and costs</p>
|
||||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public String label();
|
||||
|
||||
/**
|
||||
* Returns the set of subfinder from this finder together with a label for the connection with the current finder.
|
||||
*
|
||||
* <p>The label is used when generating a diagram to illustrate the search space and costs</p>
|
||||
*/
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder,String>> subFinderLabels();
|
||||
|
||||
/**
|
||||
* Returns the total number of possible sub-haplotypes.
|
||||
* @return 0 or greater.
|
||||
|
|
@ -67,5 +94,22 @@ interface KBestSubHaplotypeFinder {
|
|||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public abstract KBestHaplotype getKBest(int k);
|
||||
public KBestHaplotype getKBest(int k);
|
||||
|
||||
/**
|
||||
* Checks whether the top vertex for this finder is a reference haplotype vertex.
|
||||
*
|
||||
* @return {@code true} iff the top vertex for this finder is a reference vertex.
|
||||
*/
|
||||
public boolean isReference();
|
||||
|
||||
/**
|
||||
* Calculate the score of a sequence of bases.
|
||||
*
|
||||
* @param bases array containing the query base sequence.
|
||||
* @param offset first position of the query base sequence in {@code bases} .
|
||||
* @param length length of the query base sequence.
|
||||
* @return {@link Double#NaN} if there is no score for this sequence, otherwise a valid score value.
|
||||
*/
|
||||
public double score(byte[] bases, int offset, int length);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,20 +49,24 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
|||
import java.util.PriorityQueue;
|
||||
|
||||
/**
|
||||
* edge class for connecting nodes in the graph that tracks some per-sample information
|
||||
*
|
||||
* Edge class for connecting nodes in the graph that tracks some per-sample information.
|
||||
* <p>
|
||||
* This class extends BaseEdge with the additional functionality of tracking the maximum
|
||||
* multiplicity seen within any single sample. The workflow for using this class is:
|
||||
*
|
||||
* MultiSampleEdge e = new MultiSampleEdge(ref, 1)
|
||||
* e.incMultiplicity(1) // total is 2, per sample is 2, max per sample is 1
|
||||
* e.getPruningMultiplicity() // = 1
|
||||
* e.flushSingleSampleMultiplicity() // total is 2, per sample is 0, max per sample is 2
|
||||
* e.getPruningMultiplicity() // = 2
|
||||
* e.incMultiplicity(3) // total is 5, per sample is 3, max per sample is 2
|
||||
* e.getPruningMultiplicity() // = 2
|
||||
* e.flushSingleSampleMultiplicity() // total is 5, per sample is 0, max per sample is 3
|
||||
* e.getPruningMultiplicity() // = 3
|
||||
* </p>
|
||||
* <pre>
|
||||
* {@code
|
||||
* MultiSampleEdge e = new MultiSampleEdge(ref, 1)
|
||||
* e.incMultiplicity(1) // total is 2, per sample is 2, max per sample is 1
|
||||
* e.getPruningMultiplicity() // = 1
|
||||
* e.flushSingleSampleMultiplicity() // total is 2, per sample is 0, max per sample is 2
|
||||
* e.getPruningMultiplicity() // = 2
|
||||
* e.incMultiplicity(3) // total is 5, per sample is 3, max per sample is 2
|
||||
* e.getPruningMultiplicity() // = 2
|
||||
* e.flushSingleSampleMultiplicity() // total is 5, per sample is 0, max per sample is 3
|
||||
* e.getPruningMultiplicity() // = 3
|
||||
* }
|
||||
* </pre>
|
||||
*/
|
||||
public class MultiSampleEdge extends BaseEdge {
|
||||
private int currentSingleSampleMultiplicity;
|
||||
|
|
|
|||
|
|
@ -45,9 +45,10 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* General recursive sub-haplotype finder.
|
||||
|
|
@ -67,7 +68,11 @@ import java.util.Map;
|
|||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
||||
class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder<RecursiveSubHaplotypeFinder.EdgeSubHaplotypeFinder> {
|
||||
|
||||
|
||||
private final SeqVertex vertex;
|
||||
private final boolean isReference;
|
||||
|
||||
/**
|
||||
* Creates a recursive sub-haplotype finder give the target graph, first vertex and all possible outgoing edges
|
||||
|
|
@ -80,20 +85,83 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
|||
* @param vertex first vertex for all sub-haplotype solutions provided by this finder
|
||||
* @param children map from outgoing edge to the corresponding sub-sub-haplotype finder.
|
||||
*/
|
||||
public RecursiveSubHaplotypeFinder(final SeqVertex vertex,
|
||||
public RecursiveSubHaplotypeFinder(final SeqGraph graph, final SeqVertex vertex,
|
||||
final Map<BaseEdge, KBestSubHaplotypeFinder> children) {
|
||||
super(createChildFinderCollection(vertex, children));
|
||||
this.vertex = vertex;
|
||||
this.isReference = graph.isReferenceNode(vertex);
|
||||
}
|
||||
|
||||
private static Collection<EdgeSubHaplotypeFinder> createChildFinderCollection(final SeqVertex vertex, final Map<BaseEdge,KBestSubHaplotypeFinder> finders) {
|
||||
/**
|
||||
* Wraps the descendant vertices finders in order to take advantage of the {@link AggregatedSubHaplotypeFinder}
|
||||
* common code.
|
||||
* <p>
|
||||
* Automatically calibrates the edge score (cost) so that it takes into account the total across all edges.
|
||||
* </p>
|
||||
*
|
||||
* @param vertex the parent vertex.
|
||||
* @param finders the child vertices indexed by the connecting edge.
|
||||
* @return never {@code null} but potentially an empty collection if there is child returning some sub-haplotype
|
||||
* solution.
|
||||
*/
|
||||
private static Collection<EdgeSubHaplotypeFinder> createChildFinderCollection(final SeqVertex vertex,
|
||||
final Map<BaseEdge,KBestSubHaplotypeFinder> finders) {
|
||||
if (finders == null) throw new IllegalArgumentException("the edge to child map cannot be null");
|
||||
final Collection<EdgeSubHaplotypeFinder> result = new ArrayList<>(finders.size());
|
||||
for (final Map.Entry<BaseEdge,KBestSubHaplotypeFinder> e : finders.entrySet())
|
||||
result.add(new EdgeSubHaplotypeFinder(vertex,e.getKey(), e.getValue()));
|
||||
final ArrayList<EdgeSubHaplotypeFinder> result = new ArrayList<>(finders.size());
|
||||
for (final Map.Entry<BaseEdge,KBestSubHaplotypeFinder> e : finders.entrySet()) {
|
||||
final EdgeSubHaplotypeFinder subFinder = new EdgeSubHaplotypeFinder(vertex,e.getKey(), e.getValue());
|
||||
if (subFinder.getCount() == 0) continue;
|
||||
result.add(subFinder);
|
||||
}
|
||||
if (result.size() == 0)
|
||||
return Collections.emptySet();
|
||||
else if (result.size() == 1) // no calibration needed, by default edgeScore is 0.
|
||||
return Collections.singleton(result.get(0));
|
||||
else {
|
||||
double totalEdgeMultiplicityAcrossEdges = 0;
|
||||
for (final EdgeSubHaplotypeFinder finder : result)
|
||||
totalEdgeMultiplicityAcrossEdges += Math.max(0.5,finder.edge.getMultiplicity());
|
||||
final double log10TotalEdgeMultiplicityAcrossEdges = Math.log10(totalEdgeMultiplicityAcrossEdges);
|
||||
for (final EdgeSubHaplotypeFinder finder : result)
|
||||
finder.calibrateEdgeScore(log10TotalEdgeMultiplicityAcrossEdges);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isReference() {
|
||||
return isReference;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String label() {
|
||||
return vertex.getSequenceString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
|
||||
final Set<Pair<? extends KBestSubHaplotypeFinder,String>> result = new LinkedHashSet<>(subFinders.size());
|
||||
for (final EdgeSubHaplotypeFinder subFinder : subFinders)
|
||||
result.add(new Pair<>(subFinder,simplifyZeros(String.format("%.4f", subFinder.edgeScore))));
|
||||
return result;
|
||||
}
|
||||
|
||||
private static class EdgeSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
||||
/**
|
||||
* Removes zeros decimal positions from edge-labels.
|
||||
*
|
||||
* @param edgeLabel the original label to reformat.
|
||||
* @return never {@code null}, the reformatted label.
|
||||
*/
|
||||
private String simplifyZeros(final String edgeLabel) {
|
||||
if (edgeLabel.equals("0.000") || edgeLabel.equals("-0.000") )
|
||||
return "0.";
|
||||
int i = edgeLabel.length() - 1;
|
||||
while (edgeLabel.charAt(i) == '0')
|
||||
i--;
|
||||
return (i == edgeLabel.length() - 1) ? edgeLabel : edgeLabel.substring(0,i);
|
||||
}
|
||||
|
||||
protected static class EdgeSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
||||
|
||||
private final KBestSubHaplotypeFinder childFinder;
|
||||
|
||||
|
|
@ -101,10 +169,32 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
|||
|
||||
private final BaseEdge edge;
|
||||
|
||||
private double edgeScore = 0;
|
||||
|
||||
private EdgeSubHaplotypeFinder(final SeqVertex vertex, final BaseEdge edge, final KBestSubHaplotypeFinder childFinder) {
|
||||
this.childFinder = childFinder;
|
||||
this.edge = edge;
|
||||
this.vertex = vertex;
|
||||
this.edgeScore = 0;
|
||||
}
|
||||
|
||||
private void calibrateEdgeScore(final double log10TotalMultiplicityAcrossOutgoingEdges) {
|
||||
edgeScore = Math.log10(Math.max(edge.getMultiplicity(),0.5)) - log10TotalMultiplicityAcrossOutgoingEdges;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String id() {
|
||||
return childFinder.id();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String label() {
|
||||
return childFinder.label();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
|
||||
return childFinder.subFinderLabels();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -114,8 +204,31 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
|||
|
||||
@Override
|
||||
public KBestHaplotype getKBest(int k) {
|
||||
return new ChildKBestSubHaplotype(vertex,edge,childFinder.getKBest(k));
|
||||
return new ChildKBestSubHaplotype(vertex,edge,childFinder.getKBest(k),edgeScore);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isReference() {
|
||||
return childFinder.isReference();
|
||||
}
|
||||
|
||||
@Override
|
||||
public double score(final byte[] bases, final int offset, final int length) {
|
||||
if (length == 0)
|
||||
return 0;
|
||||
final byte[] vertexSequence = vertex.getSequence();
|
||||
if (length < vertexSequence.length) // query is not long enough to have any score.
|
||||
return Double.NaN;
|
||||
else if (!Utils.equalRange(vertexSequence,0,bases,offset,vertexSequence.length))
|
||||
return Double.NaN;
|
||||
else
|
||||
return edgeScore + childFinder.score(bases,offset + vertexSequence.length,length - vertexSequence.length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String id() {
|
||||
return "v" + vertex.getId();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -129,13 +242,14 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
|||
*/
|
||||
private static class ChildKBestSubHaplotype extends KBestHaplotype {
|
||||
|
||||
private final int score;
|
||||
private final double score;
|
||||
private final KBestHaplotype child;
|
||||
private final SeqVertex vertex;
|
||||
private final boolean isReference;
|
||||
|
||||
public ChildKBestSubHaplotype(final SeqVertex vertex, final BaseEdge edge, final KBestHaplotype child) {
|
||||
this.score = edge.getMultiplicity() + child.score();
|
||||
|
||||
public ChildKBestSubHaplotype(final SeqVertex vertex, final BaseEdge edge, final KBestHaplotype child, final double edgeScore) {
|
||||
this.score = edgeScore + child.score();
|
||||
this.vertex = vertex;
|
||||
this.child = child;
|
||||
this.isReference = edge.isRef() && child.isReference();
|
||||
|
|
@ -147,7 +261,7 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
|||
}
|
||||
|
||||
@Override
|
||||
public int score() {
|
||||
public double score() {
|
||||
return score;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -291,16 +291,9 @@ public class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
final SeqVertex addedVertex = mergeLinearChainVertices(linearChain);
|
||||
addVertex(addedVertex);
|
||||
|
||||
final Set<BaseEdge> inEdges = incomingEdgesOf(first);
|
||||
final Set<BaseEdge> outEdges = outgoingEdgesOf(last);
|
||||
|
||||
final int nEdges = inEdges.size() + outEdges.size();
|
||||
int sharedWeightAmongEdges = nEdges == 0 ? 0 : sumEdgeWeightAlongChain(linearChain) / nEdges;
|
||||
final BaseEdge inc = new BaseEdge(false, sharedWeightAmongEdges); // template to make .add function call easy
|
||||
|
||||
// update the incoming and outgoing edges to point to the new vertex
|
||||
for( final BaseEdge edge : outEdges ) { addEdge(addedVertex, getEdgeTarget(edge), edge.copy().add(inc)); }
|
||||
for( final BaseEdge edge : inEdges ) { addEdge(getEdgeSource(edge), addedVertex, edge.copy().add(inc)); }
|
||||
for( final BaseEdge edge : outgoingEdgesOf(last) ) { addEdge(addedVertex, getEdgeTarget(edge), edge.copy()); }
|
||||
for( final BaseEdge edge : incomingEdgesOf(first) ) { addEdge(getEdgeSource(edge), addedVertex, edge.copy()); }
|
||||
|
||||
removeAllVertices(linearChain);
|
||||
return true;
|
||||
|
|
@ -313,29 +306,6 @@ public class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
return new SeqVertex( seqsCat );
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the sum of the edge weights on a linear chain of at least 2 elements
|
||||
*
|
||||
* @param chain a linear chain of vertices with at least 2 vertices
|
||||
* @return the sum of the multiplicities along all edges connecting vertices within the chain
|
||||
*/
|
||||
@Requires({"chain != null", "chain.size() >= 2"})
|
||||
private int sumEdgeWeightAlongChain(final LinkedList<SeqVertex> chain) {
|
||||
int sum = 0;
|
||||
SeqVertex prev = null;
|
||||
|
||||
for ( final SeqVertex v : chain ) {
|
||||
if ( prev != null ) {
|
||||
final BaseEdge e = getEdge(prev, v);
|
||||
if ( e == null ) throw new IllegalStateException("Something wrong with the linear chain, got a null edge between " + prev + " and " + v);
|
||||
sum += e.getMultiplicity();
|
||||
}
|
||||
prev = v;
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Base class for transformation operations that need to iterate over proposed vertices, where
|
||||
* each proposed vertex is a seed vertex for a potential transformation.
|
||||
|
|
|
|||
|
|
@ -247,12 +247,12 @@ public class SharedVertexSequenceSplitter {
|
|||
|
||||
if ( needPrefixNode ) {
|
||||
outer.addVertex(prefixV);
|
||||
if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 0));
|
||||
if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 1));
|
||||
}
|
||||
|
||||
if ( needSuffixNode ) {
|
||||
outer.addVertex(suffixV);
|
||||
if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 0));
|
||||
if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 1));
|
||||
}
|
||||
|
||||
if ( topForConnect != null ) {
|
||||
|
|
|
|||
|
|
@ -52,7 +52,6 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
|
|||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.jgrapht.alg.CycleDetector;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
|
@ -88,8 +87,7 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
|
|||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
final boolean debugGraphTransformations;
|
||||
private final boolean debugGraphTransformations;
|
||||
final byte minBaseQualityToUseInAssembly;
|
||||
|
||||
protected boolean increaseCountsBackwards = true;
|
||||
|
|
@ -319,13 +317,6 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
|
|||
removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if the graph has cycles, false otherwise
|
||||
*/
|
||||
public boolean hasCycles() {
|
||||
return new CycleDetector<>(this).detectCycles();
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the graph not have enough complexity? We define low complexity as a situation where the number
|
||||
* of non-unique kmers is more than 20% of the total number of kmers.
|
||||
|
|
@ -419,39 +410,10 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
|
|||
return counter.getKmersWithCountsAtLeast(2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert this kmer graph to a simple sequence graph.
|
||||
*
|
||||
* Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer
|
||||
* graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence
|
||||
*
|
||||
* @return a newly allocated SequenceGraph
|
||||
*/
|
||||
// TODO -- should override base class method
|
||||
@Override
|
||||
public SeqGraph convertToSequenceGraph() {
|
||||
buildGraphIfNecessary();
|
||||
|
||||
final SeqGraph seqGraph = new SeqGraph(kmerSize);
|
||||
final Map<MultiDeBruijnVertex, SeqVertex> vertexMap = new HashMap<>();
|
||||
|
||||
|
||||
// create all of the equivalent seq graph vertices
|
||||
for ( final MultiDeBruijnVertex dv : vertexSet() ) {
|
||||
final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv)));
|
||||
sv.setAdditionalInfo(dv.additionalInfo());
|
||||
vertexMap.put(dv, sv);
|
||||
seqGraph.addVertex(sv);
|
||||
}
|
||||
|
||||
// walk through the nodes and connect them to their equivalent seq vertices
|
||||
for( final MultiSampleEdge e : edgeSet() ) {
|
||||
final SeqVertex seqInV = vertexMap.get(getEdgeSource(e));
|
||||
final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e));
|
||||
//logger.info("Adding edge " + seqInV + " -> " + seqOutV);
|
||||
seqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity()));
|
||||
}
|
||||
|
||||
return seqGraph;
|
||||
return super.convertToSequenceGraph();
|
||||
}
|
||||
|
||||
private void increaseCountsInMatchedKmers(final SequenceForKmers seqForKmers,
|
||||
|
|
@ -749,15 +711,15 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
|
|||
}
|
||||
|
||||
private static String pathElementId(final String element) {
|
||||
final int parentesysPos = element.indexOf('(');
|
||||
final int openBracketPosition = element.indexOf('(');
|
||||
|
||||
if (parentesysPos == -1)
|
||||
if (openBracketPosition == -1)
|
||||
return null;
|
||||
|
||||
final int closeParentesysPos = element.lastIndexOf(')');
|
||||
if (closeParentesysPos == -1)
|
||||
final int closeBracketPosition = element.lastIndexOf(')');
|
||||
if (closeBracketPosition == -1)
|
||||
throw new IllegalArgumentException("non-closed id parantesys found in element: " + element);
|
||||
final String result = element.substring(parentesysPos + 1,closeParentesysPos).trim();
|
||||
final String result = element.substring(openBracketPosition + 1,closeBracketPosition).trim();
|
||||
if (result.isEmpty())
|
||||
throw new IllegalArgumentException("empty id found in element: " + element);
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -94,7 +94,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||
"f50e0b35e2240b19b1b8b6dfa0cf9796");
|
||||
"5ac3bfe1da1d411b52a98ef3debbd318");
|
||||
}
|
||||
|
||||
private void HCTestComplexConsensusMode(String bam, String args, String md5) {
|
||||
|
|
@ -106,7 +106,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleConsensusModeComplex() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337",
|
||||
"21e521d51b826450d348e5201684ffe4");
|
||||
"61972c7c0d378e756f3b4d99aed9d0cf");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,8 +68,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
|
|||
|
||||
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "50323a284788c8220c9226037c7003b5"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7c16aa8e35de9f418533efac3bae6551"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "7e1e193d70187774f9740d475e0f1cc1"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "96fea2caf0a40df3feb268e8b14da670"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "19efc8020f31d1b68d80c50df0629e50"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "d926d653500a970280ad7828d9ee2b84"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "83ddc16e4f0900429b2da30e582994aa"});
|
||||
|
|
|
|||
|
|
@ -227,7 +227,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestDBSNPAnnotationWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
|
||||
Arrays.asList("0998be22d7af4372247f5a0338f9446b"));
|
||||
Arrays.asList("7c3254ead383e2b9a51b242f6de2a5b2"));
|
||||
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
|
||||
}
|
||||
|
||||
|
|
@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestDBSNPAnnotationWGSGraphBased() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
|
||||
Arrays.asList("1aeed297a3cb41940d83eac499a2ce07"));
|
||||
Arrays.asList("eda8f91091fe462205d687ec49fc61e7"));
|
||||
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
|
||||
}
|
||||
|
||||
|
|
@ -276,7 +276,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestAggressivePcrIndelModelWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
|
||||
Arrays.asList("f426f4c2986e1dea8f3f55951ef8e013"));
|
||||
Arrays.asList("73c52372a1a80f052ea2b728ee17bf22"));
|
||||
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
|
||||
}
|
||||
|
||||
|
|
@ -284,7 +284,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestConservativePcrIndelModelWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
|
||||
Arrays.asList("dcb38cb9280f2c3059a09d323db1c633"));
|
||||
Arrays.asList("4e10d49b8af23d5ef3a28cb702d10a4b"));
|
||||
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
|
||||
}
|
||||
|
||||
|
|
@ -298,4 +298,25 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
spec.disableShadowBCF();
|
||||
executeTest("testGraphBasedNoSuchEdgeBugFix", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLackSensitivityDueToBadHaplotypeSelectionFix() {
|
||||
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ",
|
||||
b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list",
|
||||
HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("e2e6647f7c96e91aeead7301017dc800"));
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadLikelihoodsDueToBadHaplotypeSelectionFix() {
|
||||
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ",
|
||||
hg19RefereneWithChrPrefixInChromosomeNames, privateTestDir + "bad-likelihoods.bam", privateTestDir + "bad-likelihoods.interval_list",
|
||||
HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cbda30145523bf05e0413157f1a00b3e"));
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testBadLikelihoodsDueToBadHaplotypeSelectionFix", spec);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,7 +47,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
|
@ -61,7 +60,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
|
|||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
for ( final int nct : Arrays.asList(1, 2, 4) ) {
|
||||
tests.add(new Object[]{nct, "1f463bf3a06c401006858bc446ecea54"});
|
||||
tests.add(new Object[]{nct, "fd9324a574f9204f7308fc1af422fdcc"});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
|
|
|
|||
|
|
@ -52,10 +52,11 @@ import org.testng.annotations.DataProvider;
|
|||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
public class CommonSuffixMergerUnitTest extends BaseTest {
|
||||
private final static boolean PRINT_GRAPHS = true;
|
||||
private final static boolean PRINT_GRAPHS = false;
|
||||
|
||||
@DataProvider(name = "CompleteCycleData")
|
||||
public Object[][] makeCompleteCycleData() {
|
||||
|
|
@ -134,11 +135,35 @@ public class CommonSuffixMergerUnitTest extends BaseTest {
|
|||
return toUse.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares KBestHaplotype solutions, first by the haplotype base sequence and the by their score.
|
||||
*/
|
||||
private static final Comparator<KBestHaplotype> KBESTHAPLOTYPE_COMPARATOR = new Comparator<KBestHaplotype>() {
|
||||
|
||||
/**
|
||||
* Compares KBestHaplotype solutions, first by the haplotype base sequence and the by their score.
|
||||
*
|
||||
* @return {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public int compare(final KBestHaplotype o1,final KBestHaplotype o2) {
|
||||
final int baseCmp = o1.haplotype().getBaseString().compareTo(o2.haplotype().getBaseString());
|
||||
if (baseCmp != 0)
|
||||
return baseCmp;
|
||||
return - Double.compare(o1.score(),o2.score());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) {
|
||||
final KBestHaplotypeFinder originalKBestHaplotypes = new KBestHaplotypeFinder(original,original.getSources(),original.getSinks());
|
||||
final KBestHaplotypeFinder actualKBestHaplotypes = new KBestHaplotypeFinder(actual,actual.getSources(),actual.getSinks());
|
||||
final List<KBestHaplotype> sortedOriginalKBestHaplotypes = new ArrayList<>(originalKBestHaplotypes);
|
||||
Collections.sort(sortedOriginalKBestHaplotypes, KBESTHAPLOTYPE_COMPARATOR);
|
||||
final List<KBestHaplotype> sortedActualKBestHaplotypes = new ArrayList<>(actualKBestHaplotypes);
|
||||
Collections.sort(sortedActualKBestHaplotypes, KBESTHAPLOTYPE_COMPARATOR);
|
||||
try {
|
||||
final Set<String> haplotypes = new HashSet<String>();
|
||||
final List<KBestHaplotype> originalKBestHaplotypes = new KBestHaplotypeFinder(original,original.getSources(),original.getSinks());
|
||||
final List<KBestHaplotype> actualKBestHaplotypes = new KBestHaplotypeFinder(actual,actual.getSources(),actual.getSinks());
|
||||
|
||||
for (final KBestHaplotype kbh : originalKBestHaplotypes)
|
||||
haplotypes.add(new String(kbh.bases()));
|
||||
|
|
@ -148,14 +173,16 @@ public class CommonSuffixMergerUnitTest extends BaseTest {
|
|||
Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
|
||||
}
|
||||
|
||||
if ( actualKBestHaplotypes.size() == originalKBestHaplotypes.size() ) {
|
||||
for ( int i = 0; i < originalKBestHaplotypes.size(); i++ ) {
|
||||
Assert.assertTrue(actualKBestHaplotypes.get(i).haplotype().getBaseString().equals(originalKBestHaplotypes.get(i).haplotype().getBaseString()), "Paths not equal " + actualKBestHaplotypes.get(i).haplotype() + " vs. original " + originalKBestHaplotypes.get(i).haplotype());
|
||||
}
|
||||
}
|
||||
Assert.assertEquals(sortedActualKBestHaplotypes,sortedOriginalKBestHaplotypes);
|
||||
} catch ( AssertionError e ) {
|
||||
if ( PRINT_GRAPHS ) original.printGraph(new File(String.format("%s.original.dot", name, actual.vertexSet().size())), 0);
|
||||
if ( PRINT_GRAPHS ) actual.printGraph(new File(String.format("%s.actual.dot", name, actual.vertexSet().size())), 0);
|
||||
try {
|
||||
if ( PRINT_GRAPHS ) originalKBestHaplotypes.printDOTFile(String.format("%s.original.finder.dot",name));
|
||||
if ( PRINT_GRAPHS ) actualKBestHaplotypes.printDOTFile(String.format("%s.actual.finder.dot",name));
|
||||
} catch (IOException e2) {
|
||||
// do nothing.
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,7 +65,6 @@ import java.util.*;
|
|||
* User: rpoplin
|
||||
* Date: 1/31/13
|
||||
*/
|
||||
|
||||
public class KBestHaplotypeFinderUnitTest extends BaseTest {
|
||||
|
||||
@DataProvider(name = "BasicPathFindingData")
|
||||
|
|
@ -113,11 +112,11 @@ public class KBestHaplotypeFinderUnitTest extends BaseTest {
|
|||
final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * nEndNodes;
|
||||
Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths");
|
||||
|
||||
int lastScore = Integer.MAX_VALUE;
|
||||
double lastScore = 0;
|
||||
for ( final KBestHaplotype kbh : paths ) {
|
||||
final Path<SeqVertex,BaseEdge> path = kbh.path();
|
||||
Assert.assertTrue(path.getScore() <= lastScore, "Paths out of order. Path " + path + " has score above previous " + lastScore);
|
||||
lastScore = path.getScore();
|
||||
Assert.assertTrue(kbh.score() <= lastScore, "Paths out of order. Path " + path + " has score " + path.getScore() + " above previous " + lastScore);
|
||||
lastScore = kbh.score();
|
||||
}
|
||||
|
||||
// get the best path, and make sure it's the same as our optimal path overall
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -226,28 +227,34 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
final Set<String> haplotypes = new HashSet<>();
|
||||
final List<KBestHaplotype> originalPaths = new KBestHaplotypeFinder((SeqGraph) graph.clone(),graph.getSources(),graph.getSinks());
|
||||
final KBestHaplotypeFinder originalPaths = new KBestHaplotypeFinder((SeqGraph) graph.clone(),graph.getSources(),graph.getSinks());
|
||||
for ( final KBestHaplotype path : originalPaths )
|
||||
haplotypes.add(new String(path.bases()));
|
||||
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v);
|
||||
splitter.split();
|
||||
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".original.dot"), 0);
|
||||
if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".split.dot"), 0);
|
||||
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".original.dot"), 0);
|
||||
if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".split.dot"), 0);
|
||||
splitter.updateGraph(top, bot);
|
||||
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0);
|
||||
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".updated.dot"), 0);
|
||||
|
||||
final List<KBestHaplotype> splitPaths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks());
|
||||
final KBestHaplotypeFinder splitPaths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks());
|
||||
for ( final KBestHaplotype path : splitPaths ) {
|
||||
final String h = new String(path.bases());
|
||||
Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
|
||||
}
|
||||
|
||||
if ( splitPaths.size() == originalPaths.size() ) {
|
||||
for ( int i = 0; i < originalPaths.size(); i++ ) {
|
||||
Assert.assertTrue(splitPaths.get(i).path().equalScoreAndSequence(originalPaths.get(i).path()), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
final List<byte[]> sortedOriginalPaths = new ArrayList<>(originalPaths.size());
|
||||
for (final KBestHaplotype kbh : originalPaths.unique())
|
||||
sortedOriginalPaths.add(kbh.bases());
|
||||
Collections.sort(sortedOriginalPaths, BaseUtils.BASES_COMPARATOR);
|
||||
final List<byte[]> sortedSplitPaths = new ArrayList<>(splitPaths.size());
|
||||
for (final KBestHaplotype kbh : splitPaths.unique())
|
||||
sortedSplitPaths.add(kbh.bases());
|
||||
Collections.sort(sortedSplitPaths, BaseUtils.BASES_COMPARATOR);
|
||||
|
||||
Assert.assertEquals(sortedSplitPaths,sortedOriginalPaths,Utils.join("_", strings) + "_" + hasTop + "_" + hasBot);
|
||||
}
|
||||
|
||||
@DataProvider(name = "MeetsMinSequenceData")
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* BaseUtils contains some basic utilities for manipulating nucleotides.
|
||||
|
|
@ -589,4 +590,26 @@ public class BaseUtils {
|
|||
throw new ReviewedStingException("base must be A, C, G or T. " + (char) base + " is not a valid base.");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Lexicographical sorting of base arrays {@link Comparator}.
|
||||
*/
|
||||
public static final Comparator<byte[]> BASES_COMPARATOR = new Comparator<byte[]> (){
|
||||
|
||||
@Override
|
||||
public int compare(final byte[] o1,final byte[] o2) {
|
||||
final int minLength = Math.min(o1.length,o2.length);
|
||||
for (int i = 0; i < minLength; i++) {
|
||||
final int cmp = Byte.compare(o1[i],o2[i]);
|
||||
if (cmp != 0) return cmp;
|
||||
}
|
||||
if (o1.length == o2.length)
|
||||
return 0;
|
||||
else if (o1.length == minLength)
|
||||
return -1;
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -852,4 +852,34 @@ public class Utils {
|
|||
|
||||
return lst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares sections from to byte arrays to verify whether they contain the same values.
|
||||
*
|
||||
* @param left first array to compare.
|
||||
* @param leftOffset first position of the first array to compare.
|
||||
* @param right second array to compare.
|
||||
* @param rightOffset first position of the second array to compare.
|
||||
* @param length number of positions to compare.
|
||||
*
|
||||
* @throws IllegalArgumentException if <ul>
|
||||
* <li>either {@code left} or {@code right} is {@code null} or</li>
|
||||
* <li>any off the offset or length combine point outside any of the two arrays</li>
|
||||
* </ul>
|
||||
* @return {@code true} iff {@code length} is 0 or all the bytes in both ranges are the same two-by-two.
|
||||
*/
|
||||
public static boolean equalRange(final byte[] left, final int leftOffset, byte[] right, final int rightOffset, final int length) {
|
||||
if (left == null) throw new IllegalArgumentException("left cannot be null");
|
||||
if (right == null) throw new IllegalArgumentException("right cannot be null");
|
||||
if (length < 0) throw new IllegalArgumentException("the length cannot be negative");
|
||||
if (leftOffset < 0) throw new IllegalArgumentException("left offset cannot be negative");
|
||||
if (leftOffset + length > left.length) throw new IllegalArgumentException("length goes beyond end of left array");
|
||||
if (rightOffset < 0) throw new IllegalArgumentException("right offset cannot be negative");
|
||||
if (rightOffset + length > right.length) throw new IllegalArgumentException("length goes beyond end of right array");
|
||||
|
||||
for (int i = 0; i < length; i++)
|
||||
if (left[leftOffset + i] != right[rightOffset + i])
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ public class Haplotype extends Allele {
|
|||
private EventMap eventMap = null;
|
||||
private Cigar cigar;
|
||||
private int alignmentStartHapwrtRef;
|
||||
private double score = 0;
|
||||
private double score = Double.NaN;
|
||||
|
||||
/**
|
||||
* Main constructor
|
||||
|
|
@ -301,7 +301,7 @@ public class Haplotype extends Allele {
|
|||
* @return a double, where higher values are better
|
||||
*/
|
||||
public double getScore() {
|
||||
return this.isReference() ? Double.MAX_VALUE : score;
|
||||
return score;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -312,7 +312,7 @@ public class Haplotype extends Allele {
|
|||
* @param score a double, where higher values are better
|
||||
*/
|
||||
public void setScore(double score) {
|
||||
this.score = this.isReference() ? Double.MAX_VALUE : score;
|
||||
this.score = score;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -91,6 +91,7 @@ public abstract class BaseTest {
|
|||
//public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta";
|
||||
public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta";
|
||||
public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta";
|
||||
public static final String hg19RefereneWithChrPrefixInChromosomeNames = "/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta";
|
||||
public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/";
|
||||
public static final String validationDataLocation = GATKDataLocation + "Validation_Data/";
|
||||
public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/";
|
||||
|
|
|
|||
|
|
@ -26,9 +26,16 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Random;
|
||||
|
||||
|
||||
public class BaseUtilsUnitTest extends BaseTest {
|
||||
|
|
@ -123,4 +130,50 @@ public class BaseUtilsUnitTest extends BaseTest {
|
|||
|
||||
Assert.assertTrue(rcObs.equals(rcExp));
|
||||
}
|
||||
|
||||
@Test(dataProvider="baseComparatorData")
|
||||
public void testBaseComparator(final Collection<byte[]> basesToSort) {
|
||||
final ArrayList<byte[]> sorted = new ArrayList<>(basesToSort);
|
||||
Collections.sort(sorted, BaseUtils.BASES_COMPARATOR);
|
||||
for (int i = 0; i < sorted.size(); i++) {
|
||||
Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(i)),0);
|
||||
final String iString = new String(sorted.get(i));
|
||||
for (int j = i; j < sorted.size(); j++) {
|
||||
final String jString = new String(sorted.get(j));
|
||||
if (iString.compareTo(jString) == 0)
|
||||
Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)),0);
|
||||
else
|
||||
Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) * iString.compareTo(jString) > 0);
|
||||
Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) <= 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name="baseComparatorData")
|
||||
public Object[][] baseComparatorData() {
|
||||
final int testCount = 10;
|
||||
final int testSizeAverage = 10;
|
||||
final int testSizeDeviation = 10;
|
||||
final int haplotypeSizeAverage = 100;
|
||||
final int haplotypeSizeDeviation = 100;
|
||||
|
||||
final Object[][] result = new Object[testCount][];
|
||||
|
||||
GenomeAnalysisEngine.resetRandomGenerator();
|
||||
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
|
||||
|
||||
for (int i = 0; i < testCount; i++) {
|
||||
final int size = (int) Math.max(0,rnd.nextDouble() * testSizeDeviation + testSizeAverage);
|
||||
final ArrayList<byte[]> bases = new ArrayList<>(size);
|
||||
for (int j = 0; j < size; j++) {
|
||||
final int jSize = (int) Math.max(0,rnd.nextDouble() * haplotypeSizeDeviation + haplotypeSizeAverage);
|
||||
final byte[] b = new byte[jSize];
|
||||
for (int k = 0; k < jSize; k++)
|
||||
b[k] = BaseUtils.baseIndexToSimpleBase(rnd.nextInt(4));
|
||||
bases.add(b);
|
||||
}
|
||||
result[i] = new Object[] { bases };
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -236,4 +236,23 @@ public class UtilsUnitTest extends BaseTest {
|
|||
public void testTrim(final String s, final int frontTrim, final int backTrim) {
|
||||
Assert.assertEquals(s.length() - frontTrim - backTrim, Utils.trimArray(s.getBytes(), frontTrim, backTrim).length);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "equalRange", enabled = true)
|
||||
public void testEqualRange(final byte[] array1, final byte[] array2, final int offset1, final int offset2, final int length, final boolean expected) {
|
||||
Assert.assertEquals(Utils.equalRange(array1,offset1,array2,offset2,length),expected);
|
||||
Assert.assertTrue(Utils.equalRange(array1,offset1,array1,offset1,length));
|
||||
Assert.assertTrue(Utils.equalRange(array2,offset2,array2,offset2,length));
|
||||
|
||||
}
|
||||
|
||||
@DataProvider(name = "equalRangeData")
|
||||
public Object[][] equalRangeData() {
|
||||
return new Object[][] {
|
||||
new Object[] { new byte[0] , new byte[0], 0, 0, 0, true},
|
||||
new Object[] { "ABCF".getBytes(), "BC".getBytes(), 1,0,2, true },
|
||||
new Object[] { "ABCF".getBytes(), "".getBytes(), 1,0,0, true },
|
||||
new Object[] { "ABCF".getBytes(), "ACBF".getBytes(), 0,0, 4, false}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue