Merge pull request #559 from broadinstitute/vrr_assembly_graph_edge_info_revise
Improved criteria to select best haplotypes out from the assembly graph.
This commit is contained in:
commit
c1b4390691
|
|
@ -213,16 +213,18 @@ public abstract class LocalAssemblyEngine {
|
|||
final Map<SeqGraph,AssemblyResult> assemblyResultByGraph, final AssemblyResultSet assemblyResultSet) {
|
||||
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
|
||||
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<>();
|
||||
returnHaplotypes.add( refHaplotype );
|
||||
|
||||
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
|
||||
final ArrayList<KBestHaplotypeFinder> finders = new ArrayList<>(graphs.size());
|
||||
|
||||
for( final SeqGraph graph : graphs ) {
|
||||
final SeqVertex source = graph.getReferenceSourceVertex();
|
||||
final SeqVertex sink = graph.getReferenceSinkVertex();
|
||||
if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);
|
||||
final KBestHaplotypeFinder haplotypeFinder = new KBestHaplotypeFinder(graph,source,sink);
|
||||
finders.add(haplotypeFinder);
|
||||
final Iterator<KBestHaplotype> bestHaplotypes = haplotypeFinder.iterator(numBestHaplotypesPerGraph);
|
||||
|
||||
while (bestHaplotypes.hasNext()) {
|
||||
final KBestHaplotype kBestHaplotype = bestHaplotypes.next();
|
||||
final Haplotype h = kBestHaplotype.haplotype();
|
||||
|
|
@ -256,9 +258,19 @@ public abstract class LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
if ( returnHaplotypes.size() < returnHaplotypes.size() )
|
||||
logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
|
||||
// Make sure that the ref haplotype is amongst the return haplotypes and calculate its score as
|
||||
// the first returned by any finder.
|
||||
if (!returnHaplotypes.contains(refHaplotype)) {
|
||||
double refScore = Double.NaN;
|
||||
for (final KBestHaplotypeFinder finder : finders) {
|
||||
final double candidate = finder.score(refHaplotype);
|
||||
if (Double.isNaN(candidate)) continue;
|
||||
refScore = candidate;
|
||||
break;
|
||||
}
|
||||
refHaplotype.setScore(refScore);
|
||||
returnHaplotypes.add(refHaplotype);
|
||||
}
|
||||
|
||||
if( debug ) {
|
||||
if( returnHaplotypes.size() > 1 ) {
|
||||
|
|
|
|||
|
|
@ -45,21 +45,21 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.PriorityQueue;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* K-best sub-haplotype finder that selects the best solutions out of a collection of sub-haplotype finders.
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
||||
class AggregatedSubHaplotypeFinder<F extends KBestSubHaplotypeFinder> implements KBestSubHaplotypeFinder {
|
||||
|
||||
/**
|
||||
* Collection of subFinders that provided the actual solutions.
|
||||
*/
|
||||
private final Collection<? extends KBestSubHaplotypeFinder> subFinders;
|
||||
protected final Collection<F> subFinders;
|
||||
|
||||
/**
|
||||
* Flag indicating whether the sub-finders have been processed or not.
|
||||
|
|
@ -89,17 +89,53 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
|||
* Creates a new aggregated sub-haplotype finder given its sub-finders.
|
||||
* @param finders set of sub-finders.
|
||||
*/
|
||||
public AggregatedSubHaplotypeFinder(final Collection<? extends KBestSubHaplotypeFinder> finders) {
|
||||
public AggregatedSubHaplotypeFinder(final Collection<F> finders) {
|
||||
if (finders == null) throw new IllegalArgumentException("finder collection cannot be null");
|
||||
this.subFinders = finders;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String id() {
|
||||
final StringBuilder resultBuilder = new StringBuilder();
|
||||
for (final KBestSubHaplotypeFinder subFinder : subFinders)
|
||||
resultBuilder.append(subFinder.id());
|
||||
return resultBuilder.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String label() {
|
||||
return "<OR>";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
|
||||
final int subFinderCount = subFinders.size();
|
||||
final String edgeCost = String.format("%.2f",-Math.log10((double) subFinderCount));
|
||||
final Set<Pair<? extends KBestSubHaplotypeFinder,String>> result = new LinkedHashSet<>(subFinderCount);
|
||||
for (final KBestSubHaplotypeFinder subFinder : subFinders)
|
||||
result.add(new Pair<>(subFinder,edgeCost));
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount() {
|
||||
processSubFindersIfNeeded();
|
||||
return count;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double score(final byte[] bases, final int offset, final int length) {
|
||||
if (bases == null) throw new IllegalArgumentException("bases cannot be null");
|
||||
if (offset < 0) throw new IllegalArgumentException("the offset cannot be negative");
|
||||
if (length < 0) throw new IllegalArgumentException("the length cannot be negative");
|
||||
if (offset + length > bases.length) throw new IllegalArgumentException("the offset and length go beyond the array size");
|
||||
for (final KBestSubHaplotypeFinder subFinder : subFinders) {
|
||||
final double score = subFinder.score(bases,offset,length);
|
||||
if (!Double.isNaN(score)) return score;
|
||||
}
|
||||
return Double.NaN;
|
||||
}
|
||||
|
||||
private void processSubFindersIfNeeded() {
|
||||
if (processedSubFinders) return;
|
||||
|
||||
|
|
@ -144,6 +180,11 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
|||
return rankedSubHaplotype.get(k);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isReference() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Custom implementation of {@link KBestHaplotype} to encapsulate sub-finder results.
|
||||
*/
|
||||
|
|
@ -167,7 +208,7 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
|||
}
|
||||
|
||||
@Override
|
||||
public int score() {
|
||||
public double score() {
|
||||
return result.score();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ import com.google.java.contract.Requires;
|
|||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.jgrapht.EdgeFactory;
|
||||
import org.jgrapht.alg.CycleDetector;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -146,6 +147,39 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
return set;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert this kmer graph to a simple sequence graph.
|
||||
*
|
||||
* Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer
|
||||
* graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence
|
||||
*
|
||||
* @return a newly allocated SequenceGraph
|
||||
*/
|
||||
public SeqGraph convertToSequenceGraph() {
|
||||
|
||||
final SeqGraph seqGraph = new SeqGraph(kmerSize);
|
||||
final Map<V, SeqVertex> vertexMap = new HashMap<>();
|
||||
|
||||
|
||||
// create all of the equivalent seq graph vertices
|
||||
for ( final V dv : vertexSet() ) {
|
||||
final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv)));
|
||||
sv.setAdditionalInfo(dv.additionalInfo());
|
||||
vertexMap.put(dv, sv);
|
||||
seqGraph.addVertex(sv);
|
||||
}
|
||||
|
||||
// walk through the nodes and connect them to their equivalent seq vertices
|
||||
for( final E e : edgeSet() ) {
|
||||
final SeqVertex seqInV = vertexMap.get(getEdgeSource(e));
|
||||
final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e));
|
||||
//logger.info("Adding edge " + seqInV + " -> " + seqOutV);
|
||||
seqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity()));
|
||||
}
|
||||
|
||||
return seqGraph;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pull out the additional sequence implied by traversing this node in the graph
|
||||
* @param v the vertex from which to pull out the additional base sequence
|
||||
|
|
@ -712,4 +746,13 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
if (!containsVertex(vertex)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks for the presence of directed cycles in the graph.
|
||||
*
|
||||
* @return {@code true} if the graph has cycles, {@code false} otherwise.
|
||||
*/
|
||||
public boolean hasCycles() {
|
||||
return new CycleDetector<>(this).detectCycles();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -122,7 +122,7 @@ public class CommonSuffixSplitter {
|
|||
} else {
|
||||
incomingTarget = prefixV;
|
||||
graph.addVertex(prefixV);
|
||||
graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 0));
|
||||
graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 1));
|
||||
edgesToRemove.add(out);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -45,6 +45,11 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Represents a trivial k-best sub haplotype finder with no solutions.
|
||||
*
|
||||
|
|
@ -65,6 +70,21 @@ final class DeadEndKBestSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
|||
protected DeadEndKBestSubHaplotypeFinder() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String id() {
|
||||
return "<DEAD>";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String label() {
|
||||
return "<DEAD>";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
|
||||
return Collections.emptySet();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount() {
|
||||
return 0;
|
||||
|
|
@ -77,4 +97,18 @@ final class DeadEndKBestSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
|||
else
|
||||
throw new IllegalArgumentException("k cannot be equal or greater to the haplotype count");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isReference() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double score(final byte[] bases, final int offset, final int length) {
|
||||
if (bases == null) throw new IllegalArgumentException("bases cannot be null");
|
||||
if (offset < 0) throw new IllegalArgumentException("the offset cannot be negative");
|
||||
if (length < 0) throw new IllegalArgumentException("the length cannot be negative");
|
||||
if (offset + length > bases.length) throw new IllegalArgumentException("the offset and length go beyond the array size");
|
||||
return Double.NaN;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,6 +45,12 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Trivial k-best sub-haplotype finder where the source and sink vertex are the same one.
|
||||
*
|
||||
|
|
@ -67,6 +73,21 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder {
|
|||
singleHaplotypePath = new MyBestHaplotypePath(graph,vertex);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String id() {
|
||||
return "v" + singleHaplotypePath.head().getId();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String label() {
|
||||
return singleHaplotypePath.head().getSequenceString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
|
||||
return Collections.emptySet();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount() {
|
||||
return 1;
|
||||
|
|
@ -81,6 +102,24 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder {
|
|||
return singleHaplotypePath;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isReference() {
|
||||
return singleHaplotypePath.isReference();
|
||||
}
|
||||
|
||||
@Override
|
||||
public double score(final byte[] bases, final int offset, final int length) {
|
||||
if (bases == null) throw new IllegalArgumentException("bases cannot be null");
|
||||
if (offset < 0) throw new IllegalArgumentException("the offset cannot be negative");
|
||||
if (length < 0) throw new IllegalArgumentException("the length cannot be negative");
|
||||
if (offset + length > bases.length) throw new IllegalArgumentException("the offset and length go beyond the array size");
|
||||
final byte[] vertexBases = singleHaplotypePath.head().getSequence();
|
||||
if (length != vertexBases.length)
|
||||
return Double.NaN;
|
||||
else
|
||||
return Utils.equalRange(bases, offset, vertexBases, 0, length)? 0 : Double.NaN;
|
||||
}
|
||||
|
||||
/**
|
||||
* Custom extension of {@link KBestHaplotype} that implements the single solution behaviour.
|
||||
*/
|
||||
|
|
@ -120,7 +159,7 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder {
|
|||
}
|
||||
|
||||
@Override
|
||||
public int score() {
|
||||
public double score() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -68,7 +68,7 @@ public abstract class KBestHaplotype implements Comparable<KBestHaplotype> {
|
|||
*
|
||||
* @return 0 or greater.
|
||||
*/
|
||||
public abstract int score();
|
||||
public abstract double score();
|
||||
|
||||
/**
|
||||
* Indicates whether this result is the reference haplotype.
|
||||
|
|
@ -122,6 +122,8 @@ public abstract class KBestHaplotype implements Comparable<KBestHaplotype> {
|
|||
public Haplotype haplotype() {
|
||||
if (haplotype != null) return haplotype;
|
||||
haplotype = new Haplotype(bases(),isReference());
|
||||
if (score() > 0)
|
||||
throw new IllegalStateException("score cannot be greater than 0: " + score());
|
||||
haplotype.setScore(score());
|
||||
return haplotype;
|
||||
}
|
||||
|
|
@ -152,7 +154,35 @@ public abstract class KBestHaplotype implements Comparable<KBestHaplotype> {
|
|||
*/
|
||||
public int compareTo(final KBestHaplotype other) {
|
||||
if (other == null) throw new IllegalArgumentException("the other object cannot be null");
|
||||
return - 1 * (score() - other.score());
|
||||
return - Double.compare(score(), other.score());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return haplotype().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
return other == null ? false: (other instanceof KBestHaplotype ? equals((KBestHaplotype)other) : false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return haplotype().toString() + " Score = " + score();
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether both solutions are equal.
|
||||
* <p>
|
||||
* Both solutions are considered equal when the underlying haplotypes are equal. The path on the respective
|
||||
* graph might deffer though.
|
||||
* </p>
|
||||
*
|
||||
* @return {@code true} iff both haplotypes are the same (considering the ref state).
|
||||
*/
|
||||
protected boolean equals(final KBestHaplotype other) {
|
||||
return haplotype().equals(other.haplotype(),false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -45,8 +45,13 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.jgrapht.alg.CycleDetector;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -233,7 +238,7 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
|
|||
}
|
||||
|
||||
@Override
|
||||
public KBestHaplotype get(int index) {
|
||||
public KBestHaplotype get(final int index) {
|
||||
if (index < 0 || index >= size())
|
||||
throw new IndexOutOfBoundsException();
|
||||
return topFinder.getKBest(index);
|
||||
|
|
@ -305,28 +310,28 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
|
|||
/**
|
||||
* Creates a finder from a vertex.
|
||||
*
|
||||
* @param source the source vertex for the finder.
|
||||
* @param vertex the source vertex for the finder.
|
||||
*
|
||||
* @return never {@code null}, perhaps a finder that returns no haplotypes though.
|
||||
*/
|
||||
protected KBestSubHaplotypeFinder createVertexFinder(final SeqVertex source) {
|
||||
KBestSubHaplotypeFinder node = finderByVertex.get(source);
|
||||
if (node == null) {
|
||||
if (sinks.contains(source))
|
||||
node = new EmptyPathHaplotypeFinderNode(graph,source);
|
||||
protected KBestSubHaplotypeFinder createVertexFinder(final SeqVertex vertex) {
|
||||
KBestSubHaplotypeFinder finder = finderByVertex.get(vertex);
|
||||
if (finder == null) {
|
||||
if (sinks.contains(vertex))
|
||||
finder = new EmptyPathHaplotypeFinderNode(graph,vertex);
|
||||
else {
|
||||
final Set<BaseEdge> outgoingEdges = graph.outgoingEdgesOf(source);
|
||||
final Set<BaseEdge> outgoingEdges = graph.outgoingEdgesOf(vertex);
|
||||
if (outgoingEdges.isEmpty())
|
||||
node = DeadEndKBestSubHaplotypeFinder.INSTANCE;
|
||||
finder = DeadEndKBestSubHaplotypeFinder.INSTANCE;
|
||||
else {
|
||||
final Map<BaseEdge,KBestSubHaplotypeFinder> undeadChildren = createChildrenFinders(outgoingEdges);
|
||||
node = undeadChildren.isEmpty() ? DeadEndKBestSubHaplotypeFinder.INSTANCE :
|
||||
new RecursiveSubHaplotypeFinder(source,undeadChildren);
|
||||
finder = undeadChildren.isEmpty() ? DeadEndKBestSubHaplotypeFinder.INSTANCE :
|
||||
new RecursiveSubHaplotypeFinder(graph,vertex,undeadChildren);
|
||||
}
|
||||
}
|
||||
finderByVertex.put(source, node);
|
||||
finderByVertex.put(vertex, finder);
|
||||
}
|
||||
return node;
|
||||
return finder;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -340,7 +345,7 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
|
|||
* @return never {@code null}, perhaps an empty map if there is no children with valid paths to any sink for this
|
||||
* finder.
|
||||
*/
|
||||
private Map<BaseEdge, KBestSubHaplotypeFinder> createChildrenFinders(Set<BaseEdge> baseEdges) {
|
||||
private Map<BaseEdge, KBestSubHaplotypeFinder> createChildrenFinders(final Set<BaseEdge> baseEdges) {
|
||||
final Map<BaseEdge,KBestSubHaplotypeFinder> result = new LinkedHashMap<>(baseEdges.size());
|
||||
for (final BaseEdge edge : baseEdges) {
|
||||
final KBestSubHaplotypeFinder targetFinder = createVertexFinder(graph.getEdgeTarget(edge));
|
||||
|
|
@ -349,4 +354,156 @@ public class KBestHaplotypeFinder extends AbstractList<KBestHaplotype> implement
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print a DOT representation of search graph.
|
||||
*
|
||||
* @param out character stream printer where to print the DOT representation to.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code out} is {@code null}.
|
||||
*/
|
||||
public void printDOT(final PrintWriter out) {
|
||||
if (out == null)
|
||||
throw new IllegalArgumentException("the out writer cannot be null");
|
||||
out.println("digraph {");
|
||||
out.println(" rankdir = LR");
|
||||
out.println(" node [shape=box, margin=0.01]");
|
||||
out.println(" subgraph cluster_dummy { style = invis; x [label=\"\",shape=none,margin=0] }");
|
||||
final StringBuilder referenceCluster = new StringBuilder(1000);
|
||||
|
||||
referenceCluster.append(" subgraph cluster_ref {\n");
|
||||
referenceCluster.append(" node [penwidth=2]\n");
|
||||
for (final KBestSubHaplotypeFinder finder : finderByVertex.values() ) {
|
||||
final String id = finder.id();
|
||||
final String line = String.format(" %s [label=<%s>]",id,finder.label());
|
||||
if (finder.isReference())
|
||||
referenceCluster.append(" ").append(line).append('\n');
|
||||
else
|
||||
out.println(line);
|
||||
}
|
||||
referenceCluster.append(" }");
|
||||
out.println(referenceCluster.toString());
|
||||
|
||||
for (final KBestSubHaplotypeFinder finder : finderByVertex.values())
|
||||
for (final Pair<? extends KBestSubHaplotypeFinder,String> subFinderLabel : finder.subFinderLabels()) {
|
||||
final KBestSubHaplotypeFinder subFinder = subFinderLabel.getFirst();
|
||||
|
||||
final String edgeLabel = subFinderLabel.getSecond();
|
||||
out.println(String.format(" %s -> %s [label=%s]",finder.id(),subFinder.id(),edgeLabel));
|
||||
}
|
||||
out.println("}");
|
||||
}
|
||||
|
||||
/**
|
||||
* Print a DOT representation of search graph.
|
||||
*
|
||||
* @param file file where to print the DOT representation to.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code file} is {@code null}.
|
||||
* @throws FileNotFoundException if {@code file} cannot be created or written.
|
||||
* @throws IllegalStateException if there was some trouble when writing the DOT representation.
|
||||
*/
|
||||
public void printDOT(final File file) throws FileNotFoundException {
|
||||
if (file == null)
|
||||
throw new IllegalArgumentException("the output file cannot be null");
|
||||
final PrintWriter out = new PrintWriter(file);
|
||||
printDOT(out);
|
||||
if (out.checkError())
|
||||
throw new IllegalStateException("error occurred while writing k-best haplotype search graph into file '"
|
||||
+ file.getAbsolutePath() + "'");
|
||||
out.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Print a DOT representation of search graph.
|
||||
*
|
||||
* @param fileName name of the file where to print the DOT representation to.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code fileName} is {@code null}.
|
||||
* @throws FileNotFoundException if no file named {@code fileName} cannot be created or written.
|
||||
* @throws IllegalStateException if there was some trouble when writing the DOT representation.
|
||||
*/
|
||||
@SuppressWarnings("unused") // Available for debugging purposes.
|
||||
public void printDOTFile(final String fileName) throws FileNotFoundException {
|
||||
printDOT(new File(fileName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the score of a give sequence of bases
|
||||
*
|
||||
* @param bases the base sequence.
|
||||
*
|
||||
* @return {@link Double#NaN} if there is no score for the sequence, i.e. there is no such a haplotype accessible
|
||||
* throw this finder.
|
||||
*/
|
||||
public double score(final byte[] bases) {
|
||||
return topFinder.score(bases,0,bases.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the score of a give sequence of bases
|
||||
*
|
||||
* @param haplotype the haplotype.
|
||||
*
|
||||
* @return {@link Double#NaN} if there is no score for the sequence, i.e. there is no such a haplotype accessible
|
||||
* throw this finder.
|
||||
*/
|
||||
public double score(final Haplotype haplotype) {
|
||||
return score(haplotype.getBases());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a unique list of haplotypes solutions.
|
||||
* <p>
|
||||
* The result will not contain more than one haplotype with the same base sequence. The solution of the best
|
||||
* score is returned.
|
||||
* </p>
|
||||
* <p>
|
||||
* This makes sense when there are more than one possible path through the graph to create the same haplotype.
|
||||
* </p>
|
||||
* <p>
|
||||
* The resulting list is sorted by the score with more likely haplotype search results first.
|
||||
* </p>
|
||||
*
|
||||
* @param maxSize maximum number of unique results to return.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code maxSize} is negative.
|
||||
*
|
||||
* @return never {@code null}, perhaps an empty list.
|
||||
*/
|
||||
public List<KBestHaplotype> unique(final int maxSize) {
|
||||
if (maxSize < 0) throw new IllegalArgumentException("maxSize cannot be negative");
|
||||
final int requiredCapacity = Math.min(maxSize,size());
|
||||
final Set<Haplotype> haplotypes = new HashSet<>(requiredCapacity);
|
||||
int resultSize = 0;
|
||||
final List<KBestHaplotype> result = new ArrayList<>(requiredCapacity);
|
||||
for (final KBestHaplotype kbh : this) {
|
||||
if (haplotypes.add(kbh.haplotype())) {
|
||||
result.add(kbh);
|
||||
if (resultSize == maxSize) break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a unique list of haplotypes solutions.
|
||||
*
|
||||
* <p>
|
||||
* The result will not contain more than one haplotype with the same base sequence. The solution of the best
|
||||
* score is returned.
|
||||
* </p>
|
||||
* <p>
|
||||
* This makes sense when there are more than one possible path through the graph to create the same haplotype.
|
||||
* </p>
|
||||
* <p>
|
||||
* The resulting list is sorted by the score with more likely haplotype search results first.
|
||||
* </p>
|
||||
*
|
||||
* @return never {@code null}, perhaps an empty list.
|
||||
*/
|
||||
public List<KBestHaplotype> unique() {
|
||||
return unique(size());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,6 +45,10 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Common interface for K-Best sub-haplotype finders.
|
||||
*
|
||||
|
|
@ -52,6 +56,29 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
|||
*/
|
||||
interface KBestSubHaplotypeFinder {
|
||||
|
||||
/**
|
||||
* Return an unique id for this sub-haplotype finder to be used when outputting diagrams.
|
||||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public String id();
|
||||
|
||||
/**
|
||||
* Returns a label with human readable representation of this finder.
|
||||
*
|
||||
* <p>This is used when generating a diagram to illustrate the search space and costs</p>
|
||||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public String label();
|
||||
|
||||
/**
|
||||
* Returns the set of subfinder from this finder together with a label for the connection with the current finder.
|
||||
*
|
||||
* <p>The label is used when generating a diagram to illustrate the search space and costs</p>
|
||||
*/
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder,String>> subFinderLabels();
|
||||
|
||||
/**
|
||||
* Returns the total number of possible sub-haplotypes.
|
||||
* @return 0 or greater.
|
||||
|
|
@ -67,5 +94,22 @@ interface KBestSubHaplotypeFinder {
|
|||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public abstract KBestHaplotype getKBest(int k);
|
||||
public KBestHaplotype getKBest(int k);
|
||||
|
||||
/**
|
||||
* Checks whether the top vertex for this finder is a reference haplotype vertex.
|
||||
*
|
||||
* @return {@code true} iff the top vertex for this finder is a reference vertex.
|
||||
*/
|
||||
public boolean isReference();
|
||||
|
||||
/**
|
||||
* Calculate the score of a sequence of bases.
|
||||
*
|
||||
* @param bases array containing the query base sequence.
|
||||
* @param offset first position of the query base sequence in {@code bases} .
|
||||
* @param length length of the query base sequence.
|
||||
* @return {@link Double#NaN} if there is no score for this sequence, otherwise a valid score value.
|
||||
*/
|
||||
public double score(byte[] bases, int offset, int length);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,20 +49,24 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
|||
import java.util.PriorityQueue;
|
||||
|
||||
/**
|
||||
* edge class for connecting nodes in the graph that tracks some per-sample information
|
||||
*
|
||||
* Edge class for connecting nodes in the graph that tracks some per-sample information.
|
||||
* <p>
|
||||
* This class extends BaseEdge with the additional functionality of tracking the maximum
|
||||
* multiplicity seen within any single sample. The workflow for using this class is:
|
||||
*
|
||||
* MultiSampleEdge e = new MultiSampleEdge(ref, 1)
|
||||
* e.incMultiplicity(1) // total is 2, per sample is 2, max per sample is 1
|
||||
* e.getPruningMultiplicity() // = 1
|
||||
* e.flushSingleSampleMultiplicity() // total is 2, per sample is 0, max per sample is 2
|
||||
* e.getPruningMultiplicity() // = 2
|
||||
* e.incMultiplicity(3) // total is 5, per sample is 3, max per sample is 2
|
||||
* e.getPruningMultiplicity() // = 2
|
||||
* e.flushSingleSampleMultiplicity() // total is 5, per sample is 0, max per sample is 3
|
||||
* e.getPruningMultiplicity() // = 3
|
||||
* </p>
|
||||
* <pre>
|
||||
* {@code
|
||||
* MultiSampleEdge e = new MultiSampleEdge(ref, 1)
|
||||
* e.incMultiplicity(1) // total is 2, per sample is 2, max per sample is 1
|
||||
* e.getPruningMultiplicity() // = 1
|
||||
* e.flushSingleSampleMultiplicity() // total is 2, per sample is 0, max per sample is 2
|
||||
* e.getPruningMultiplicity() // = 2
|
||||
* e.incMultiplicity(3) // total is 5, per sample is 3, max per sample is 2
|
||||
* e.getPruningMultiplicity() // = 2
|
||||
* e.flushSingleSampleMultiplicity() // total is 5, per sample is 0, max per sample is 3
|
||||
* e.getPruningMultiplicity() // = 3
|
||||
* }
|
||||
* </pre>
|
||||
*/
|
||||
public class MultiSampleEdge extends BaseEdge {
|
||||
private int currentSingleSampleMultiplicity;
|
||||
|
|
|
|||
|
|
@ -45,9 +45,10 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* General recursive sub-haplotype finder.
|
||||
|
|
@ -67,7 +68,11 @@ import java.util.Map;
|
|||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
||||
class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder<RecursiveSubHaplotypeFinder.EdgeSubHaplotypeFinder> {
|
||||
|
||||
|
||||
private final SeqVertex vertex;
|
||||
private final boolean isReference;
|
||||
|
||||
/**
|
||||
* Creates a recursive sub-haplotype finder give the target graph, first vertex and all possible outgoing edges
|
||||
|
|
@ -80,20 +85,83 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
|||
* @param vertex first vertex for all sub-haplotype solutions provided by this finder
|
||||
* @param children map from outgoing edge to the corresponding sub-sub-haplotype finder.
|
||||
*/
|
||||
public RecursiveSubHaplotypeFinder(final SeqVertex vertex,
|
||||
public RecursiveSubHaplotypeFinder(final SeqGraph graph, final SeqVertex vertex,
|
||||
final Map<BaseEdge, KBestSubHaplotypeFinder> children) {
|
||||
super(createChildFinderCollection(vertex, children));
|
||||
this.vertex = vertex;
|
||||
this.isReference = graph.isReferenceNode(vertex);
|
||||
}
|
||||
|
||||
private static Collection<EdgeSubHaplotypeFinder> createChildFinderCollection(final SeqVertex vertex, final Map<BaseEdge,KBestSubHaplotypeFinder> finders) {
|
||||
/**
|
||||
* Wraps the descendant vertices finders in order to take advantage of the {@link AggregatedSubHaplotypeFinder}
|
||||
* common code.
|
||||
* <p>
|
||||
* Automatically calibrates the edge score (cost) so that it takes into account the total across all edges.
|
||||
* </p>
|
||||
*
|
||||
* @param vertex the parent vertex.
|
||||
* @param finders the child vertices indexed by the connecting edge.
|
||||
* @return never {@code null} but potentially an empty collection if there is child returning some sub-haplotype
|
||||
* solution.
|
||||
*/
|
||||
private static Collection<EdgeSubHaplotypeFinder> createChildFinderCollection(final SeqVertex vertex,
|
||||
final Map<BaseEdge,KBestSubHaplotypeFinder> finders) {
|
||||
if (finders == null) throw new IllegalArgumentException("the edge to child map cannot be null");
|
||||
final Collection<EdgeSubHaplotypeFinder> result = new ArrayList<>(finders.size());
|
||||
for (final Map.Entry<BaseEdge,KBestSubHaplotypeFinder> e : finders.entrySet())
|
||||
result.add(new EdgeSubHaplotypeFinder(vertex,e.getKey(), e.getValue()));
|
||||
final ArrayList<EdgeSubHaplotypeFinder> result = new ArrayList<>(finders.size());
|
||||
for (final Map.Entry<BaseEdge,KBestSubHaplotypeFinder> e : finders.entrySet()) {
|
||||
final EdgeSubHaplotypeFinder subFinder = new EdgeSubHaplotypeFinder(vertex,e.getKey(), e.getValue());
|
||||
if (subFinder.getCount() == 0) continue;
|
||||
result.add(subFinder);
|
||||
}
|
||||
if (result.size() == 0)
|
||||
return Collections.emptySet();
|
||||
else if (result.size() == 1) // no calibration needed, by default edgeScore is 0.
|
||||
return Collections.singleton(result.get(0));
|
||||
else {
|
||||
double totalEdgeMultiplicityAcrossEdges = 0;
|
||||
for (final EdgeSubHaplotypeFinder finder : result)
|
||||
totalEdgeMultiplicityAcrossEdges += Math.max(0.5,finder.edge.getMultiplicity());
|
||||
final double log10TotalEdgeMultiplicityAcrossEdges = Math.log10(totalEdgeMultiplicityAcrossEdges);
|
||||
for (final EdgeSubHaplotypeFinder finder : result)
|
||||
finder.calibrateEdgeScore(log10TotalEdgeMultiplicityAcrossEdges);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isReference() {
|
||||
return isReference;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String label() {
|
||||
return vertex.getSequenceString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
|
||||
final Set<Pair<? extends KBestSubHaplotypeFinder,String>> result = new LinkedHashSet<>(subFinders.size());
|
||||
for (final EdgeSubHaplotypeFinder subFinder : subFinders)
|
||||
result.add(new Pair<>(subFinder,simplifyZeros(String.format("%.4f", subFinder.edgeScore))));
|
||||
return result;
|
||||
}
|
||||
|
||||
private static class EdgeSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
||||
/**
|
||||
* Removes zeros decimal positions from edge-labels.
|
||||
*
|
||||
* @param edgeLabel the original label to reformat.
|
||||
* @return never {@code null}, the reformatted label.
|
||||
*/
|
||||
private String simplifyZeros(final String edgeLabel) {
|
||||
if (edgeLabel.equals("0.000") || edgeLabel.equals("-0.000") )
|
||||
return "0.";
|
||||
int i = edgeLabel.length() - 1;
|
||||
while (edgeLabel.charAt(i) == '0')
|
||||
i--;
|
||||
return (i == edgeLabel.length() - 1) ? edgeLabel : edgeLabel.substring(0,i);
|
||||
}
|
||||
|
||||
protected static class EdgeSubHaplotypeFinder implements KBestSubHaplotypeFinder {
|
||||
|
||||
private final KBestSubHaplotypeFinder childFinder;
|
||||
|
||||
|
|
@ -101,10 +169,32 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
|||
|
||||
private final BaseEdge edge;
|
||||
|
||||
private double edgeScore = 0;
|
||||
|
||||
private EdgeSubHaplotypeFinder(final SeqVertex vertex, final BaseEdge edge, final KBestSubHaplotypeFinder childFinder) {
|
||||
this.childFinder = childFinder;
|
||||
this.edge = edge;
|
||||
this.vertex = vertex;
|
||||
this.edgeScore = 0;
|
||||
}
|
||||
|
||||
private void calibrateEdgeScore(final double log10TotalMultiplicityAcrossOutgoingEdges) {
|
||||
edgeScore = Math.log10(Math.max(edge.getMultiplicity(),0.5)) - log10TotalMultiplicityAcrossOutgoingEdges;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String id() {
|
||||
return childFinder.id();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String label() {
|
||||
return childFinder.label();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Pair<? extends KBestSubHaplotypeFinder, String>> subFinderLabels() {
|
||||
return childFinder.subFinderLabels();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -114,8 +204,31 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
|||
|
||||
@Override
|
||||
public KBestHaplotype getKBest(int k) {
|
||||
return new ChildKBestSubHaplotype(vertex,edge,childFinder.getKBest(k));
|
||||
return new ChildKBestSubHaplotype(vertex,edge,childFinder.getKBest(k),edgeScore);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isReference() {
|
||||
return childFinder.isReference();
|
||||
}
|
||||
|
||||
@Override
|
||||
public double score(final byte[] bases, final int offset, final int length) {
|
||||
if (length == 0)
|
||||
return 0;
|
||||
final byte[] vertexSequence = vertex.getSequence();
|
||||
if (length < vertexSequence.length) // query is not long enough to have any score.
|
||||
return Double.NaN;
|
||||
else if (!Utils.equalRange(vertexSequence,0,bases,offset,vertexSequence.length))
|
||||
return Double.NaN;
|
||||
else
|
||||
return edgeScore + childFinder.score(bases,offset + vertexSequence.length,length - vertexSequence.length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String id() {
|
||||
return "v" + vertex.getId();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -129,13 +242,14 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
|||
*/
|
||||
private static class ChildKBestSubHaplotype extends KBestHaplotype {
|
||||
|
||||
private final int score;
|
||||
private final double score;
|
||||
private final KBestHaplotype child;
|
||||
private final SeqVertex vertex;
|
||||
private final boolean isReference;
|
||||
|
||||
public ChildKBestSubHaplotype(final SeqVertex vertex, final BaseEdge edge, final KBestHaplotype child) {
|
||||
this.score = edge.getMultiplicity() + child.score();
|
||||
|
||||
public ChildKBestSubHaplotype(final SeqVertex vertex, final BaseEdge edge, final KBestHaplotype child, final double edgeScore) {
|
||||
this.score = edgeScore + child.score();
|
||||
this.vertex = vertex;
|
||||
this.child = child;
|
||||
this.isReference = edge.isRef() && child.isReference();
|
||||
|
|
@ -147,7 +261,7 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder {
|
|||
}
|
||||
|
||||
@Override
|
||||
public int score() {
|
||||
public double score() {
|
||||
return score;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -291,16 +291,9 @@ public class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
final SeqVertex addedVertex = mergeLinearChainVertices(linearChain);
|
||||
addVertex(addedVertex);
|
||||
|
||||
final Set<BaseEdge> inEdges = incomingEdgesOf(first);
|
||||
final Set<BaseEdge> outEdges = outgoingEdgesOf(last);
|
||||
|
||||
final int nEdges = inEdges.size() + outEdges.size();
|
||||
int sharedWeightAmongEdges = nEdges == 0 ? 0 : sumEdgeWeightAlongChain(linearChain) / nEdges;
|
||||
final BaseEdge inc = new BaseEdge(false, sharedWeightAmongEdges); // template to make .add function call easy
|
||||
|
||||
// update the incoming and outgoing edges to point to the new vertex
|
||||
for( final BaseEdge edge : outEdges ) { addEdge(addedVertex, getEdgeTarget(edge), edge.copy().add(inc)); }
|
||||
for( final BaseEdge edge : inEdges ) { addEdge(getEdgeSource(edge), addedVertex, edge.copy().add(inc)); }
|
||||
for( final BaseEdge edge : outgoingEdgesOf(last) ) { addEdge(addedVertex, getEdgeTarget(edge), edge.copy()); }
|
||||
for( final BaseEdge edge : incomingEdgesOf(first) ) { addEdge(getEdgeSource(edge), addedVertex, edge.copy()); }
|
||||
|
||||
removeAllVertices(linearChain);
|
||||
return true;
|
||||
|
|
@ -313,29 +306,6 @@ public class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
return new SeqVertex( seqsCat );
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the sum of the edge weights on a linear chain of at least 2 elements
|
||||
*
|
||||
* @param chain a linear chain of vertices with at least 2 vertices
|
||||
* @return the sum of the multiplicities along all edges connecting vertices within the chain
|
||||
*/
|
||||
@Requires({"chain != null", "chain.size() >= 2"})
|
||||
private int sumEdgeWeightAlongChain(final LinkedList<SeqVertex> chain) {
|
||||
int sum = 0;
|
||||
SeqVertex prev = null;
|
||||
|
||||
for ( final SeqVertex v : chain ) {
|
||||
if ( prev != null ) {
|
||||
final BaseEdge e = getEdge(prev, v);
|
||||
if ( e == null ) throw new IllegalStateException("Something wrong with the linear chain, got a null edge between " + prev + " and " + v);
|
||||
sum += e.getMultiplicity();
|
||||
}
|
||||
prev = v;
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Base class for transformation operations that need to iterate over proposed vertices, where
|
||||
* each proposed vertex is a seed vertex for a potential transformation.
|
||||
|
|
|
|||
|
|
@ -247,12 +247,12 @@ public class SharedVertexSequenceSplitter {
|
|||
|
||||
if ( needPrefixNode ) {
|
||||
outer.addVertex(prefixV);
|
||||
if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 0));
|
||||
if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 1));
|
||||
}
|
||||
|
||||
if ( needSuffixNode ) {
|
||||
outer.addVertex(suffixV);
|
||||
if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 0));
|
||||
if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 1));
|
||||
}
|
||||
|
||||
if ( topForConnect != null ) {
|
||||
|
|
|
|||
|
|
@ -52,7 +52,6 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
|
|||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.jgrapht.alg.CycleDetector;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
|
@ -88,8 +87,7 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
|
|||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
final boolean debugGraphTransformations;
|
||||
private final boolean debugGraphTransformations;
|
||||
final byte minBaseQualityToUseInAssembly;
|
||||
|
||||
protected boolean increaseCountsBackwards = true;
|
||||
|
|
@ -319,13 +317,6 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
|
|||
removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if the graph has cycles, false otherwise
|
||||
*/
|
||||
public boolean hasCycles() {
|
||||
return new CycleDetector<>(this).detectCycles();
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the graph not have enough complexity? We define low complexity as a situation where the number
|
||||
* of non-unique kmers is more than 20% of the total number of kmers.
|
||||
|
|
@ -419,39 +410,10 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
|
|||
return counter.getKmersWithCountsAtLeast(2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert this kmer graph to a simple sequence graph.
|
||||
*
|
||||
* Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer
|
||||
* graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence
|
||||
*
|
||||
* @return a newly allocated SequenceGraph
|
||||
*/
|
||||
// TODO -- should override base class method
|
||||
@Override
|
||||
public SeqGraph convertToSequenceGraph() {
|
||||
buildGraphIfNecessary();
|
||||
|
||||
final SeqGraph seqGraph = new SeqGraph(kmerSize);
|
||||
final Map<MultiDeBruijnVertex, SeqVertex> vertexMap = new HashMap<>();
|
||||
|
||||
|
||||
// create all of the equivalent seq graph vertices
|
||||
for ( final MultiDeBruijnVertex dv : vertexSet() ) {
|
||||
final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv)));
|
||||
sv.setAdditionalInfo(dv.additionalInfo());
|
||||
vertexMap.put(dv, sv);
|
||||
seqGraph.addVertex(sv);
|
||||
}
|
||||
|
||||
// walk through the nodes and connect them to their equivalent seq vertices
|
||||
for( final MultiSampleEdge e : edgeSet() ) {
|
||||
final SeqVertex seqInV = vertexMap.get(getEdgeSource(e));
|
||||
final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e));
|
||||
//logger.info("Adding edge " + seqInV + " -> " + seqOutV);
|
||||
seqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity()));
|
||||
}
|
||||
|
||||
return seqGraph;
|
||||
return super.convertToSequenceGraph();
|
||||
}
|
||||
|
||||
private void increaseCountsInMatchedKmers(final SequenceForKmers seqForKmers,
|
||||
|
|
@ -749,15 +711,15 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme
|
|||
}
|
||||
|
||||
private static String pathElementId(final String element) {
|
||||
final int parentesysPos = element.indexOf('(');
|
||||
final int openBracketPosition = element.indexOf('(');
|
||||
|
||||
if (parentesysPos == -1)
|
||||
if (openBracketPosition == -1)
|
||||
return null;
|
||||
|
||||
final int closeParentesysPos = element.lastIndexOf(')');
|
||||
if (closeParentesysPos == -1)
|
||||
final int closeBracketPosition = element.lastIndexOf(')');
|
||||
if (closeBracketPosition == -1)
|
||||
throw new IllegalArgumentException("non-closed id parantesys found in element: " + element);
|
||||
final String result = element.substring(parentesysPos + 1,closeParentesysPos).trim();
|
||||
final String result = element.substring(openBracketPosition + 1,closeBracketPosition).trim();
|
||||
if (result.isEmpty())
|
||||
throw new IllegalArgumentException("empty id found in element: " + element);
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -94,7 +94,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||
"f50e0b35e2240b19b1b8b6dfa0cf9796");
|
||||
"5ac3bfe1da1d411b52a98ef3debbd318");
|
||||
}
|
||||
|
||||
private void HCTestComplexConsensusMode(String bam, String args, String md5) {
|
||||
|
|
@ -106,7 +106,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleConsensusModeComplex() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337",
|
||||
"21e521d51b826450d348e5201684ffe4");
|
||||
"61972c7c0d378e756f3b4d99aed9d0cf");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,8 +68,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
|
|||
|
||||
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "50323a284788c8220c9226037c7003b5"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7c16aa8e35de9f418533efac3bae6551"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "7e1e193d70187774f9740d475e0f1cc1"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "96fea2caf0a40df3feb268e8b14da670"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "19efc8020f31d1b68d80c50df0629e50"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "d926d653500a970280ad7828d9ee2b84"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "83ddc16e4f0900429b2da30e582994aa"});
|
||||
|
|
|
|||
|
|
@ -227,7 +227,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestDBSNPAnnotationWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
|
||||
Arrays.asList("0998be22d7af4372247f5a0338f9446b"));
|
||||
Arrays.asList("7c3254ead383e2b9a51b242f6de2a5b2"));
|
||||
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
|
||||
}
|
||||
|
||||
|
|
@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestDBSNPAnnotationWGSGraphBased() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
|
||||
Arrays.asList("1aeed297a3cb41940d83eac499a2ce07"));
|
||||
Arrays.asList("eda8f91091fe462205d687ec49fc61e7"));
|
||||
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
|
||||
}
|
||||
|
||||
|
|
@ -276,7 +276,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestAggressivePcrIndelModelWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
|
||||
Arrays.asList("f426f4c2986e1dea8f3f55951ef8e013"));
|
||||
Arrays.asList("73c52372a1a80f052ea2b728ee17bf22"));
|
||||
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
|
||||
}
|
||||
|
||||
|
|
@ -284,7 +284,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestConservativePcrIndelModelWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
|
||||
Arrays.asList("dcb38cb9280f2c3059a09d323db1c633"));
|
||||
Arrays.asList("4e10d49b8af23d5ef3a28cb702d10a4b"));
|
||||
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
|
||||
}
|
||||
|
||||
|
|
@ -298,4 +298,25 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
spec.disableShadowBCF();
|
||||
executeTest("testGraphBasedNoSuchEdgeBugFix", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLackSensitivityDueToBadHaplotypeSelectionFix() {
|
||||
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ",
|
||||
b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list",
|
||||
HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("e2e6647f7c96e91aeead7301017dc800"));
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadLikelihoodsDueToBadHaplotypeSelectionFix() {
|
||||
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ",
|
||||
hg19RefereneWithChrPrefixInChromosomeNames, privateTestDir + "bad-likelihoods.bam", privateTestDir + "bad-likelihoods.interval_list",
|
||||
HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cbda30145523bf05e0413157f1a00b3e"));
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testBadLikelihoodsDueToBadHaplotypeSelectionFix", spec);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,7 +47,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
|
@ -61,7 +60,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
|
|||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
for ( final int nct : Arrays.asList(1, 2, 4) ) {
|
||||
tests.add(new Object[]{nct, "1f463bf3a06c401006858bc446ecea54"});
|
||||
tests.add(new Object[]{nct, "fd9324a574f9204f7308fc1af422fdcc"});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
|
|
|
|||
|
|
@ -52,10 +52,11 @@ import org.testng.annotations.DataProvider;
|
|||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
public class CommonSuffixMergerUnitTest extends BaseTest {
|
||||
private final static boolean PRINT_GRAPHS = true;
|
||||
private final static boolean PRINT_GRAPHS = false;
|
||||
|
||||
@DataProvider(name = "CompleteCycleData")
|
||||
public Object[][] makeCompleteCycleData() {
|
||||
|
|
@ -134,11 +135,35 @@ public class CommonSuffixMergerUnitTest extends BaseTest {
|
|||
return toUse.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares KBestHaplotype solutions, first by the haplotype base sequence and the by their score.
|
||||
*/
|
||||
private static final Comparator<KBestHaplotype> KBESTHAPLOTYPE_COMPARATOR = new Comparator<KBestHaplotype>() {
|
||||
|
||||
/**
|
||||
* Compares KBestHaplotype solutions, first by the haplotype base sequence and the by their score.
|
||||
*
|
||||
* @return {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public int compare(final KBestHaplotype o1,final KBestHaplotype o2) {
|
||||
final int baseCmp = o1.haplotype().getBaseString().compareTo(o2.haplotype().getBaseString());
|
||||
if (baseCmp != 0)
|
||||
return baseCmp;
|
||||
return - Double.compare(o1.score(),o2.score());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) {
|
||||
final KBestHaplotypeFinder originalKBestHaplotypes = new KBestHaplotypeFinder(original,original.getSources(),original.getSinks());
|
||||
final KBestHaplotypeFinder actualKBestHaplotypes = new KBestHaplotypeFinder(actual,actual.getSources(),actual.getSinks());
|
||||
final List<KBestHaplotype> sortedOriginalKBestHaplotypes = new ArrayList<>(originalKBestHaplotypes);
|
||||
Collections.sort(sortedOriginalKBestHaplotypes, KBESTHAPLOTYPE_COMPARATOR);
|
||||
final List<KBestHaplotype> sortedActualKBestHaplotypes = new ArrayList<>(actualKBestHaplotypes);
|
||||
Collections.sort(sortedActualKBestHaplotypes, KBESTHAPLOTYPE_COMPARATOR);
|
||||
try {
|
||||
final Set<String> haplotypes = new HashSet<String>();
|
||||
final List<KBestHaplotype> originalKBestHaplotypes = new KBestHaplotypeFinder(original,original.getSources(),original.getSinks());
|
||||
final List<KBestHaplotype> actualKBestHaplotypes = new KBestHaplotypeFinder(actual,actual.getSources(),actual.getSinks());
|
||||
|
||||
for (final KBestHaplotype kbh : originalKBestHaplotypes)
|
||||
haplotypes.add(new String(kbh.bases()));
|
||||
|
|
@ -148,14 +173,16 @@ public class CommonSuffixMergerUnitTest extends BaseTest {
|
|||
Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
|
||||
}
|
||||
|
||||
if ( actualKBestHaplotypes.size() == originalKBestHaplotypes.size() ) {
|
||||
for ( int i = 0; i < originalKBestHaplotypes.size(); i++ ) {
|
||||
Assert.assertTrue(actualKBestHaplotypes.get(i).haplotype().getBaseString().equals(originalKBestHaplotypes.get(i).haplotype().getBaseString()), "Paths not equal " + actualKBestHaplotypes.get(i).haplotype() + " vs. original " + originalKBestHaplotypes.get(i).haplotype());
|
||||
}
|
||||
}
|
||||
Assert.assertEquals(sortedActualKBestHaplotypes,sortedOriginalKBestHaplotypes);
|
||||
} catch ( AssertionError e ) {
|
||||
if ( PRINT_GRAPHS ) original.printGraph(new File(String.format("%s.original.dot", name, actual.vertexSet().size())), 0);
|
||||
if ( PRINT_GRAPHS ) actual.printGraph(new File(String.format("%s.actual.dot", name, actual.vertexSet().size())), 0);
|
||||
try {
|
||||
if ( PRINT_GRAPHS ) originalKBestHaplotypes.printDOTFile(String.format("%s.original.finder.dot",name));
|
||||
if ( PRINT_GRAPHS ) actualKBestHaplotypes.printDOTFile(String.format("%s.actual.finder.dot",name));
|
||||
} catch (IOException e2) {
|
||||
// do nothing.
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,7 +65,6 @@ import java.util.*;
|
|||
* User: rpoplin
|
||||
* Date: 1/31/13
|
||||
*/
|
||||
|
||||
public class KBestHaplotypeFinderUnitTest extends BaseTest {
|
||||
|
||||
@DataProvider(name = "BasicPathFindingData")
|
||||
|
|
@ -113,11 +112,11 @@ public class KBestHaplotypeFinderUnitTest extends BaseTest {
|
|||
final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * nEndNodes;
|
||||
Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths");
|
||||
|
||||
int lastScore = Integer.MAX_VALUE;
|
||||
double lastScore = 0;
|
||||
for ( final KBestHaplotype kbh : paths ) {
|
||||
final Path<SeqVertex,BaseEdge> path = kbh.path();
|
||||
Assert.assertTrue(path.getScore() <= lastScore, "Paths out of order. Path " + path + " has score above previous " + lastScore);
|
||||
lastScore = path.getScore();
|
||||
Assert.assertTrue(kbh.score() <= lastScore, "Paths out of order. Path " + path + " has score " + path.getScore() + " above previous " + lastScore);
|
||||
lastScore = kbh.score();
|
||||
}
|
||||
|
||||
// get the best path, and make sure it's the same as our optimal path overall
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -226,28 +227,34 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
final Set<String> haplotypes = new HashSet<>();
|
||||
final List<KBestHaplotype> originalPaths = new KBestHaplotypeFinder((SeqGraph) graph.clone(),graph.getSources(),graph.getSinks());
|
||||
final KBestHaplotypeFinder originalPaths = new KBestHaplotypeFinder((SeqGraph) graph.clone(),graph.getSources(),graph.getSinks());
|
||||
for ( final KBestHaplotype path : originalPaths )
|
||||
haplotypes.add(new String(path.bases()));
|
||||
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v);
|
||||
splitter.split();
|
||||
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".original.dot"), 0);
|
||||
if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".split.dot"), 0);
|
||||
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".original.dot"), 0);
|
||||
if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".split.dot"), 0);
|
||||
splitter.updateGraph(top, bot);
|
||||
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0);
|
||||
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".updated.dot"), 0);
|
||||
|
||||
final List<KBestHaplotype> splitPaths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks());
|
||||
final KBestHaplotypeFinder splitPaths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks());
|
||||
for ( final KBestHaplotype path : splitPaths ) {
|
||||
final String h = new String(path.bases());
|
||||
Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
|
||||
}
|
||||
|
||||
if ( splitPaths.size() == originalPaths.size() ) {
|
||||
for ( int i = 0; i < originalPaths.size(); i++ ) {
|
||||
Assert.assertTrue(splitPaths.get(i).path().equalScoreAndSequence(originalPaths.get(i).path()), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
final List<byte[]> sortedOriginalPaths = new ArrayList<>(originalPaths.size());
|
||||
for (final KBestHaplotype kbh : originalPaths.unique())
|
||||
sortedOriginalPaths.add(kbh.bases());
|
||||
Collections.sort(sortedOriginalPaths, BaseUtils.BASES_COMPARATOR);
|
||||
final List<byte[]> sortedSplitPaths = new ArrayList<>(splitPaths.size());
|
||||
for (final KBestHaplotype kbh : splitPaths.unique())
|
||||
sortedSplitPaths.add(kbh.bases());
|
||||
Collections.sort(sortedSplitPaths, BaseUtils.BASES_COMPARATOR);
|
||||
|
||||
Assert.assertEquals(sortedSplitPaths,sortedOriginalPaths,Utils.join("_", strings) + "_" + hasTop + "_" + hasBot);
|
||||
}
|
||||
|
||||
@DataProvider(name = "MeetsMinSequenceData")
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* BaseUtils contains some basic utilities for manipulating nucleotides.
|
||||
|
|
@ -589,4 +590,26 @@ public class BaseUtils {
|
|||
throw new ReviewedStingException("base must be A, C, G or T. " + (char) base + " is not a valid base.");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Lexicographical sorting of base arrays {@link Comparator}.
|
||||
*/
|
||||
public static final Comparator<byte[]> BASES_COMPARATOR = new Comparator<byte[]> (){
|
||||
|
||||
@Override
|
||||
public int compare(final byte[] o1,final byte[] o2) {
|
||||
final int minLength = Math.min(o1.length,o2.length);
|
||||
for (int i = 0; i < minLength; i++) {
|
||||
final int cmp = Byte.compare(o1[i],o2[i]);
|
||||
if (cmp != 0) return cmp;
|
||||
}
|
||||
if (o1.length == o2.length)
|
||||
return 0;
|
||||
else if (o1.length == minLength)
|
||||
return -1;
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -852,4 +852,34 @@ public class Utils {
|
|||
|
||||
return lst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares sections from to byte arrays to verify whether they contain the same values.
|
||||
*
|
||||
* @param left first array to compare.
|
||||
* @param leftOffset first position of the first array to compare.
|
||||
* @param right second array to compare.
|
||||
* @param rightOffset first position of the second array to compare.
|
||||
* @param length number of positions to compare.
|
||||
*
|
||||
* @throws IllegalArgumentException if <ul>
|
||||
* <li>either {@code left} or {@code right} is {@code null} or</li>
|
||||
* <li>any off the offset or length combine point outside any of the two arrays</li>
|
||||
* </ul>
|
||||
* @return {@code true} iff {@code length} is 0 or all the bytes in both ranges are the same two-by-two.
|
||||
*/
|
||||
public static boolean equalRange(final byte[] left, final int leftOffset, byte[] right, final int rightOffset, final int length) {
|
||||
if (left == null) throw new IllegalArgumentException("left cannot be null");
|
||||
if (right == null) throw new IllegalArgumentException("right cannot be null");
|
||||
if (length < 0) throw new IllegalArgumentException("the length cannot be negative");
|
||||
if (leftOffset < 0) throw new IllegalArgumentException("left offset cannot be negative");
|
||||
if (leftOffset + length > left.length) throw new IllegalArgumentException("length goes beyond end of left array");
|
||||
if (rightOffset < 0) throw new IllegalArgumentException("right offset cannot be negative");
|
||||
if (rightOffset + length > right.length) throw new IllegalArgumentException("length goes beyond end of right array");
|
||||
|
||||
for (int i = 0; i < length; i++)
|
||||
if (left[leftOffset + i] != right[rightOffset + i])
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ public class Haplotype extends Allele {
|
|||
private EventMap eventMap = null;
|
||||
private Cigar cigar;
|
||||
private int alignmentStartHapwrtRef;
|
||||
private double score = 0;
|
||||
private double score = Double.NaN;
|
||||
|
||||
/**
|
||||
* Main constructor
|
||||
|
|
@ -301,7 +301,7 @@ public class Haplotype extends Allele {
|
|||
* @return a double, where higher values are better
|
||||
*/
|
||||
public double getScore() {
|
||||
return this.isReference() ? Double.MAX_VALUE : score;
|
||||
return score;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -312,7 +312,7 @@ public class Haplotype extends Allele {
|
|||
* @param score a double, where higher values are better
|
||||
*/
|
||||
public void setScore(double score) {
|
||||
this.score = this.isReference() ? Double.MAX_VALUE : score;
|
||||
this.score = score;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -91,6 +91,7 @@ public abstract class BaseTest {
|
|||
//public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta";
|
||||
public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta";
|
||||
public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta";
|
||||
public static final String hg19RefereneWithChrPrefixInChromosomeNames = "/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta";
|
||||
public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/";
|
||||
public static final String validationDataLocation = GATKDataLocation + "Validation_Data/";
|
||||
public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/";
|
||||
|
|
|
|||
|
|
@ -26,9 +26,16 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Random;
|
||||
|
||||
|
||||
public class BaseUtilsUnitTest extends BaseTest {
|
||||
|
|
@ -123,4 +130,50 @@ public class BaseUtilsUnitTest extends BaseTest {
|
|||
|
||||
Assert.assertTrue(rcObs.equals(rcExp));
|
||||
}
|
||||
|
||||
@Test(dataProvider="baseComparatorData")
|
||||
public void testBaseComparator(final Collection<byte[]> basesToSort) {
|
||||
final ArrayList<byte[]> sorted = new ArrayList<>(basesToSort);
|
||||
Collections.sort(sorted, BaseUtils.BASES_COMPARATOR);
|
||||
for (int i = 0; i < sorted.size(); i++) {
|
||||
Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(i)),0);
|
||||
final String iString = new String(sorted.get(i));
|
||||
for (int j = i; j < sorted.size(); j++) {
|
||||
final String jString = new String(sorted.get(j));
|
||||
if (iString.compareTo(jString) == 0)
|
||||
Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)),0);
|
||||
else
|
||||
Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) * iString.compareTo(jString) > 0);
|
||||
Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) <= 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name="baseComparatorData")
|
||||
public Object[][] baseComparatorData() {
|
||||
final int testCount = 10;
|
||||
final int testSizeAverage = 10;
|
||||
final int testSizeDeviation = 10;
|
||||
final int haplotypeSizeAverage = 100;
|
||||
final int haplotypeSizeDeviation = 100;
|
||||
|
||||
final Object[][] result = new Object[testCount][];
|
||||
|
||||
GenomeAnalysisEngine.resetRandomGenerator();
|
||||
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
|
||||
|
||||
for (int i = 0; i < testCount; i++) {
|
||||
final int size = (int) Math.max(0,rnd.nextDouble() * testSizeDeviation + testSizeAverage);
|
||||
final ArrayList<byte[]> bases = new ArrayList<>(size);
|
||||
for (int j = 0; j < size; j++) {
|
||||
final int jSize = (int) Math.max(0,rnd.nextDouble() * haplotypeSizeDeviation + haplotypeSizeAverage);
|
||||
final byte[] b = new byte[jSize];
|
||||
for (int k = 0; k < jSize; k++)
|
||||
b[k] = BaseUtils.baseIndexToSimpleBase(rnd.nextInt(4));
|
||||
bases.add(b);
|
||||
}
|
||||
result[i] = new Object[] { bases };
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -236,4 +236,23 @@ public class UtilsUnitTest extends BaseTest {
|
|||
public void testTrim(final String s, final int frontTrim, final int backTrim) {
|
||||
Assert.assertEquals(s.length() - frontTrim - backTrim, Utils.trimArray(s.getBytes(), frontTrim, backTrim).length);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "equalRange", enabled = true)
|
||||
public void testEqualRange(final byte[] array1, final byte[] array2, final int offset1, final int offset2, final int length, final boolean expected) {
|
||||
Assert.assertEquals(Utils.equalRange(array1,offset1,array2,offset2,length),expected);
|
||||
Assert.assertTrue(Utils.equalRange(array1,offset1,array1,offset1,length));
|
||||
Assert.assertTrue(Utils.equalRange(array2,offset2,array2,offset2,length));
|
||||
|
||||
}
|
||||
|
||||
@DataProvider(name = "equalRangeData")
|
||||
public Object[][] equalRangeData() {
|
||||
return new Object[][] {
|
||||
new Object[] { new byte[0] , new byte[0], 0, 0, 0, true},
|
||||
new Object[] { "ABCF".getBytes(), "BC".getBytes(), 1,0,2, true },
|
||||
new Object[] { "ABCF".getBytes(), "".getBytes(), 1,0,0, true },
|
||||
new Object[] { "ABCF".getBytes(), "ACBF".getBytes(), 0,0, 4, false}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue