diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index 8dfeed987..b53445933 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -213,16 +213,18 @@ public abstract class LocalAssemblyEngine { final Map assemblyResultByGraph, final AssemblyResultSet assemblyResultSet) { // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes final Set returnHaplotypes = new LinkedHashSet<>(); - returnHaplotypes.add( refHaplotype ); final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); + final ArrayList finders = new ArrayList<>(graphs.size()); for( final SeqGraph graph : graphs ) { final SeqVertex source = graph.getReferenceSourceVertex(); final SeqVertex sink = graph.getReferenceSinkVertex(); if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph); final KBestHaplotypeFinder haplotypeFinder = new KBestHaplotypeFinder(graph,source,sink); + finders.add(haplotypeFinder); final Iterator bestHaplotypes = haplotypeFinder.iterator(numBestHaplotypesPerGraph); + while (bestHaplotypes.hasNext()) { final KBestHaplotype kBestHaplotype = bestHaplotypes.next(); final Haplotype h = kBestHaplotype.haplotype(); @@ -256,9 +258,19 @@ public abstract class LocalAssemblyEngine { } } - - if ( returnHaplotypes.size() < returnHaplotypes.size() ) - logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); + // Make sure that the ref haplotype is amongst the return haplotypes and calculate its score as + // the first returned by any finder. + if (!returnHaplotypes.contains(refHaplotype)) { + double refScore = Double.NaN; + for (final KBestHaplotypeFinder finder : finders) { + final double candidate = finder.score(refHaplotype); + if (Double.isNaN(candidate)) continue; + refScore = candidate; + break; + } + refHaplotype.setScore(refScore); + returnHaplotypes.add(refHaplotype); + } if( debug ) { if( returnHaplotypes.size() > 1 ) { diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/AggregatedSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/AggregatedSubHaplotypeFinder.java index 8fba6c9d5..d981e6eeb 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/AggregatedSubHaplotypeFinder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/AggregatedSubHaplotypeFinder.java @@ -45,21 +45,21 @@ */ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; -import java.util.ArrayList; -import java.util.Collection; -import java.util.PriorityQueue; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.*; /** * K-best sub-haplotype finder that selects the best solutions out of a collection of sub-haplotype finders. * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ -class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder { +class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder { /** * Collection of subFinders that provided the actual solutions. */ - private final Collection subFinders; + protected final Collection subFinders; /** * Flag indicating whether the sub-finders have been processed or not. @@ -89,17 +89,53 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder { * Creates a new aggregated sub-haplotype finder given its sub-finders. * @param finders set of sub-finders. */ - public AggregatedSubHaplotypeFinder(final Collection finders) { + public AggregatedSubHaplotypeFinder(final Collection finders) { if (finders == null) throw new IllegalArgumentException("finder collection cannot be null"); this.subFinders = finders; } + @Override + public String id() { + final StringBuilder resultBuilder = new StringBuilder(); + for (final KBestSubHaplotypeFinder subFinder : subFinders) + resultBuilder.append(subFinder.id()); + return resultBuilder.toString(); + } + + @Override + public String label() { + return "<OR>"; + } + + @Override + public Set> subFinderLabels() { + final int subFinderCount = subFinders.size(); + final String edgeCost = String.format("%.2f",-Math.log10((double) subFinderCount)); + final Set> result = new LinkedHashSet<>(subFinderCount); + for (final KBestSubHaplotypeFinder subFinder : subFinders) + result.add(new Pair<>(subFinder,edgeCost)); + return result; + } + @Override public int getCount() { processSubFindersIfNeeded(); return count; } + @Override + public double score(final byte[] bases, final int offset, final int length) { + if (bases == null) throw new IllegalArgumentException("bases cannot be null"); + if (offset < 0) throw new IllegalArgumentException("the offset cannot be negative"); + if (length < 0) throw new IllegalArgumentException("the length cannot be negative"); + if (offset + length > bases.length) throw new IllegalArgumentException("the offset and length go beyond the array size"); + for (final KBestSubHaplotypeFinder subFinder : subFinders) { + final double score = subFinder.score(bases,offset,length); + if (!Double.isNaN(score)) return score; + } + return Double.NaN; + } + private void processSubFindersIfNeeded() { if (processedSubFinders) return; @@ -144,6 +180,11 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder { return rankedSubHaplotype.get(k); } + @Override + public boolean isReference() { + return false; + } + /** * Custom implementation of {@link KBestHaplotype} to encapsulate sub-finder results. */ @@ -167,7 +208,7 @@ class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder { } @Override - public int score() { + public double score() { return result.score(); } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index 36216bdd2..3abd2e4bc 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -52,6 +52,7 @@ import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.jgrapht.EdgeFactory; +import org.jgrapht.alg.CycleDetector; import org.jgrapht.graph.DefaultDirectedGraph; import java.io.File; @@ -146,6 +147,39 @@ public class BaseGraph extends Default return set; } + /** + * Convert this kmer graph to a simple sequence graph. + * + * Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer + * graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence + * + * @return a newly allocated SequenceGraph + */ + public SeqGraph convertToSequenceGraph() { + + final SeqGraph seqGraph = new SeqGraph(kmerSize); + final Map vertexMap = new HashMap<>(); + + + // create all of the equivalent seq graph vertices + for ( final V dv : vertexSet() ) { + final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv))); + sv.setAdditionalInfo(dv.additionalInfo()); + vertexMap.put(dv, sv); + seqGraph.addVertex(sv); + } + + // walk through the nodes and connect them to their equivalent seq vertices + for( final E e : edgeSet() ) { + final SeqVertex seqInV = vertexMap.get(getEdgeSource(e)); + final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e)); + //logger.info("Adding edge " + seqInV + " -> " + seqOutV); + seqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity())); + } + + return seqGraph; + } + /** * Pull out the additional sequence implied by traversing this node in the graph * @param v the vertex from which to pull out the additional base sequence @@ -712,4 +746,13 @@ public class BaseGraph extends Default if (!containsVertex(vertex)) return false; return true; } + + /** + * Checks for the presence of directed cycles in the graph. + * + * @return {@code true} if the graph has cycles, {@code false} otherwise. + */ + public boolean hasCycles() { + return new CycleDetector<>(this).detectCycles(); + } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java index 69b42cee6..71ce9929b 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java @@ -122,7 +122,7 @@ public class CommonSuffixSplitter { } else { incomingTarget = prefixV; graph.addVertex(prefixV); - graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 0)); + graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 1)); edgesToRemove.add(out); } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java index ae270ed7b..5ceaa29c5 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java @@ -45,6 +45,11 @@ */ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.Collections; +import java.util.Set; + /** * Represents a trivial k-best sub haplotype finder with no solutions. * @@ -65,6 +70,21 @@ final class DeadEndKBestSubHaplotypeFinder implements KBestSubHaplotypeFinder { protected DeadEndKBestSubHaplotypeFinder() { } + @Override + public String id() { + return ""; + } + + @Override + public String label() { + return "<DEAD>"; + } + + @Override + public Set> subFinderLabels() { + return Collections.emptySet(); + } + @Override public int getCount() { return 0; @@ -77,4 +97,18 @@ final class DeadEndKBestSubHaplotypeFinder implements KBestSubHaplotypeFinder { else throw new IllegalArgumentException("k cannot be equal or greater to the haplotype count"); } + + @Override + public boolean isReference() { + return false; + } + + @Override + public double score(final byte[] bases, final int offset, final int length) { + if (bases == null) throw new IllegalArgumentException("bases cannot be null"); + if (offset < 0) throw new IllegalArgumentException("the offset cannot be negative"); + if (length < 0) throw new IllegalArgumentException("the length cannot be negative"); + if (offset + length > bases.length) throw new IllegalArgumentException("the offset and length go beyond the array size"); + return Double.NaN; + } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java index 0e50ec02b..1a642d200 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java @@ -45,6 +45,12 @@ */ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.Collections; +import java.util.Set; + /** * Trivial k-best sub-haplotype finder where the source and sink vertex are the same one. * @@ -67,6 +73,21 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder { singleHaplotypePath = new MyBestHaplotypePath(graph,vertex); } + @Override + public String id() { + return "v" + singleHaplotypePath.head().getId(); + } + + @Override + public String label() { + return singleHaplotypePath.head().getSequenceString(); + } + + @Override + public Set> subFinderLabels() { + return Collections.emptySet(); + } + @Override public int getCount() { return 1; @@ -81,6 +102,24 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder { return singleHaplotypePath; } + @Override + public boolean isReference() { + return singleHaplotypePath.isReference(); + } + + @Override + public double score(final byte[] bases, final int offset, final int length) { + if (bases == null) throw new IllegalArgumentException("bases cannot be null"); + if (offset < 0) throw new IllegalArgumentException("the offset cannot be negative"); + if (length < 0) throw new IllegalArgumentException("the length cannot be negative"); + if (offset + length > bases.length) throw new IllegalArgumentException("the offset and length go beyond the array size"); + final byte[] vertexBases = singleHaplotypePath.head().getSequence(); + if (length != vertexBases.length) + return Double.NaN; + else + return Utils.equalRange(bases, offset, vertexBases, 0, length)? 0 : Double.NaN; + } + /** * Custom extension of {@link KBestHaplotype} that implements the single solution behaviour. */ @@ -120,7 +159,7 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder { } @Override - public int score() { + public double score() { return 0; } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java index ca22f17ec..d1b5bd614 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java @@ -68,7 +68,7 @@ public abstract class KBestHaplotype implements Comparable { * * @return 0 or greater. */ - public abstract int score(); + public abstract double score(); /** * Indicates whether this result is the reference haplotype. @@ -122,6 +122,8 @@ public abstract class KBestHaplotype implements Comparable { public Haplotype haplotype() { if (haplotype != null) return haplotype; haplotype = new Haplotype(bases(),isReference()); + if (score() > 0) + throw new IllegalStateException("score cannot be greater than 0: " + score()); haplotype.setScore(score()); return haplotype; } @@ -152,7 +154,35 @@ public abstract class KBestHaplotype implements Comparable { */ public int compareTo(final KBestHaplotype other) { if (other == null) throw new IllegalArgumentException("the other object cannot be null"); - return - 1 * (score() - other.score()); + return - Double.compare(score(), other.score()); + } + + @Override + public int hashCode() { + return haplotype().hashCode(); + } + + @Override + public boolean equals(final Object other) { + return other == null ? false: (other instanceof KBestHaplotype ? equals((KBestHaplotype)other) : false); + } + + @Override + public String toString() { + return haplotype().toString() + " Score = " + score(); + } + + /** + * Checks whether both solutions are equal. + *

+ * Both solutions are considered equal when the underlying haplotypes are equal. The path on the respective + * graph might deffer though. + *

+ * + * @return {@code true} iff both haplotypes are the same (considering the ref state). + */ + protected boolean equals(final KBestHaplotype other) { + return haplotype().equals(other.haplotype(),false); } /** diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java index f27cca12c..5e971792c 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java @@ -45,8 +45,13 @@ */ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.jgrapht.alg.CycleDetector; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintWriter; import java.util.*; /** @@ -233,7 +238,7 @@ public class KBestHaplotypeFinder extends AbstractList implement } @Override - public KBestHaplotype get(int index) { + public KBestHaplotype get(final int index) { if (index < 0 || index >= size()) throw new IndexOutOfBoundsException(); return topFinder.getKBest(index); @@ -305,28 +310,28 @@ public class KBestHaplotypeFinder extends AbstractList implement /** * Creates a finder from a vertex. * - * @param source the source vertex for the finder. + * @param vertex the source vertex for the finder. * * @return never {@code null}, perhaps a finder that returns no haplotypes though. */ - protected KBestSubHaplotypeFinder createVertexFinder(final SeqVertex source) { - KBestSubHaplotypeFinder node = finderByVertex.get(source); - if (node == null) { - if (sinks.contains(source)) - node = new EmptyPathHaplotypeFinderNode(graph,source); + protected KBestSubHaplotypeFinder createVertexFinder(final SeqVertex vertex) { + KBestSubHaplotypeFinder finder = finderByVertex.get(vertex); + if (finder == null) { + if (sinks.contains(vertex)) + finder = new EmptyPathHaplotypeFinderNode(graph,vertex); else { - final Set outgoingEdges = graph.outgoingEdgesOf(source); + final Set outgoingEdges = graph.outgoingEdgesOf(vertex); if (outgoingEdges.isEmpty()) - node = DeadEndKBestSubHaplotypeFinder.INSTANCE; + finder = DeadEndKBestSubHaplotypeFinder.INSTANCE; else { final Map undeadChildren = createChildrenFinders(outgoingEdges); - node = undeadChildren.isEmpty() ? DeadEndKBestSubHaplotypeFinder.INSTANCE : - new RecursiveSubHaplotypeFinder(source,undeadChildren); + finder = undeadChildren.isEmpty() ? DeadEndKBestSubHaplotypeFinder.INSTANCE : + new RecursiveSubHaplotypeFinder(graph,vertex,undeadChildren); } } - finderByVertex.put(source, node); + finderByVertex.put(vertex, finder); } - return node; + return finder; } /** @@ -340,7 +345,7 @@ public class KBestHaplotypeFinder extends AbstractList implement * @return never {@code null}, perhaps an empty map if there is no children with valid paths to any sink for this * finder. */ - private Map createChildrenFinders(Set baseEdges) { + private Map createChildrenFinders(final Set baseEdges) { final Map result = new LinkedHashMap<>(baseEdges.size()); for (final BaseEdge edge : baseEdges) { final KBestSubHaplotypeFinder targetFinder = createVertexFinder(graph.getEdgeTarget(edge)); @@ -349,4 +354,156 @@ public class KBestHaplotypeFinder extends AbstractList implement } return result; } + + /** + * Print a DOT representation of search graph. + * + * @param out character stream printer where to print the DOT representation to. + * + * @throws IllegalArgumentException if {@code out} is {@code null}. + */ + public void printDOT(final PrintWriter out) { + if (out == null) + throw new IllegalArgumentException("the out writer cannot be null"); + out.println("digraph {"); + out.println(" rankdir = LR"); + out.println(" node [shape=box, margin=0.01]"); + out.println(" subgraph cluster_dummy { style = invis; x [label=\"\",shape=none,margin=0] }"); + final StringBuilder referenceCluster = new StringBuilder(1000); + + referenceCluster.append(" subgraph cluster_ref {\n"); + referenceCluster.append(" node [penwidth=2]\n"); + for (final KBestSubHaplotypeFinder finder : finderByVertex.values() ) { + final String id = finder.id(); + final String line = String.format(" %s [label=<%s>]",id,finder.label()); + if (finder.isReference()) + referenceCluster.append(" ").append(line).append('\n'); + else + out.println(line); + } + referenceCluster.append(" }"); + out.println(referenceCluster.toString()); + + for (final KBestSubHaplotypeFinder finder : finderByVertex.values()) + for (final Pair subFinderLabel : finder.subFinderLabels()) { + final KBestSubHaplotypeFinder subFinder = subFinderLabel.getFirst(); + + final String edgeLabel = subFinderLabel.getSecond(); + out.println(String.format(" %s -> %s [label=%s]",finder.id(),subFinder.id(),edgeLabel)); + } + out.println("}"); + } + + /** + * Print a DOT representation of search graph. + * + * @param file file where to print the DOT representation to. + * + * @throws IllegalArgumentException if {@code file} is {@code null}. + * @throws FileNotFoundException if {@code file} cannot be created or written. + * @throws IllegalStateException if there was some trouble when writing the DOT representation. + */ + public void printDOT(final File file) throws FileNotFoundException { + if (file == null) + throw new IllegalArgumentException("the output file cannot be null"); + final PrintWriter out = new PrintWriter(file); + printDOT(out); + if (out.checkError()) + throw new IllegalStateException("error occurred while writing k-best haplotype search graph into file '" + + file.getAbsolutePath() + "'"); + out.close(); + } + + /** + * Print a DOT representation of search graph. + * + * @param fileName name of the file where to print the DOT representation to. + * + * @throws IllegalArgumentException if {@code fileName} is {@code null}. + * @throws FileNotFoundException if no file named {@code fileName} cannot be created or written. + * @throws IllegalStateException if there was some trouble when writing the DOT representation. + */ + @SuppressWarnings("unused") // Available for debugging purposes. + public void printDOTFile(final String fileName) throws FileNotFoundException { + printDOT(new File(fileName)); + } + + /** + * Get the score of a give sequence of bases + * + * @param bases the base sequence. + * + * @return {@link Double#NaN} if there is no score for the sequence, i.e. there is no such a haplotype accessible + * throw this finder. + */ + public double score(final byte[] bases) { + return topFinder.score(bases,0,bases.length); + } + + /** + * Get the score of a give sequence of bases + * + * @param haplotype the haplotype. + * + * @return {@link Double#NaN} if there is no score for the sequence, i.e. there is no such a haplotype accessible + * throw this finder. + */ + public double score(final Haplotype haplotype) { + return score(haplotype.getBases()); + } + + + /** + * Returns a unique list of haplotypes solutions. + *

+ * The result will not contain more than one haplotype with the same base sequence. The solution of the best + * score is returned. + *

+ *

+ * This makes sense when there are more than one possible path through the graph to create the same haplotype. + *

+ *

+ * The resulting list is sorted by the score with more likely haplotype search results first. + *

+ * + * @param maxSize maximum number of unique results to return. + * + * @throws IllegalArgumentException if {@code maxSize} is negative. + * + * @return never {@code null}, perhaps an empty list. + */ + public List unique(final int maxSize) { + if (maxSize < 0) throw new IllegalArgumentException("maxSize cannot be negative"); + final int requiredCapacity = Math.min(maxSize,size()); + final Set haplotypes = new HashSet<>(requiredCapacity); + int resultSize = 0; + final List result = new ArrayList<>(requiredCapacity); + for (final KBestHaplotype kbh : this) { + if (haplotypes.add(kbh.haplotype())) { + result.add(kbh); + if (resultSize == maxSize) break; + } + } + return result; + } + + /** + * Returns a unique list of haplotypes solutions. + * + *

+ * The result will not contain more than one haplotype with the same base sequence. The solution of the best + * score is returned. + *

+ *

+ * This makes sense when there are more than one possible path through the graph to create the same haplotype. + *

+ *

+ * The resulting list is sorted by the score with more likely haplotype search results first. + *

+ * + * @return never {@code null}, perhaps an empty list. + */ + public List unique() { + return unique(size()); + } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java index 9c185b52c..eb3360500 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java @@ -45,6 +45,10 @@ */ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.Set; + /** * Common interface for K-Best sub-haplotype finders. * @@ -52,6 +56,29 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; */ interface KBestSubHaplotypeFinder { + /** + * Return an unique id for this sub-haplotype finder to be used when outputting diagrams. + * + * @return never {@code null}. + */ + public String id(); + + /** + * Returns a label with human readable representation of this finder. + * + *

This is used when generating a diagram to illustrate the search space and costs

+ * + * @return never {@code null}. + */ + public String label(); + + /** + * Returns the set of subfinder from this finder together with a label for the connection with the current finder. + * + *

The label is used when generating a diagram to illustrate the search space and costs

+ */ + public Set> subFinderLabels(); + /** * Returns the total number of possible sub-haplotypes. * @return 0 or greater. @@ -67,5 +94,22 @@ interface KBestSubHaplotypeFinder { * * @return never {@code null}. */ - public abstract KBestHaplotype getKBest(int k); + public KBestHaplotype getKBest(int k); + + /** + * Checks whether the top vertex for this finder is a reference haplotype vertex. + * + * @return {@code true} iff the top vertex for this finder is a reference vertex. + */ + public boolean isReference(); + + /** + * Calculate the score of a sequence of bases. + * + * @param bases array containing the query base sequence. + * @param offset first position of the query base sequence in {@code bases} . + * @param length length of the query base sequence. + * @return {@link Double#NaN} if there is no score for this sequence, otherwise a valid score value. + */ + public double score(byte[] bases, int offset, int length); } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java index 978d83eb4..657ecfd85 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java @@ -49,20 +49,24 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import java.util.PriorityQueue; /** - * edge class for connecting nodes in the graph that tracks some per-sample information - * + * Edge class for connecting nodes in the graph that tracks some per-sample information. + *

* This class extends BaseEdge with the additional functionality of tracking the maximum * multiplicity seen within any single sample. The workflow for using this class is: - * - * MultiSampleEdge e = new MultiSampleEdge(ref, 1) - * e.incMultiplicity(1) // total is 2, per sample is 2, max per sample is 1 - * e.getPruningMultiplicity() // = 1 - * e.flushSingleSampleMultiplicity() // total is 2, per sample is 0, max per sample is 2 - * e.getPruningMultiplicity() // = 2 - * e.incMultiplicity(3) // total is 5, per sample is 3, max per sample is 2 - * e.getPruningMultiplicity() // = 2 - * e.flushSingleSampleMultiplicity() // total is 5, per sample is 0, max per sample is 3 - * e.getPruningMultiplicity() // = 3 + *

+ *
+ * {@code
+ *      MultiSampleEdge e = new MultiSampleEdge(ref, 1)
+ *      e.incMultiplicity(1)              // total is 2, per sample is 2, max per sample is 1
+ *      e.getPruningMultiplicity()        // = 1
+ *      e.flushSingleSampleMultiplicity() // total is 2, per sample is 0, max per sample is 2
+ *      e.getPruningMultiplicity()        // = 2
+ *      e.incMultiplicity(3)              // total is 5, per sample is 3, max per sample is 2
+ *      e.getPruningMultiplicity()        // = 2
+ *      e.flushSingleSampleMultiplicity() // total is 5, per sample is 0, max per sample is 3
+ *      e.getPruningMultiplicity()        // = 3
+ * }
+ * 
*/ public class MultiSampleEdge extends BaseEdge { private int currentSingleSampleMultiplicity; diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java index 0fbbfdc64..fe85befef 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java @@ -45,9 +45,10 @@ */ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Map; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.*; /** * General recursive sub-haplotype finder. @@ -67,7 +68,11 @@ import java.util.Map; * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ -class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder { +class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder { + + + private final SeqVertex vertex; + private final boolean isReference; /** * Creates a recursive sub-haplotype finder give the target graph, first vertex and all possible outgoing edges @@ -80,20 +85,83 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder { * @param vertex first vertex for all sub-haplotype solutions provided by this finder * @param children map from outgoing edge to the corresponding sub-sub-haplotype finder. */ - public RecursiveSubHaplotypeFinder(final SeqVertex vertex, + public RecursiveSubHaplotypeFinder(final SeqGraph graph, final SeqVertex vertex, final Map children) { super(createChildFinderCollection(vertex, children)); + this.vertex = vertex; + this.isReference = graph.isReferenceNode(vertex); } - private static Collection createChildFinderCollection(final SeqVertex vertex, final Map finders) { + /** + * Wraps the descendant vertices finders in order to take advantage of the {@link AggregatedSubHaplotypeFinder} + * common code. + *

+ * Automatically calibrates the edge score (cost) so that it takes into account the total across all edges. + *

+ * + * @param vertex the parent vertex. + * @param finders the child vertices indexed by the connecting edge. + * @return never {@code null} but potentially an empty collection if there is child returning some sub-haplotype + * solution. + */ + private static Collection createChildFinderCollection(final SeqVertex vertex, + final Map finders) { if (finders == null) throw new IllegalArgumentException("the edge to child map cannot be null"); - final Collection result = new ArrayList<>(finders.size()); - for (final Map.Entry e : finders.entrySet()) - result.add(new EdgeSubHaplotypeFinder(vertex,e.getKey(), e.getValue())); + final ArrayList result = new ArrayList<>(finders.size()); + for (final Map.Entry e : finders.entrySet()) { + final EdgeSubHaplotypeFinder subFinder = new EdgeSubHaplotypeFinder(vertex,e.getKey(), e.getValue()); + if (subFinder.getCount() == 0) continue; + result.add(subFinder); + } + if (result.size() == 0) + return Collections.emptySet(); + else if (result.size() == 1) // no calibration needed, by default edgeScore is 0. + return Collections.singleton(result.get(0)); + else { + double totalEdgeMultiplicityAcrossEdges = 0; + for (final EdgeSubHaplotypeFinder finder : result) + totalEdgeMultiplicityAcrossEdges += Math.max(0.5,finder.edge.getMultiplicity()); + final double log10TotalEdgeMultiplicityAcrossEdges = Math.log10(totalEdgeMultiplicityAcrossEdges); + for (final EdgeSubHaplotypeFinder finder : result) + finder.calibrateEdgeScore(log10TotalEdgeMultiplicityAcrossEdges); + return result; + } + } + + @Override + public boolean isReference() { + return isReference; + } + + @Override + public String label() { + return vertex.getSequenceString(); + } + + @Override + public Set> subFinderLabels() { + final Set> result = new LinkedHashSet<>(subFinders.size()); + for (final EdgeSubHaplotypeFinder subFinder : subFinders) + result.add(new Pair<>(subFinder,simplifyZeros(String.format("%.4f", subFinder.edgeScore)))); return result; } - private static class EdgeSubHaplotypeFinder implements KBestSubHaplotypeFinder { + /** + * Removes zeros decimal positions from edge-labels. + * + * @param edgeLabel the original label to reformat. + * @return never {@code null}, the reformatted label. + */ + private String simplifyZeros(final String edgeLabel) { + if (edgeLabel.equals("0.000") || edgeLabel.equals("-0.000") ) + return "0."; + int i = edgeLabel.length() - 1; + while (edgeLabel.charAt(i) == '0') + i--; + return (i == edgeLabel.length() - 1) ? edgeLabel : edgeLabel.substring(0,i); + } + + protected static class EdgeSubHaplotypeFinder implements KBestSubHaplotypeFinder { private final KBestSubHaplotypeFinder childFinder; @@ -101,10 +169,32 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder { private final BaseEdge edge; + private double edgeScore = 0; + private EdgeSubHaplotypeFinder(final SeqVertex vertex, final BaseEdge edge, final KBestSubHaplotypeFinder childFinder) { this.childFinder = childFinder; this.edge = edge; this.vertex = vertex; + this.edgeScore = 0; + } + + private void calibrateEdgeScore(final double log10TotalMultiplicityAcrossOutgoingEdges) { + edgeScore = Math.log10(Math.max(edge.getMultiplicity(),0.5)) - log10TotalMultiplicityAcrossOutgoingEdges; + } + + @Override + public String id() { + return childFinder.id(); + } + + @Override + public String label() { + return childFinder.label(); + } + + @Override + public Set> subFinderLabels() { + return childFinder.subFinderLabels(); } @Override @@ -114,8 +204,31 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder { @Override public KBestHaplotype getKBest(int k) { - return new ChildKBestSubHaplotype(vertex,edge,childFinder.getKBest(k)); + return new ChildKBestSubHaplotype(vertex,edge,childFinder.getKBest(k),edgeScore); } + + @Override + public boolean isReference() { + return childFinder.isReference(); + } + + @Override + public double score(final byte[] bases, final int offset, final int length) { + if (length == 0) + return 0; + final byte[] vertexSequence = vertex.getSequence(); + if (length < vertexSequence.length) // query is not long enough to have any score. + return Double.NaN; + else if (!Utils.equalRange(vertexSequence,0,bases,offset,vertexSequence.length)) + return Double.NaN; + else + return edgeScore + childFinder.score(bases,offset + vertexSequence.length,length - vertexSequence.length); + } + } + + @Override + public String id() { + return "v" + vertex.getId(); } /** @@ -129,13 +242,14 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder { */ private static class ChildKBestSubHaplotype extends KBestHaplotype { - private final int score; + private final double score; private final KBestHaplotype child; private final SeqVertex vertex; private final boolean isReference; - public ChildKBestSubHaplotype(final SeqVertex vertex, final BaseEdge edge, final KBestHaplotype child) { - this.score = edge.getMultiplicity() + child.score(); + + public ChildKBestSubHaplotype(final SeqVertex vertex, final BaseEdge edge, final KBestHaplotype child, final double edgeScore) { + this.score = edgeScore + child.score(); this.vertex = vertex; this.child = child; this.isReference = edge.isRef() && child.isReference(); @@ -147,7 +261,7 @@ class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder { } @Override - public int score() { + public double score() { return score; } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index c8c6abb86..087c07be7 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -291,16 +291,9 @@ public class SeqGraph extends BaseGraph { final SeqVertex addedVertex = mergeLinearChainVertices(linearChain); addVertex(addedVertex); - final Set inEdges = incomingEdgesOf(first); - final Set outEdges = outgoingEdgesOf(last); - - final int nEdges = inEdges.size() + outEdges.size(); - int sharedWeightAmongEdges = nEdges == 0 ? 0 : sumEdgeWeightAlongChain(linearChain) / nEdges; - final BaseEdge inc = new BaseEdge(false, sharedWeightAmongEdges); // template to make .add function call easy - // update the incoming and outgoing edges to point to the new vertex - for( final BaseEdge edge : outEdges ) { addEdge(addedVertex, getEdgeTarget(edge), edge.copy().add(inc)); } - for( final BaseEdge edge : inEdges ) { addEdge(getEdgeSource(edge), addedVertex, edge.copy().add(inc)); } + for( final BaseEdge edge : outgoingEdgesOf(last) ) { addEdge(addedVertex, getEdgeTarget(edge), edge.copy()); } + for( final BaseEdge edge : incomingEdgesOf(first) ) { addEdge(getEdgeSource(edge), addedVertex, edge.copy()); } removeAllVertices(linearChain); return true; @@ -313,29 +306,6 @@ public class SeqGraph extends BaseGraph { return new SeqVertex( seqsCat ); } - /** - * Get the sum of the edge weights on a linear chain of at least 2 elements - * - * @param chain a linear chain of vertices with at least 2 vertices - * @return the sum of the multiplicities along all edges connecting vertices within the chain - */ - @Requires({"chain != null", "chain.size() >= 2"}) - private int sumEdgeWeightAlongChain(final LinkedList chain) { - int sum = 0; - SeqVertex prev = null; - - for ( final SeqVertex v : chain ) { - if ( prev != null ) { - final BaseEdge e = getEdge(prev, v); - if ( e == null ) throw new IllegalStateException("Something wrong with the linear chain, got a null edge between " + prev + " and " + v); - sum += e.getMultiplicity(); - } - prev = v; - } - - return sum; - } - /** * Base class for transformation operations that need to iterate over proposed vertices, where * each proposed vertex is a seed vertex for a potential transformation. diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java index 284062749..401cbf18c 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java @@ -247,12 +247,12 @@ public class SharedVertexSequenceSplitter { if ( needPrefixNode ) { outer.addVertex(prefixV); - if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 0)); + if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 1)); } if ( needSuffixNode ) { outer.addVertex(suffixV); - if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 0)); + if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 1)); } if ( topForConnect != null ) { diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java index a7989ac2c..e03e26e0a 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -52,7 +52,6 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.jgrapht.alg.CycleDetector; import java.io.File; import java.util.*; @@ -88,8 +87,7 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme /** * */ - - final boolean debugGraphTransformations; + private final boolean debugGraphTransformations; final byte minBaseQualityToUseInAssembly; protected boolean increaseCountsBackwards = true; @@ -319,13 +317,6 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme removeAllVertices(verticesToRemove); } - /** - * @return true if the graph has cycles, false otherwise - */ - public boolean hasCycles() { - return new CycleDetector<>(this).detectCycles(); - } - /** * Does the graph not have enough complexity? We define low complexity as a situation where the number * of non-unique kmers is more than 20% of the total number of kmers. @@ -419,39 +410,10 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme return counter.getKmersWithCountsAtLeast(2); } - /** - * Convert this kmer graph to a simple sequence graph. - * - * Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer - * graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence - * - * @return a newly allocated SequenceGraph - */ - // TODO -- should override base class method + @Override public SeqGraph convertToSequenceGraph() { buildGraphIfNecessary(); - - final SeqGraph seqGraph = new SeqGraph(kmerSize); - final Map vertexMap = new HashMap<>(); - - - // create all of the equivalent seq graph vertices - for ( final MultiDeBruijnVertex dv : vertexSet() ) { - final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv))); - sv.setAdditionalInfo(dv.additionalInfo()); - vertexMap.put(dv, sv); - seqGraph.addVertex(sv); - } - - // walk through the nodes and connect them to their equivalent seq vertices - for( final MultiSampleEdge e : edgeSet() ) { - final SeqVertex seqInV = vertexMap.get(getEdgeSource(e)); - final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e)); - //logger.info("Adding edge " + seqInV + " -> " + seqOutV); - seqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity())); - } - - return seqGraph; + return super.convertToSequenceGraph(); } private void increaseCountsInMatchedKmers(final SequenceForKmers seqForKmers, @@ -749,15 +711,15 @@ public class ReadThreadingGraph extends DanglingChainMergingGraph implements Kme } private static String pathElementId(final String element) { - final int parentesysPos = element.indexOf('('); + final int openBracketPosition = element.indexOf('('); - if (parentesysPos == -1) + if (openBracketPosition == -1) return null; - final int closeParentesysPos = element.lastIndexOf(')'); - if (closeParentesysPos == -1) + final int closeBracketPosition = element.lastIndexOf(')'); + if (closeBracketPosition == -1) throw new IllegalArgumentException("non-closed id parantesys found in element: " + element); - final String result = element.substring(parentesysPos + 1,closeParentesysPos).trim(); + final String result = element.substring(openBracketPosition + 1,closeBracketPosition).trim(); if (result.isEmpty()) throw new IllegalArgumentException("empty id found in element: " + element); return result; diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index ac0f5671c..24e47879d 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -94,7 +94,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "f50e0b35e2240b19b1b8b6dfa0cf9796"); + "5ac3bfe1da1d411b52a98ef3debbd318"); } private void HCTestComplexConsensusMode(String bam, String args, String md5) { @@ -106,7 +106,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleConsensusModeComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337", - "21e521d51b826450d348e5201684ffe4"); + "61972c7c0d378e756f3b4d99aed9d0cf"); } } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 8ca67f31d..bde1e7ef5 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -68,8 +68,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "50323a284788c8220c9226037c7003b5"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7c16aa8e35de9f418533efac3bae6551"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "7e1e193d70187774f9740d475e0f1cc1"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "96fea2caf0a40df3feb268e8b14da670"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "19efc8020f31d1b68d80c50df0629e50"}); tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"}); tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "d926d653500a970280ad7828d9ee2b84"}); tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "83ddc16e4f0900429b2da30e582994aa"}); diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 386fc3800..8db4d5066 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -227,7 +227,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("0998be22d7af4372247f5a0338f9446b")); + Arrays.asList("7c3254ead383e2b9a51b242f6de2a5b2")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGSGraphBased() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("1aeed297a3cb41940d83eac499a2ce07")); + Arrays.asList("eda8f91091fe462205d687ec49fc61e7")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -276,7 +276,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestAggressivePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("f426f4c2986e1dea8f3f55951ef8e013")); + Arrays.asList("73c52372a1a80f052ea2b728ee17bf22")); executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); } @@ -284,7 +284,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestConservativePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("dcb38cb9280f2c3059a09d323db1c633")); + Arrays.asList("4e10d49b8af23d5ef3a28cb702d10a4b")); executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); } @@ -298,4 +298,25 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { spec.disableShadowBCF(); executeTest("testGraphBasedNoSuchEdgeBugFix", spec); } + + @Test + public void testLackSensitivityDueToBadHaplotypeSelectionFix() { + final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ", + b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list", + HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("e2e6647f7c96e91aeead7301017dc800")); + spec.disableShadowBCF(); + executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec); + } + + @Test + public void testBadLikelihoodsDueToBadHaplotypeSelectionFix() { + final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ", + hg19RefereneWithChrPrefixInChromosomeNames, privateTestDir + "bad-likelihoods.bam", privateTestDir + "bad-likelihoods.interval_list", + HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cbda30145523bf05e0413157f1a00b3e")); + spec.disableShadowBCF(); + executeTest("testBadLikelihoodsDueToBadHaplotypeSelectionFix", spec); + } + } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index 23513f314..b801f05a9 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -61,7 +60,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { List tests = new ArrayList<>(); for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "1f463bf3a06c401006858bc446ecea54"}); + tests.add(new Object[]{nct, "fd9324a574f9204f7308fc1af422fdcc"}); } return tests.toArray(new Object[][]{}); diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java index 0ddf7544d..da1474db1 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java @@ -52,10 +52,11 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import java.io.IOException; import java.util.*; public class CommonSuffixMergerUnitTest extends BaseTest { - private final static boolean PRINT_GRAPHS = true; + private final static boolean PRINT_GRAPHS = false; @DataProvider(name = "CompleteCycleData") public Object[][] makeCompleteCycleData() { @@ -134,11 +135,35 @@ public class CommonSuffixMergerUnitTest extends BaseTest { return toUse.toArray(new Object[][]{}); } + /** + * Compares KBestHaplotype solutions, first by the haplotype base sequence and the by their score. + */ + private static final Comparator KBESTHAPLOTYPE_COMPARATOR = new Comparator() { + + /** + * Compares KBestHaplotype solutions, first by the haplotype base sequence and the by their score. + * + * @return {@inheritDoc} + */ + @Override + public int compare(final KBestHaplotype o1,final KBestHaplotype o2) { + final int baseCmp = o1.haplotype().getBaseString().compareTo(o2.haplotype().getBaseString()); + if (baseCmp != 0) + return baseCmp; + return - Double.compare(o1.score(),o2.score()); + } + }; + + public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) { + final KBestHaplotypeFinder originalKBestHaplotypes = new KBestHaplotypeFinder(original,original.getSources(),original.getSinks()); + final KBestHaplotypeFinder actualKBestHaplotypes = new KBestHaplotypeFinder(actual,actual.getSources(),actual.getSinks()); + final List sortedOriginalKBestHaplotypes = new ArrayList<>(originalKBestHaplotypes); + Collections.sort(sortedOriginalKBestHaplotypes, KBESTHAPLOTYPE_COMPARATOR); + final List sortedActualKBestHaplotypes = new ArrayList<>(actualKBestHaplotypes); + Collections.sort(sortedActualKBestHaplotypes, KBESTHAPLOTYPE_COMPARATOR); try { final Set haplotypes = new HashSet(); - final List originalKBestHaplotypes = new KBestHaplotypeFinder(original,original.getSources(),original.getSinks()); - final List actualKBestHaplotypes = new KBestHaplotypeFinder(actual,actual.getSources(),actual.getSinks()); for (final KBestHaplotype kbh : originalKBestHaplotypes) haplotypes.add(new String(kbh.bases())); @@ -148,14 +173,16 @@ public class CommonSuffixMergerUnitTest extends BaseTest { Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); } - if ( actualKBestHaplotypes.size() == originalKBestHaplotypes.size() ) { - for ( int i = 0; i < originalKBestHaplotypes.size(); i++ ) { - Assert.assertTrue(actualKBestHaplotypes.get(i).haplotype().getBaseString().equals(originalKBestHaplotypes.get(i).haplotype().getBaseString()), "Paths not equal " + actualKBestHaplotypes.get(i).haplotype() + " vs. original " + originalKBestHaplotypes.get(i).haplotype()); - } - } + Assert.assertEquals(sortedActualKBestHaplotypes,sortedOriginalKBestHaplotypes); } catch ( AssertionError e ) { if ( PRINT_GRAPHS ) original.printGraph(new File(String.format("%s.original.dot", name, actual.vertexSet().size())), 0); if ( PRINT_GRAPHS ) actual.printGraph(new File(String.format("%s.actual.dot", name, actual.vertexSet().size())), 0); + try { + if ( PRINT_GRAPHS ) originalKBestHaplotypes.printDOTFile(String.format("%s.original.finder.dot",name)); + if ( PRINT_GRAPHS ) actualKBestHaplotypes.printDOTFile(String.format("%s.actual.finder.dot",name)); + } catch (IOException e2) { + // do nothing. + } throw e; } } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java index 6dc3d5d67..26c511b6e 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java @@ -65,7 +65,6 @@ import java.util.*; * User: rpoplin * Date: 1/31/13 */ - public class KBestHaplotypeFinderUnitTest extends BaseTest { @DataProvider(name = "BasicPathFindingData") @@ -113,11 +112,11 @@ public class KBestHaplotypeFinderUnitTest extends BaseTest { final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * nEndNodes; Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths"); - int lastScore = Integer.MAX_VALUE; + double lastScore = 0; for ( final KBestHaplotype kbh : paths ) { final Path path = kbh.path(); - Assert.assertTrue(path.getScore() <= lastScore, "Paths out of order. Path " + path + " has score above previous " + lastScore); - lastScore = path.getScore(); + Assert.assertTrue(kbh.score() <= lastScore, "Paths out of order. Path " + path + " has score " + path.getScore() + " above previous " + lastScore); + lastScore = kbh.score(); } // get the best path, and make sure it's the same as our optimal path overall diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java index 2f44129d8..eb0432769 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.testng.Assert; @@ -226,28 +227,34 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { } final Set haplotypes = new HashSet<>(); - final List originalPaths = new KBestHaplotypeFinder((SeqGraph) graph.clone(),graph.getSources(),graph.getSinks()); + final KBestHaplotypeFinder originalPaths = new KBestHaplotypeFinder((SeqGraph) graph.clone(),graph.getSources(),graph.getSinks()); for ( final KBestHaplotype path : originalPaths ) haplotypes.add(new String(path.bases())); final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); splitter.split(); - if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".original.dot"), 0); - if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".split.dot"), 0); + if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".original.dot"), 0); + if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".split.dot"), 0); splitter.updateGraph(top, bot); - if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0); + if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + "_" + hasTop + "_" + hasBot + ".updated.dot"), 0); - final List splitPaths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks()); + final KBestHaplotypeFinder splitPaths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks()); for ( final KBestHaplotype path : splitPaths ) { final String h = new String(path.bases()); Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); } - if ( splitPaths.size() == originalPaths.size() ) { - for ( int i = 0; i < originalPaths.size(); i++ ) { - Assert.assertTrue(splitPaths.get(i).path().equalScoreAndSequence(originalPaths.get(i).path()), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i)); - } - } + + final List sortedOriginalPaths = new ArrayList<>(originalPaths.size()); + for (final KBestHaplotype kbh : originalPaths.unique()) + sortedOriginalPaths.add(kbh.bases()); + Collections.sort(sortedOriginalPaths, BaseUtils.BASES_COMPARATOR); + final List sortedSplitPaths = new ArrayList<>(splitPaths.size()); + for (final KBestHaplotype kbh : splitPaths.unique()) + sortedSplitPaths.add(kbh.bases()); + Collections.sort(sortedSplitPaths, BaseUtils.BASES_COMPARATOR); + + Assert.assertEquals(sortedSplitPaths,sortedOriginalPaths,Utils.join("_", strings) + "_" + hasTop + "_" + hasBot); } @DataProvider(name = "MeetsMinSequenceData") diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java index c61e9fdfe..0fa277b8b 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; +import java.util.Comparator; /** * BaseUtils contains some basic utilities for manipulating nucleotides. @@ -589,4 +590,26 @@ public class BaseUtils { throw new ReviewedStingException("base must be A, C, G or T. " + (char) base + " is not a valid base."); } } + + + /** + * Lexicographical sorting of base arrays {@link Comparator}. + */ + public static final Comparator BASES_COMPARATOR = new Comparator (){ + + @Override + public int compare(final byte[] o1,final byte[] o2) { + final int minLength = Math.min(o1.length,o2.length); + for (int i = 0; i < minLength; i++) { + final int cmp = Byte.compare(o1[i],o2[i]); + if (cmp != 0) return cmp; + } + if (o1.length == o2.length) + return 0; + else if (o1.length == minLength) + return -1; + else + return 1; + } + }; } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Utils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Utils.java index 9657bc403..487c65483 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Utils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Utils.java @@ -852,4 +852,34 @@ public class Utils { return lst; } + + /** + * Compares sections from to byte arrays to verify whether they contain the same values. + * + * @param left first array to compare. + * @param leftOffset first position of the first array to compare. + * @param right second array to compare. + * @param rightOffset first position of the second array to compare. + * @param length number of positions to compare. + * + * @throws IllegalArgumentException if
    + *
  • either {@code left} or {@code right} is {@code null} or
  • + *
  • any off the offset or length combine point outside any of the two arrays
  • + *
+ * @return {@code true} iff {@code length} is 0 or all the bytes in both ranges are the same two-by-two. + */ + public static boolean equalRange(final byte[] left, final int leftOffset, byte[] right, final int rightOffset, final int length) { + if (left == null) throw new IllegalArgumentException("left cannot be null"); + if (right == null) throw new IllegalArgumentException("right cannot be null"); + if (length < 0) throw new IllegalArgumentException("the length cannot be negative"); + if (leftOffset < 0) throw new IllegalArgumentException("left offset cannot be negative"); + if (leftOffset + length > left.length) throw new IllegalArgumentException("length goes beyond end of left array"); + if (rightOffset < 0) throw new IllegalArgumentException("right offset cannot be negative"); + if (rightOffset + length > right.length) throw new IllegalArgumentException("length goes beyond end of right array"); + + for (int i = 0; i < length; i++) + if (left[leftOffset + i] != right[rightOffset + i]) + return false; + return true; + } } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/Haplotype.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/Haplotype.java index a381e1b2f..0c4c13840 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/Haplotype.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/Haplotype.java @@ -49,7 +49,7 @@ public class Haplotype extends Allele { private EventMap eventMap = null; private Cigar cigar; private int alignmentStartHapwrtRef; - private double score = 0; + private double score = Double.NaN; /** * Main constructor @@ -301,7 +301,7 @@ public class Haplotype extends Allele { * @return a double, where higher values are better */ public double getScore() { - return this.isReference() ? Double.MAX_VALUE : score; + return score; } /** @@ -312,7 +312,7 @@ public class Haplotype extends Allele { * @param score a double, where higher values are better */ public void setScore(double score) { - this.score = this.isReference() ? Double.MAX_VALUE : score; + this.score = score; } /** diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/BaseTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/BaseTest.java index 809a6f1a1..08eb0f5df 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/BaseTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/BaseTest.java @@ -91,6 +91,7 @@ public abstract class BaseTest { //public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"; public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta"; + public static final String hg19RefereneWithChrPrefixInChromosomeNames = "/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta"; public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; public static final String validationDataLocation = GATKDataLocation + "Validation_Data/"; public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/"; diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java index e38d472eb..4b25d1d97 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java @@ -26,9 +26,16 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.testng.Assert; -import org.testng.annotations.Test; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Random; public class BaseUtilsUnitTest extends BaseTest { @@ -123,4 +130,50 @@ public class BaseUtilsUnitTest extends BaseTest { Assert.assertTrue(rcObs.equals(rcExp)); } + + @Test(dataProvider="baseComparatorData") + public void testBaseComparator(final Collection basesToSort) { + final ArrayList sorted = new ArrayList<>(basesToSort); + Collections.sort(sorted, BaseUtils.BASES_COMPARATOR); + for (int i = 0; i < sorted.size(); i++) { + Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(i)),0); + final String iString = new String(sorted.get(i)); + for (int j = i; j < sorted.size(); j++) { + final String jString = new String(sorted.get(j)); + if (iString.compareTo(jString) == 0) + Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)),0); + else + Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) * iString.compareTo(jString) > 0); + Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) <= 0); + } + } + } + + @DataProvider(name="baseComparatorData") + public Object[][] baseComparatorData() { + final int testCount = 10; + final int testSizeAverage = 10; + final int testSizeDeviation = 10; + final int haplotypeSizeAverage = 100; + final int haplotypeSizeDeviation = 100; + + final Object[][] result = new Object[testCount][]; + + GenomeAnalysisEngine.resetRandomGenerator(); + final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + + for (int i = 0; i < testCount; i++) { + final int size = (int) Math.max(0,rnd.nextDouble() * testSizeDeviation + testSizeAverage); + final ArrayList bases = new ArrayList<>(size); + for (int j = 0; j < size; j++) { + final int jSize = (int) Math.max(0,rnd.nextDouble() * haplotypeSizeDeviation + haplotypeSizeAverage); + final byte[] b = new byte[jSize]; + for (int k = 0; k < jSize; k++) + b[k] = BaseUtils.baseIndexToSimpleBase(rnd.nextInt(4)); + bases.add(b); + } + result[i] = new Object[] { bases }; + } + return result; + } } diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/UtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/UtilsUnitTest.java index 0a6f9898e..620c9577c 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/UtilsUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/UtilsUnitTest.java @@ -236,4 +236,23 @@ public class UtilsUnitTest extends BaseTest { public void testTrim(final String s, final int frontTrim, final int backTrim) { Assert.assertEquals(s.length() - frontTrim - backTrim, Utils.trimArray(s.getBytes(), frontTrim, backTrim).length); } + + @Test(dataProvider = "equalRange", enabled = true) + public void testEqualRange(final byte[] array1, final byte[] array2, final int offset1, final int offset2, final int length, final boolean expected) { + Assert.assertEquals(Utils.equalRange(array1,offset1,array2,offset2,length),expected); + Assert.assertTrue(Utils.equalRange(array1,offset1,array1,offset1,length)); + Assert.assertTrue(Utils.equalRange(array2,offset2,array2,offset2,length)); + + } + + @DataProvider(name = "equalRangeData") + public Object[][] equalRangeData() { + return new Object[][] { + new Object[] { new byte[0] , new byte[0], 0, 0, 0, true}, + new Object[] { "ABCF".getBytes(), "BC".getBytes(), 1,0,2, true }, + new Object[] { "ABCF".getBytes(), "".getBytes(), 1,0,0, true }, + new Object[] { "ABCF".getBytes(), "ACBF".getBytes(), 0,0, 4, false} + }; + + } }