From 66910b036c628e858defb8acde4ea26bfec31d0a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 25 Mar 2013 18:37:25 -0400 Subject: [PATCH] Added new and improved suffix and node merging algorithms -- These new algorithms are more powerful than the restricted diamond merging algoriths, in that they can merge nodes with multiple incoming and outgoing edges. Together the splitter + merger algorithms will correctly merge many more cases than the original headless and tailless diamond merger. -- Refactored haplotype caller infrastructure into graphs package, code cleanup -- Cleanup new merging / splitting algorithms, with proper docs and unit tests -- Fix bug in zipping of linear chains. Because the multiplicity can be 0, protect ourselves with a max function call -- Fix BaseEdge.max unit test -- Add docs and some more unit tests -- Move error correct from DeBruijnGraph to DeBruijnAssembler -- Replaced uses of System.out.println with logger.info -- Don't make multiplicity == 0 nodes look like they should be pruned -- Fix toString of Path --- .../haplotypecaller/DeBruijnAssembler.java | 49 ++++- .../haplotypecaller/GenotypingEngine.java | 20 +- .../haplotypecaller/HaplotypeCaller.java | 4 +- .../LikelihoodCalculationEngine.java | 6 +- .../{ => graphs}/BaseEdge.java | 6 +- .../{ => graphs}/BaseGraph.java | 25 +-- .../{ => graphs}/BaseGraphIterator.java | 2 +- .../{ => graphs}/BaseVertex.java | 2 +- .../graphs/CommonSuffixSplitter.java | 182 ++++++++++++++++++ .../{ => graphs}/DeBruijnGraph.java | 38 +--- .../{ => graphs}/DeBruijnVertex.java | 2 +- .../{ => graphs}/KBestPaths.java | 2 +- .../haplotypecaller/{ => graphs}/Path.java | 7 +- .../{ => graphs}/SeqGraph.java | 78 +++++++- .../{ => graphs}/SeqVertex.java | 3 +- .../graphs/SharedSequenceMerger.java | 138 +++++++++++++ .../SharedVertexSequenceSplitter.java | 47 +---- .../walkers/haplotypecaller/graphs/Utils.java | 138 +++++++++++++ .../DeBruijnAssemblerUnitTest.java | 1 + .../DeBruijnAssemblyGraphUnitTest.java | 1 + .../{ => graphs}/BaseEdgeUnitTest.java | 18 +- .../{ => graphs}/BaseGraphUnitTest.java | 61 +++++- .../{ => graphs}/BaseVertexUnitTest.java | 2 +- .../graphs/CommonSuffixMergerUnitTest.java | 160 +++++++++++++++ .../graphs/CommonSuffixSplitterUnitTest.java | 113 +++++++++++ .../{ => graphs}/DeBruijnVertexUnitTest.java | 2 +- .../{ => graphs}/KBestPathsUnitTest.java | 2 +- .../{ => graphs}/SeqGraphUnitTest.java | 48 ++--- .../{ => graphs}/SeqVertexUnitTest.java | 3 +- .../SharedVertexSequenceSplitterUnitTest.java | 6 +- 30 files changed, 1001 insertions(+), 165 deletions(-) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseEdge.java (98%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseGraph.java (97%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseGraphIterator.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseVertex.java (99%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/DeBruijnGraph.java (88%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/DeBruijnVertex.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/KBestPaths.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/Path.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SeqGraph.java (86%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SeqVertex.java (99%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SharedVertexSequenceSplitter.java (91%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseEdgeUnitTest.java (92%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseGraphUnitTest.java (86%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseVertexUnitTest.java (99%) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/DeBruijnVertexUnitTest.java (99%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/KBestPathsUnitTest.java (99%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SeqGraphUnitTest.java (97%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SeqVertexUnitTest.java (99%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SharedVertexSequenceSplitterUnitTest.java (98%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index e9961519c..198abeac8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -53,6 +53,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -160,7 +161,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); if ( shouldErrorCorrectKmers() ) { - graph = graph.errorCorrect(); + graph = errorCorrect(graph); if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), pruneFactor); } @@ -189,6 +190,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); // now go through and prune the graph, removing vertices no longer connected to the reference chain + // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight + // edges to maintain graph connectivity. seqGraph.pruneGraph(pruneFactor); seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); @@ -203,7 +206,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return null; seqGraph.removePathsNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.refcleaned.dot"), pruneFactor); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor); return seqGraph; } @@ -321,6 +324,39 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return true; } + /** + * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers + * @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason + * we cannot actually do the error correction + */ + public DeBruijnGraph errorCorrect(final DeBruijnGraph graph) { + final KMerErrorCorrector corrector = new KMerErrorCorrector(graph.getKmerSize(), 1, 1, 5); // TODO -- should be static variables + + for( final BaseEdge e : graph.edgeSet() ) { + for ( final byte[] kmer : Arrays.asList(graph.getEdgeSource(e).getSequence(), graph.getEdgeTarget(e).getSequence())) { + // TODO -- need a cleaner way to deal with the ref weight + corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity()); + } + } + + if ( corrector.computeErrorCorrectionMap() ) { + final DeBruijnGraph correctedGraph = new DeBruijnGraph(graph.getKmerSize()); + + for( final BaseEdge e : graph.edgeSet() ) { + final byte[] source = corrector.getErrorCorrectedKmer(graph.getEdgeSource(e).getSequence()); + final byte[] target = corrector.getErrorCorrectedKmer(graph.getEdgeTarget(e).getSequence()); + if ( source != null && target != null ) { + correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); + } + } + + return correctedGraph; + } else { + // the error correction wasn't possible, simply return this graph + return graph; + } + } + protected void printGraphs(final List graphs) { final int writeFirstGraphWithSizeSmallerThan = 50; @@ -366,6 +402,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { for( final SeqGraph graph : graphs ) { for ( final Path path : new KBestPaths().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { +// logger.info("Found path " + path); Haplotype h = new Haplotype( path.getBases() ); if( !returnHaplotypes.contains(h) ) { final Cigar cigar = path.calculateCigar(); @@ -421,13 +458,13 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if( debug ) { if( returnHaplotypes.size() > 1 ) { - System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); + logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); } else { - System.out.println("Found only the reference haplotype in the assembly graph."); + logger.info("Found only the reference haplotype in the assembly graph."); } for( final Haplotype h : returnHaplotypes ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); + logger.info( h.toString() ); + logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index cc9d94b1b..0d6e29fe9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -51,6 +51,7 @@ import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; @@ -66,6 +67,7 @@ import java.io.PrintStream; import java.util.*; public class GenotypingEngine { + private final static Logger logger = Logger.getLogger(GenotypingEngine.class); private final boolean DEBUG; private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; @@ -168,15 +170,15 @@ public class GenotypingEngine { // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file final TreeSet startPosKeySet = new TreeSet(); int count = 0; - if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } + if( DEBUG ) { logger.info("=== Best Haplotypes ==="); } for( final Haplotype h : haplotypes ) { // Walk along the alignment and turn any difference from the reference into an event h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); } if( DEBUG ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() ); - System.out.println( ">> Events = " + h.getEventMap()); + logger.info(h.toString()); + logger.info("> Cigar = " + h.getCigar()); + logger.info(">> Events = " + h.getEventMap()); } } @@ -261,7 +263,7 @@ public class GenotypingEngine { final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); if( DEBUG ) { - System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); + logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); //System.out.println("Event/haplotype allele mapping = " + alleleMapper); } @@ -500,9 +502,9 @@ public class GenotypingEngine { if( isBiallelic ) { final double R2 = calculateR2LD( Math.pow(10.0, x11), Math.pow(10.0, x12), Math.pow(10.0, x21), Math.pow(10.0, x22) ); if( DEBUG ) { - System.out.println("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2)); - System.out.println("-- " + thisVC); - System.out.println("-- " + nextVC); + logger.info("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2)); + logger.info("-- " + thisVC); + logger.info("-- " + nextVC); } if( R2 > MERGE_EVENTS_R2_THRESHOLD ) { @@ -528,7 +530,7 @@ public class GenotypingEngine { if(!containsStart) { startPosKeySet.remove(thisStart); } if(!containsNext) { startPosKeySet.remove(nextStart); } - if( DEBUG ) { System.out.println("====> " + mergedVC); } + if( DEBUG ) { logger.info("====> " + mergedVC); } mapWasUpdated = true; break; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 7e2211502..ca105fe03 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -581,7 +581,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem haplotypeBAMWriter.writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, bestHaplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap); } - if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } + if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } return 1; // One active region was processed during this map call } @@ -614,7 +614,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem //--------------------------------------------------------------------------------------------------------------- private void finalizeActiveRegion( final ActiveRegion activeRegion ) { - if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } + if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } final List finalizedReadList = new ArrayList(); final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() ); activeRegion.clearReads(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 87b488b3e..51483c53f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -62,6 +63,7 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.util.*; public class LikelihoodCalculationEngine { + private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class); private static final double LOG_ONE_HALF = -Math.log10(2.0); private final byte constantGCP; @@ -256,14 +258,14 @@ public class LikelihoodCalculationEngine { } } if( maxElement == Double.NEGATIVE_INFINITY ) { break; } - if( DEBUG ) { System.out.println("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); } + if( DEBUG ) { logger.info("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); } haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY; if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); } if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); } } - if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } + if( DEBUG ) { logger.info("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } final List bestHaplotypes = new ArrayList(); for( final int hIndex : bestHaplotypesIndexList ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java similarity index 98% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java index 07a6629d7..6076626f6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import java.io.Serializable; import java.util.Collection; @@ -69,7 +69,7 @@ public class BaseEdge { * @param multiplicity the number of observations of this edge */ public BaseEdge(final boolean isRef, final int multiplicity) { - if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0"); + if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0 but got " + multiplicity); this.multiplicity = multiplicity; this.isRef = isRef; @@ -176,7 +176,7 @@ public class BaseEdge { } /** - * Return a new edge that the max of this and edge + * Return a new edge whose multiplicity is the max of this and edge, and isRef is or of this and edge * * isRef is simply the or of this and edge * multiplicity is the max diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java similarity index 97% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index 9872a370b..1d294e591 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; @@ -53,7 +53,6 @@ import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.jgrapht.EdgeFactory; import org.jgrapht.graph.DefaultDirectedGraph; -import org.jgrapht.traverse.DepthFirstIterator; import java.io.File; import java.io.FileNotFoundException; @@ -221,15 +220,6 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); if( edge.isRef() ) { graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); } @@ -414,7 +404,12 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph edgesToRemove = new ArrayList(); for( final BaseEdge e : edgeSet() ) { if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor @@ -499,7 +494,7 @@ public class BaseGraph extends DefaultDirectedGraph toMerge = graph.incomingVerticesOf(v); + if ( toMerge.size() < 2 ) + // Can only split at least 2 vertices + return false; + else if ( ! safeToSplit(graph, v, toMerge) ) { + return false; + } else { + final SeqVertex suffixVTemplate = commonSuffix(toMerge); + if ( suffixVTemplate.isEmpty() ) { + return false; + } else { + final List edgesToRemove = new LinkedList(); + +// graph.printGraph(new File("split.pre_" + v.getSequenceString() + "." + counter + ".dot"), 0); + for ( final SeqVertex mid : toMerge ) { + // create my own copy of the suffix + final SeqVertex suffixV = new SeqVertex(suffixVTemplate.getSequence()); + graph.addVertex(suffixV); + final SeqVertex prefixV = mid.withoutSuffix(suffixV.getSequence()); + final BaseEdge out = graph.outgoingEdgeOf(mid); + + final SeqVertex incomingTarget; + if ( prefixV == null ) { + // this node is entirely explained by suffix + incomingTarget = suffixV; + } else { + incomingTarget = prefixV; + graph.addVertex(prefixV); + graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 0)); + edgesToRemove.add(out); + } + + graph.addEdge(suffixV, graph.getEdgeTarget(out), new BaseEdge(out)); + + for ( final BaseEdge in : graph.incomingEdgesOf(mid) ) { + graph.addEdge(graph.getEdgeSource(in), incomingTarget, new BaseEdge(in)); + edgesToRemove.add(in); + } + } + + graph.removeAllVertices(toMerge); + graph.removeAllEdges(edgesToRemove); +// graph.printGraph(new File("split.post_" + v.getSequenceString() + "." + counter++ + ".dot"), 0); + + return true; + } + } + } + +// private static int counter = 0; + + /** + * Can we safely split up the vertices in toMerge? + * + * @param graph a graph + * @param bot a vertex whose incoming vertices we want to split + * @param toMerge the set of vertices we'd be splitting up + * @return true if we can safely split up toMerge + */ + private boolean safeToSplit(final SeqGraph graph, final SeqVertex bot, final Collection toMerge) { + for ( final SeqVertex m : toMerge ) { + final Set outs = graph.outgoingEdgesOf(m); + if ( m == bot || outs.size() != 1 || ! graph.outgoingVerticesOf(m).contains(bot) ) + // m == bot => don't allow cycles in the graph + return false; + } + + return true; + } + + /** + * Return the longest suffix of bases shared among all provided vertices + * + * For example, if the vertices have sequences AC, CC, and ATC, this would return + * a single C. However, for ACC and TCC this would return CC. And for AC and TG this + * would return null; + * + * @param middleVertices a non-empty set of vertices + * @return a single vertex that contains the common suffix of all middle vertices + */ + @Requires("!middleVertices.isEmpty()") + protected static SeqVertex commonSuffix(final Collection middleVertices) { + final List kmers = Utils.getKmers(middleVertices); + final int min = Utils.minKmerLength(kmers); + final int suffixLen = Utils.compSuffixLen(kmers, min); + final byte[] kmer = kmers.get(0); + final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length); + return new SeqVertex(suffix); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java similarity index 88% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java index fd8581254..109598029 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java @@ -44,9 +44,10 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerErrorCorrector; import java.util.Arrays; import java.util.HashMap; @@ -88,39 +89,6 @@ public class DeBruijnGraph extends BaseGraph { } } - /** - * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers - * @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason - * we cannot actually do the error correction - */ - protected DeBruijnGraph errorCorrect() { - final KMerErrorCorrector corrector = new KMerErrorCorrector(getKmerSize(), 1, 1, 5); // TODO -- should be static variables - - for( final BaseEdge e : edgeSet() ) { - for ( final byte[] kmer : Arrays.asList(getEdgeSource(e).getSequence(), getEdgeTarget(e).getSequence())) { - // TODO -- need a cleaner way to deal with the ref weight - corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity()); - } - } - - if ( corrector.computeErrorCorrectionMap() ) { - final DeBruijnGraph correctedGraph = new DeBruijnGraph(getKmerSize()); - - for( final BaseEdge e : edgeSet() ) { - final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence()); - final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence()); - if ( source != null && target != null ) { - correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); - } - } - - return correctedGraph; - } else { - // the error correction wasn't possible, simply return this graph - return this; - } - } - /** * Add edge to assembly graph connecting the two kmers * @param kmer1 the source kmer for the edge @@ -168,7 +136,7 @@ public class DeBruijnGraph extends BaseGraph { * @return a newly allocated SequenceGraph */ @Ensures({"result != null"}) - protected SeqGraph convertToSequenceGraph() { + public SeqGraph convertToSequenceGraph() { final SeqGraph seqGraph = new SeqGraph(getKmerSize()); final Map vertexMap = new HashMap(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java index 0a2c26ca4..4d9441efe 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java index 0724729a8..1dc712c67 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.common.collect.MinMaxPriorityQueue; import com.google.java.contract.Ensures; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index 269adcc22..50ca91d41 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; @@ -67,7 +67,7 @@ import java.util.*; * Time: 2:34 PM * */ -class Path { +public class Path { private final static int MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW = 20; // the last vertex seen in the path @@ -163,8 +163,9 @@ class Path { boolean first = true; for ( final T v : getVertices() ) { if ( first ) { - b.append(" -> "); first = false; + } else { + b.append(" -> "); } b.append(v.getSequenceString()); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java similarity index 86% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index da24a06a4..400b5c7ee 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -44,10 +44,12 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.apache.commons.lang.ArrayUtils; +import java.io.File; +import java.util.HashSet; import java.util.Set; /** @@ -57,6 +59,7 @@ import java.util.Set; * @since 03/2013 */ public class SeqGraph extends BaseGraph { + private final static boolean PRINT_SIMPLIFY_GRAPHS = false; private final static int MIN_SUFFIX_TO_MERGE_TAILS = 5; /** @@ -97,9 +100,16 @@ public class SeqGraph extends BaseGraph { //logger.info("simplifyGraph iteration " + i); // iterate until we haven't don't anything useful didSomeWork = false; - //printGraph(new File("simplifyGraph." + i + ".dot"), 0); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".dot"), 0); didSomeWork |= new MergeDiamonds().transformUntilComplete(); didSomeWork |= new MergeTails().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".diamonds_and_tails.dot"), 0); + + didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".split_suffix.dot"), 0); + didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".merge_suffix.dot"), 0); + didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); didSomeWork |= zipLinearChains(); i++; @@ -109,7 +119,7 @@ public class SeqGraph extends BaseGraph { /** * Zip up all of the simple linear chains present in this graph. */ - protected boolean zipLinearChains() { + public boolean zipLinearChains() { boolean foundOne = false; while( zipOneLinearChain() ) { // just keep going until zipOneLinearChain says its done @@ -137,13 +147,16 @@ public class SeqGraph extends BaseGraph { final Set outEdges = outgoingEdgesOf(outgoingVertex); final Set inEdges = incomingEdgesOf(incomingVertex); + final BaseEdge singleOutEdge = outEdges.isEmpty() ? null : outEdges.iterator().next(); + final BaseEdge singleInEdge = inEdges.isEmpty() ? null : inEdges.iterator().next(); + if( inEdges.size() == 1 && outEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + singleInEdge.setMultiplicity( singleInEdge.getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + singleOutEdge.setMultiplicity( singleOutEdge.getMultiplicity() + ( e.getMultiplicity() / 2 ) ); } else if( inEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + singleInEdge.setMultiplicity( Math.max(singleInEdge.getMultiplicity() + ( e.getMultiplicity() - 1 ), 0) ); } else if( outEdges.size() == 1 ) { - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + singleOutEdge.setMultiplicity( Math.max( singleOutEdge.getMultiplicity() + ( e.getMultiplicity() - 1 ), 0) ); } final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) ); @@ -297,6 +310,57 @@ public class SeqGraph extends BaseGraph { } } + /** + * Merge headless configurations: + * + * Performs the transformation: + * + * { x + S_i + y -> Z } + * + * goes to: + * + * { x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. + * + * Differs from the diamond transform in that no top node is required + */ + protected class MergeCommonSuffices extends VertexBasedTransformer { + @Override + boolean tryToTransform(final SeqVertex bottom) { + return new SharedSequenceMerger().merge(SeqGraph.this, bottom); + } + } + + /** + * Merge headless configurations: + * + * Performs the transformation: + * + * { x + S_i + y -> Z } + * + * goes to: + * + * { x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. + * + * Differs from the diamond transform in that no top node is required + */ + protected class SplitCommonSuffices extends VertexBasedTransformer { + final Set alreadySplit = new HashSet(); + + @Override + boolean tryToTransform(final SeqVertex bottom) { + if ( alreadySplit.contains(bottom) ) + return false; + else { + alreadySplit.add(bottom); + return new CommonSuffixSplitter().split(SeqGraph.this, bottom); + } + } + } + /** * Merge headless configurations: * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java index 523312dcf..cfc2abfdc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java @@ -44,11 +44,10 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.Utils; - import java.util.Arrays; /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java new file mode 100644 index 000000000..1c53f2332 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java @@ -0,0 +1,138 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.apache.commons.lang.ArrayUtils; + +import java.util.*; + +/** + * Merges the incoming vertices of a vertex V of a graph + * + * Looks at the vertices that are incoming to V (i.e., have an outgoing edge connecting to V). If + * they all have the same sequence, merges them into the sequence of V, and updates the graph + * as appropriate + * + * User: depristo + * Date: 3/22/13 + * Time: 8:31 AM + */ +public class SharedSequenceMerger { + public SharedSequenceMerger() { } + + /** + * Attempt to merge the incoming vertices of v + * + * @param graph the graph containing the vertex v + * @param v the vertex whose incoming vertices we want to merge + * @return true if some useful merging was done, false otherwise + */ + public boolean merge(final SeqGraph graph, final SeqVertex v) { + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex " + v); + + final Set prevs = graph.incomingVerticesOf(v); + if ( ! canMerge(graph, v, prevs) ) + return false; + else { +// graph.printGraph(new File("csm." + counter + "." + v.getSequenceString() + "_pre.dot"), 0); + + final List edgesToRemove = new LinkedList(); + final byte[] prevSeq = prevs.iterator().next().getSequence(); + final SeqVertex newV = new SeqVertex(ArrayUtils.addAll(prevSeq, v.getSequence())); + graph.addVertex(newV); + + for ( final SeqVertex prev : prevs ) { + for ( final BaseEdge prevIn : graph.incomingEdgesOf(prev) ) { + graph.addEdge(graph.getEdgeSource(prevIn), newV, new BaseEdge(prevIn)); + edgesToRemove.add(prevIn); + } + } + + for ( final BaseEdge e : graph.outgoingEdgesOf(v) ) { + graph.addEdge(newV, graph.getEdgeTarget(e), new BaseEdge(e)); + } + + graph.removeAllVertices(prevs); + graph.removeVertex(v); + graph.removeAllEdges(edgesToRemove); + +// graph.printGraph(new File("csm." + counter++ + "." + v.getSequenceString() + "_post.dot"), 0); + + return true; + } + } + + //private static int counter = 0; + + /** + * Can we safely merge the incoming vertices of v + * + * @param graph the graph containing v and incomingVertices + * @param v the vertex we want to merge into + * @param incomingVertices the incoming vertices of v + * @return true if we can safely merge incomingVertices + */ + private boolean canMerge(final SeqGraph graph, final SeqVertex v, final Collection incomingVertices) { + if ( incomingVertices.isEmpty() ) + return false; + + final SeqVertex first = incomingVertices.iterator().next(); + for ( final SeqVertex prev : incomingVertices) { + if ( ! prev.seqEquals(first) ) + return false; + final Collection prevOuts = graph.outgoingVerticesOf(prev); + if ( prevOuts.size() != 1 ) + return false; + if ( prevOuts.iterator().next() != v ) + return false; + } + + return true; + } + +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java similarity index 91% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java index e0501da52..9834653a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; @@ -266,8 +266,8 @@ public class SharedVertexSequenceSplitter { min = Math.min(min, v.getSequence().length); } - final int prefixLen = compPrefixLen(kmers, min); - final int suffixLen = compSuffixLen(kmers, min - prefixLen); + final int prefixLen = Utils.compPrefixLen(kmers, min); + final int suffixLen = Utils.compSuffixLen(kmers, min - prefixLen); final byte[] kmer = kmers.get(0); final byte[] prefix = Arrays.copyOfRange(kmer, 0, prefixLen); @@ -275,47 +275,6 @@ public class SharedVertexSequenceSplitter { return new Pair(new SeqVertex(prefix), new SeqVertex(suffix)); } - /** - * Compute the maximum shared prefix length of list of bytes. - * - * @param listOfBytes a list of bytes with at least one element - * @param minLength the min. length among all byte[] in listOfBytes - * @return the number of shared bytes common at the start of all bytes - */ - @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) - @Ensures("result >= 0") - protected static int compPrefixLen(final List listOfBytes, final int minLength) { - for ( int i = 0; i < minLength; i++ ) { - final byte b = listOfBytes.get(0)[i]; - for ( int j = 1; j < listOfBytes.size(); j++ ) { - if ( b != listOfBytes.get(j)[i] ) - return i; - } - } - - return minLength; - } - - /** - * Compute the maximum shared suffix length of list of bytes. - * - * @param listOfBytes a list of bytes with at least one element - * @param minLength the min. length among all byte[] in listOfBytes - * @return the number of shared bytes common at the end of all bytes - */ - @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) - @Ensures("result >= 0") - protected static int compSuffixLen(final List listOfBytes, final int minLength) { - for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) { - final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1]; - for ( int j = 1; j < listOfBytes.size(); j++ ) { - if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] ) - return suffixLen; - } - } - return minLength; - } - /** * Helper function that returns an edge that we should use for splitting * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java new file mode 100644 index 000000000..8cb272925 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java @@ -0,0 +1,138 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Utility functions used in the graphs package + * + * User: depristo + * Date: 3/25/13 + * Time: 9:42 PM + */ +final class Utils { + private Utils() {} + + /** + * Compute the maximum shared prefix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. length among all byte[] in listOfBytes + * @return the number of shared bytes common at the start of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compPrefixLen(final List listOfBytes, final int minLength) { + for ( int i = 0; i < minLength; i++ ) { + final byte b = listOfBytes.get(0)[i]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[i] ) + return i; + } + } + + return minLength; + } + + /** + * Compute the maximum shared suffix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. length among all byte[] in listOfBytes + * @return the number of shared bytes common at the end of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compSuffixLen(final List listOfBytes, final int minLength) { + for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) { + final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] ) + return suffixLen; + } + } + return minLength; + } + + /** + * Get the list of kmers as byte[] from the vertices in the graph + * + * @param vertices a collection of vertices + * @return a list of their kmers in order of the iterator on vertices + */ + protected static List getKmers(final Collection vertices) { + final List kmers = new ArrayList(vertices.size()); + for ( final SeqVertex v : vertices ) { + kmers.add(v.getSequence()); + } + return kmers; + } + + /** + * Get the minimum length of a collection of byte[] + * + * @param kmers a list of kmers whose .length min we want + * @return the min of the kmers, if kmers is empty the result is 0 + */ + protected static int minKmerLength(final Collection kmers) { + if ( kmers == null ) throw new IllegalArgumentException("kmers cannot be null"); + + if ( kmers.isEmpty() ) return 0; + int min = Integer.MAX_VALUE; + for ( final byte[] kmer : kmers ) { + min = Math.min(min, kmer.length); + } + return min; + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index cce623b76..86d331dae 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -56,6 +56,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java index 2b87cf61d..a13618dae 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java similarity index 92% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java index 3cc44c7de..7df6ee6c8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -102,4 +102,20 @@ public class BaseEdgeUnitTest extends BaseTest { Assert.assertEquals(edges.get(2), e2); Assert.assertEquals(edges.get(3), e1); } + + @Test + public void testMax() { + for ( final boolean firstIsRef : Arrays.asList(true, false) ) { + for ( final boolean secondIsRef : Arrays.asList(true, false) ) { + for ( final int firstMulti : Arrays.asList(1, 4) ) { + for ( final int secondMulti : Arrays.asList(2, 3) ) { + final BaseEdge expected = new BaseEdge(firstIsRef || secondIsRef, Math.max(firstMulti, secondMulti)); + final BaseEdge actual = new BaseEdge(firstIsRef, firstMulti).max(new BaseEdge(secondIsRef, secondMulti)); + Assert.assertEquals(actual.getMultiplicity(), expected.getMultiplicity()); + Assert.assertEquals(actual.isRef(), expected.isRef()); + } + } + } + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java similarity index 86% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java index db4127ddb..9737f72f5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -160,7 +160,7 @@ public class BaseGraphUnitTest extends BaseTest { graph.addEdges(g3, b7, b8, b7); // cycle is bad graph.addEdges(g3, b9, b9); // self-cycle is bad - final boolean debug = true; + final boolean debug = false; if ( debug ) good.printGraph(new File("expected.dot"), 0); if ( debug ) graph.printGraph(new File("bad.dot"), 0); graph.removePathsNotConnectedToRef(); @@ -169,6 +169,63 @@ public class BaseGraphUnitTest extends BaseTest { Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); } + @Test + public void testRemoveVerticesNotConnectedToRefRegardlessOfEdgeDirection() throws Exception { + final SeqGraph graph = new SeqGraph(); + + SeqVertex src = new SeqVertex("A"); + SeqVertex end = new SeqVertex("A"); + SeqVertex g1 = new SeqVertex("C"); + SeqVertex g2 = new SeqVertex("G"); + SeqVertex g3 = new SeqVertex("T"); + SeqVertex g4 = new SeqVertex("AA"); + SeqVertex g5 = new SeqVertex("AA"); + SeqVertex g6 = new SeqVertex("AA"); + SeqVertex g8 = new SeqVertex("AA"); + SeqVertex g7 = new SeqVertex("AA"); + SeqVertex gPrev = new SeqVertex("AA"); + SeqVertex gPrev1 = new SeqVertex("AA"); + SeqVertex gPrev2 = new SeqVertex("AA"); + SeqVertex gAfter = new SeqVertex("AA"); + SeqVertex gAfter1 = new SeqVertex("AA"); + SeqVertex gAfter2 = new SeqVertex("AA"); + SeqVertex b1 = new SeqVertex("CC"); + SeqVertex b2 = new SeqVertex("GG"); + SeqVertex b3 = new SeqVertex("TT"); + SeqVertex b4 = new SeqVertex("AAA"); + SeqVertex b5 = new SeqVertex("CCC"); + SeqVertex b6 = new SeqVertex("GGG"); + + graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8, gPrev, gPrev1, gPrev2, gAfter, gAfter1, gAfter2); + graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end); + graph.addEdges(src, g1, g5, g6, g7, end); + graph.addEdges(src, g1, g5, g8, g7, end); + graph.addEdges(src, g1, g3, end); + + // these should be kept, but are in the wrong direction + graph.addEdges(gPrev, src); + graph.addEdges(gPrev1, gPrev2, src); + graph.addEdges(end, gAfter); + graph.addEdges(end, gAfter1, gAfter2); + + // the current state of the graph is the good one + final SeqGraph good = (SeqGraph)graph.clone(); + + // now add the bads to the graph + graph.addVertices(b1, b2, b3, b4, b5, b6); + graph.addEdges(b2, b3); // b2 -> b3 + graph.addEdges(b4, b5, b4); // cycle + graph.addEdges(b6, b6); // isolated self cycle + + final boolean debug = false; + if ( debug ) good.printGraph(new File("expected.dot"), 0); + if ( debug ) graph.printGraph(new File("bad.dot"), 0); + graph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); + if ( debug ) graph.printGraph(new File("actual.dot"), 0); + + Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); + } + @Test public void testPrintEmptyGraph() throws Exception { final File tmp = File.createTempFile("tmp", "dot"); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java index 8f682d474..859892e33 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java new file mode 100644 index 000000000..012add769 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java @@ -0,0 +1,160 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class CommonSuffixMergerUnitTest extends BaseTest { + private final static boolean PRINT_GRAPHS = true; + + @DataProvider(name = "CompleteCycleData") + public Object[][] makeCompleteCycleData() { + return makeSplitMergeData(-1); + } + + public static class SplitMergeData { + final SeqGraph graph; + final SeqVertex v; + final String commonSuffix; + + public SplitMergeData(SeqGraph graph, SeqVertex v, String commonSuffix) { + this.graph = graph; + this.v = v; + this.commonSuffix = commonSuffix; + } + } + + public static Object[][] makeSplitMergeData(final int maxTests) { + List tests = new ArrayList(); + + final List bases = Arrays.asList("A", "C", "G", "T"); + for ( final String commonSuffix : Arrays.asList("", "A", "AT") ) { + for ( final int nBots : Arrays.asList(0, 1, 2) ) { + for ( final int nMids : Arrays.asList(1, 2, 3) ) { + for ( int nTops = 0; nTops < nMids; nTops++ ) { + for ( int nTopConnections = 1; nTopConnections <= nMids; nTopConnections++ ) { + int multi = 1; + final SeqGraph graph = new SeqGraph(); + final SeqVertex v = new SeqVertex("GGGG"); + graph.addVertex(v); + + final LinkedList tops = new LinkedList(); + final LinkedList mids = new LinkedList(); + + for ( int i = 0; i < nMids; i++) { + final SeqVertex mid = new SeqVertex(bases.get(i) + commonSuffix); + graph.addVertex(mid); + graph.addEdge(mid, v, new BaseEdge(i == 0, multi++)); + mids.add(mid); + + tops.add(new SeqVertex(bases.get(i))); + } + + graph.addVertices(tops); + for ( final SeqVertex t : tops ) { + for ( int i = 0; i < nTopConnections; i++ ) { + graph.addEdge(t, mids.get(i), new BaseEdge(i == 0, multi++)); + } + } + + for ( int i = 0; i < nBots; i++ ) { + final SeqVertex bot = new SeqVertex(bases.get(i)); + graph.addVertex(bot); + graph.addEdge(v, bot, new BaseEdge(i == 0, multi++)); + + } + + tests.add(new Object[]{new SplitMergeData(graph, v, commonSuffix)}); + } + } + } + } + } + + final List toUse = maxTests == -1 ? tests : tests.subList(0, Math.min(tests.size(), maxTests)); + return toUse.toArray(new Object[][]{}); + } + + public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) { + try { + final Set haplotypes = new HashSet(); + final List> originalPaths = new KBestPaths().getKBestPaths(original); + for ( final Path path : originalPaths ) + haplotypes.add(new String(path.getBases())); + + final List> splitPaths = new KBestPaths().getKBestPaths(actual); + for ( final Path path : splitPaths ) { + final String h = new String(path.getBases()); + Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); + } + + if ( splitPaths.size() == originalPaths.size() ) { + for ( int i = 0; i < originalPaths.size(); i++ ) { + Assert.assertTrue(splitPaths.get(i).equalSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i)); + } + } + } catch ( AssertionError e ) { + if ( PRINT_GRAPHS ) original.printGraph(new File(String.format("%s.original.dot", name, actual.vertexSet().size())), 0); + if ( PRINT_GRAPHS ) actual.printGraph(new File(String.format("%s.actual.dot", name, actual.vertexSet().size())), 0); + throw e; + } + } + + @Test(dataProvider = "CompleteCycleData") + public void testMerging(final SplitMergeData data) { + final SeqGraph original = (SeqGraph)data.graph.clone(); + final SharedSequenceMerger splitter = new SharedSequenceMerger(); + splitter.merge(data.graph, data.v); + assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java new file mode 100644 index 000000000..f03dc8762 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java @@ -0,0 +1,113 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class CommonSuffixSplitterUnitTest extends BaseTest { + @DataProvider(name = "SplitData") + public Object[][] makeSplitData() { + return CommonSuffixMergerUnitTest.makeSplitMergeData(-1); + } + + @Test(dataProvider = "SplitData") + public void testSplit(final CommonSuffixMergerUnitTest.SplitMergeData data) { + final boolean expectedMerge = ! data.commonSuffix.isEmpty() && data.graph.inDegreeOf(data.v) > 1; + + final SeqGraph original = (SeqGraph)data.graph.clone(); +// original.printGraph(new File("original.dot"), 0); + final CommonSuffixSplitter splitter = new CommonSuffixSplitter(); + final boolean succeed = splitter.split(data.graph, data.v); +// data.graph.printGraph(new File("actual.dot"), 0); + Assert.assertEquals(succeed, expectedMerge, "Not excepted merge success/fail result"); + if ( succeed ) { + Assert.assertEquals(data.graph.incomingVerticesOf(data.v).iterator().next().getSequenceString(), data.commonSuffix, "Common suffix not computed correctly"); + } + + CommonSuffixMergerUnitTest.assertSameHaplotypes(String.format("suffixSplit.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original); + } + + @Test + public void testSplitPrevHaveMultipleEdges() { + final SeqGraph original = new SeqGraph(); + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("A"); + final SeqVertex v3 = new SeqVertex("A"); + final SeqVertex v4 = new SeqVertex("A"); + + original.addVertices(v1, v2, v3, v4); + original.addEdges(v1, v3); + + Assert.assertFalse(new CommonSuffixSplitter().split(original, v3), "Cannot split graph with only one vertex"); + + original.addEdges(v2, v3); + original.addEdges(v2, v4); + + Assert.assertFalse(new CommonSuffixSplitter().split(original, v3), "Cannot split graph with multiple outgoing edges from middle nodes"); + } + + @Test + public void testSplitNoCycles() { + final SeqGraph original = new SeqGraph(); + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("C"); + final SeqVertex v3 = new SeqVertex("C"); + final SeqVertex v4 = new SeqVertex("G"); + + original.addVertices(v1, v2, v3, v4); + original.addEdges(v1, v3, v4); + original.addEdges(v1, v2, v4); + + Assert.assertTrue(new CommonSuffixSplitter().split((SeqGraph)original.clone(), v4), "Should be able to split pre-cycle graph"); + + original.addEdges(v4, v4); + Assert.assertFalse(new CommonSuffixSplitter().split(original, v4), "Cannot split graph with a cycle of the bottom list"); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java index dfbe50668..bdc8ab36d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.annotations.Test; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java index 34b4ba912..d20a0f778 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java similarity index 97% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index 6b6826e45..cbd7b1063 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -1,50 +1,50 @@ /* * By downloading the PROGRAM you agree to the following terms of use: -* +* * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* +* * This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* +* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* +* * 1. DEFINITIONS * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* +* * 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. * Copyright 2012 Broad Institute, Inc. * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* +* * 4. INDEMNIFICATION * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* +* * 5. NO REPRESENTATIONS OR WARRANTIES * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. * IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* +* * 6. ASSIGNMENT * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* +* * 7. MISCELLANEOUS * 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. * 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -315,12 +315,16 @@ public class SeqGraphUnitTest extends BaseTest { public void testMerging(final SeqGraph graph, final SeqGraph expected) { final SeqGraph merged = (SeqGraph)graph.clone(); merged.simplifyGraph(1); -// if ( ! SeqGraph.graphEquals(merged, expected) ) { -// graph.printGraph(new File("graph.dot"), 0); -// merged.printGraph(new File("merged.dot"), 0); -// expected.printGraph(new File("expected.dot"), 0); -// } - Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); + try { + Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); + } catch (AssertionError e) { +// if ( ! SeqGraph.graphEquals(merged, expected) ) { +// graph.printGraph(new File("graph.dot"), 0); +// merged.printGraph(new File("merged.dot"), 0); +// expected.printGraph(new File("expected.dot"), 0); +// } + throw e; + } } // A -> ACT -> C [non-ref] diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java index ca38351cc..eab9dfc27 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -52,7 +52,6 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; public class SeqVertexUnitTest extends BaseTest { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java similarity index 98% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java index 52ab36064..77857c367 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; @@ -98,10 +98,10 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { min = Math.min(min, s.length()); } - final int actualPrefixLen = SharedVertexSequenceSplitter.compPrefixLen(bytes, min); + final int actualPrefixLen = org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Utils.compPrefixLen(bytes, min); Assert.assertEquals(actualPrefixLen, expectedPrefixLen, "Failed prefix test"); - final int actualSuffixLen = SharedVertexSequenceSplitter.compSuffixLen(bytes, min - actualPrefixLen); + final int actualSuffixLen = org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Utils.compSuffixLen(bytes, min - actualPrefixLen); Assert.assertEquals(actualSuffixLen, expectedSuffixLen, "Failed suffix test"); }