diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java new file mode 100644 index 000000000..063e3b218 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java @@ -0,0 +1,142 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.LinkedList; +import java.util.List; +import java.util.TreeSet; + +/** + * Trim down an active region based on a set of variants found across the haplotypes within the region + * + * User: depristo + * Date: 4/27/13 + * Time: 2:10 PM + */ +class ActiveRegionTrimmer { + private final static Logger logger = Logger.getLogger(ActiveRegionTrimmer.class); + private final boolean logTrimming; + private final int snpPadding, nonSnpPadding, maxDistanceInExtensionForGenotyping; + private final GenomeLocParser parser; + + /** + * Create a new ActiveRegionTrimmer + * + * @param logTrimming should we log our trimming events? + * @param snpPadding how much bp context should we ensure around snps? + * @param nonSnpPadding how much bp context should we ensure around anything not a snp? + * @param maxDistanceInExtensionForGenotyping the max extent we are will to go into the extended region of the + * origin active region in order to properly genotype events in the + * non-extended active region? 
+ * @param parser a genome loc parser so we can create genome locs + */ + ActiveRegionTrimmer(boolean logTrimming, int snpPadding, int nonSnpPadding, int maxDistanceInExtensionForGenotyping, GenomeLocParser parser) { + if ( snpPadding < 0 ) throw new IllegalArgumentException("snpPadding must be >= 0 but got " + snpPadding); + if ( nonSnpPadding < 0 ) throw new IllegalArgumentException("nonSnpPadding must be >= 0 but got " + nonSnpPadding); + if ( maxDistanceInExtensionForGenotyping < 0 ) throw new IllegalArgumentException("maxDistanceInExtensionForGenotyping must be >= 0 but got " + maxDistanceInExtensionForGenotyping); + if ( parser == null ) throw new IllegalArgumentException("parser cannot be null"); + + this.logTrimming = logTrimming; + this.snpPadding = snpPadding; + this.nonSnpPadding = nonSnpPadding; + this.maxDistanceInExtensionForGenotyping = maxDistanceInExtensionForGenotyping; + this.parser = parser; + } + + /** + * Trim down the active region to a region large enough to properly genotype the events found within the active + * region span, excluding all variants that only occur within its extended span. + * + * This function merely creates the region, but it doesn't populate the reads back into the region. 
+ * + * @param region our full active region + * @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position + * @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully + */ + public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion) { + if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, so just return the current region + return null; + + final List withinActiveRegion = new LinkedList(); + int pad = snpPadding; + GenomeLoc trimLoc = null; + for ( final VariantContext vc : allVariantsWithinExtendedRegion ) { + final GenomeLoc vcLoc = parser.createGenomeLoc(vc); + if ( region.getLocation().overlapsP(vcLoc) ) { + if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding + pad = nonSnpPadding; + trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc); + withinActiveRegion.add(vc); + } + } + + // we don't actually have anything in the region after removing variants that don't overlap the region's full location + if ( trimLoc == null ) return null; + + final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping); + final GenomeLoc idealSpan = parser.createPaddedGenomeLoc(trimLoc, pad); + final GenomeLoc finalSpan = maxSpan.intersect(idealSpan); + + final ActiveRegion trimmedRegion = region.trim(finalSpan); + if ( logTrimming ) { + logger.info("events : " + withinActiveRegion); + logger.info("trimLoc : " + trimLoc); + logger.info("pad : " + pad); + logger.info("idealSpan : " + idealSpan); + logger.info("maxSpan : " + maxSpan); + logger.info("finalSpan : " + finalSpan); + logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size()); + } + return trimmedRegion; + } +} diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 5a5946183..48972dfd5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -46,101 +46,53 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; -import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; -import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; /** - * Created by IntelliJ IDEA. 
+ * DeBruijn assembler for the HaplotypeCaller + * * User: ebanks, rpoplin * Date: Mar 14, 2011 */ - public class DeBruijnAssembler extends LocalAssemblyEngine { private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class); - private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers - // TODO -- this number is very low, and limits our ability to explore low-frequency variants. It should // TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where // TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases - private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25; + private final static int NUM_PATHS_PER_GRAPH = 25; + private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers private static final int GRAPH_KMER_STEP = 6; - private final boolean debug; - private final boolean debugGraphTransformations; private final int minKmer; - private final boolean allowCyclesInKmerGraphToGeneratePaths; - private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; - protected DeBruijnAssembler() { - this(false, -1, 11, false); + this(25, -1); } - public DeBruijnAssembler(final boolean debug, - final int debugGraphTransformations, - final int minKmer, - final boolean allowCyclesInKmerGraphToGeneratePaths) { - super(); - this.debug = debug; - this.debugGraphTransformations = debugGraphTransformations > 0; - this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations; + public DeBruijnAssembler(final int minKmer, final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) { + super(NUM_PATHS_PER_GRAPH); this.minKmer = minKmer; - this.allowCyclesInKmerGraphToGeneratePaths = allowCyclesInKmerGraphToGeneratePaths; + this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = 
onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; } - /** - * Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads - * @param activeRegion ActiveRegion object holding the reads which are to be used during assembly - * @param refHaplotype reference haplotype object - * @param fullReferenceWithPadding byte array holding the reference sequence with padding - * @param refLoc GenomeLoc object corresponding to the reference sequence with padding - * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode - * @return a non-empty list of all the haplotypes that are produced during assembly - */ - @Ensures({"result.contains(refHaplotype)"}) - public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype ) { - if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); } - if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); } - if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); } - if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } - - // create the graphs - final List graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); - - // print the graphs if the appropriate debug option has been turned on - if( graphWriter != null ) { - printGraphs(graphs); - } - - // find the best paths in the graphs and return them as haplotypes - return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); - } - - @Requires({"reads != null", "refHaplotype != null"}) - protected List createDeBruijnGraphs( 
final List reads, final Haplotype refHaplotype ) { + @Override + protected List assemble(final List reads, final Haplotype refHaplotype) { final List graphs = new LinkedList(); final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1; @@ -165,10 +117,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { " future subsystem will actually go and error correct the reads"); } - final SeqGraph seqGraph = toSeqGraph(graph); + final SeqGraph seqGraph = cleanupSeqGraph(graph.convertToSequenceGraph()); if ( seqGraph != null ) { // if the graph contains interesting variation from the reference - sanityCheckReferenceGraph(seqGraph, refHaplotype); graphs.add(seqGraph); if ( debugGraphTransformations ) // we only want to use one graph size @@ -181,69 +132,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return graphs; } - private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) { - final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); - - // TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm - // TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect - // TODO -- to anything from one that's actually has good support along the chain but just happens - // TODO -- to have a connection in the middle that has weight of < pruneFactor. 
Ultimately - // TODO -- the pruning algorithm really should be an error correction algorithm that knows more - // TODO -- about the structure of the data and can differentiate between an infrequent path but - // TODO -- without evidence against it (such as occurs when a region is hard to get any reads through) - // TODO -- from a error with lots of weight going along another similar path - // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive - seqGraph.zipLinearChains(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); - - // now go through and prune the graph, removing vertices no longer connected to the reference chain - // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight - // edges to maintain graph connectivity. - seqGraph.pruneGraph(pruneFactor); - seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); - - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor); - seqGraph.simplifyGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor); - - // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can - // happen in cases where for example the reference somehow manages to acquire a cycle, or - // where the entire assembly collapses back into the reference sequence. - if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null ) - return null; - - seqGraph.removePathsNotConnectedToRef(); - seqGraph.simplifyGraph(); - if ( seqGraph.vertexSet().size() == 1 ) { - // we've perfectly assembled into a single reference haplotype, add a empty seq vertex to stop - // the code from blowing up. 
- // TODO -- ref properties should really be on the vertices, not the graph itself - final SeqVertex complete = seqGraph.vertexSet().iterator().next(); - final SeqVertex dummy = new SeqVertex(""); - seqGraph.addVertex(dummy); - seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); - } - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor); - - return seqGraph; - } - - protected void sanityCheckReferenceGraph(final BaseGraph graph, final Haplotype refHaplotype) { - if( graph.getReferenceSourceVertex() == null ) { - throw new IllegalStateException("All reference graphs must have a reference source vertex."); - } - if( graph.getReferenceSinkVertex() == null ) { - throw new IllegalStateException("All reference graphs must have a reference sink vertex."); - } - if( !Arrays.equals(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true), refHaplotype.getBases()) ) { - throw new IllegalStateException("Mismatch between the reference haplotype and the reference assembly graph path." 
+ - " graph = " + new String(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true)) + - " haplotype = " + new String(refHaplotype.getBases()) - ); - } - } - @Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"}) protected DeBruijnGraph createGraphFromSequences( final List reads, final int kmerLength, final Haplotype refHaplotype ) { final DeBruijnGraph graph = new DeBruijnGraph(kmerLength); @@ -344,290 +232,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return true; } - protected void printGraphs(final List graphs) { - final int writeFirstGraphWithSizeSmallerThan = 50; - - graphWriter.println("digraph assemblyGraphs {"); - for( final SeqGraph graph : graphs ) { - if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { - logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); - continue; - } - - graph.printGraph(graphWriter, false, pruneFactor); - - if ( debugGraphTransformations ) - break; - } - - graphWriter.println("}"); - } - - @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"}) - @Ensures({"result.contains(refHaplotype)"}) - private List findBestPaths( final List graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { - - // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes - // TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm - final List returnHaplotypes = new ArrayList(); - refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart()); - final Cigar c = new Cigar(); - c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M)); - refHaplotype.setCigar(c); - returnHaplotypes.add( 
refHaplotype ); - - final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); - final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength(); - - // for GGA mode, add the desired allele into the haplotype - for( final VariantContext compVC : activeAllelesToGenotype ) { - for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); - addHaplotypeForGGA( insertedRefHaplotype, refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true ); - } - } - - for( final SeqGraph graph : graphs ) { - final SeqVertex source = graph.getReferenceSourceVertex(); - final SeqVertex sink = graph.getReferenceSinkVertex(); - if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph); - - final KBestPaths pathFinder = new KBestPaths(allowCyclesInKmerGraphToGeneratePaths); - for ( final Path path : pathFinder.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH, source, sink) ) { -// logger.info("Found path " + path); - Haplotype h = new Haplotype( path.getBases() ); - if( !returnHaplotypes.contains(h) ) { - final Cigar cigar = path.calculateCigar(); - if( cigar.isEmpty() ) { - throw new IllegalStateException("Smith-Waterman alignment failure. 
Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength()); - } else if ( pathIsTooDivergentFromReference(cigar) || cigar.getReferenceLength() < 60 ) { // N cigar elements means that a bubble was too divergent from the reference so skip over this path - continue; - } else if( cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // SW failure - throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength()); - } - h.setCigar(cigar); - - // extend partial haplotypes which are anchored in the reference to include the full active region - h = extendPartialHaplotype(h, activeRegionStart, refWithPadding); - final Cigar leftAlignedCigar = leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(h.getCigar()), refWithPadding, h.getBases(), activeRegionStart, 0); - if( !returnHaplotypes.contains(h) ) { - h.setAlignmentStartHapwrtRef(activeRegionStart); - h.setCigar(leftAlignedCigar); - h.setScore(path.getScore()); - returnHaplotypes.add(h); - - if ( debug ) - logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize()); - - // for GGA mode, add the desired allele into the haplotype if it isn't already present - if( !activeAllelesToGenotype.isEmpty() ) { - final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place - for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present - final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); - - // This if statement used to additionally have: - // "|| !vcOnHaplotype.hasSameAllelesAs(compVC)" - 
// but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto - // a haplotype that already contains a 1bp insertion (so practically it is reference but - // falls into the bin for the 1bp deletion because we keep track of the artificial alleles). - if( vcOnHaplotype == null ) { - for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - addHaplotypeForGGA( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ); - } - } - } - } - } - } - } - } - - // add genome locs to the haplotypes - for ( final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow); - - if ( returnHaplotypes.size() < returnHaplotypes.size() ) - logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); - - if( debug ) { - if( returnHaplotypes.size() > 1 ) { - logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); - } else { - logger.info("Found only the reference haplotype in the assembly graph."); - } - for( final Haplotype h : returnHaplotypes ) { - logger.info( h.toString() ); - logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); - } - } - - return returnHaplotypes; - } - - /** - * Extend partial haplotypes which are anchored in the reference to include the full active region - * @param haplotype the haplotype to extend - * @param activeRegionStart the place where the active region starts in the ref byte array - * @param refWithPadding the full reference byte array with padding which encompasses the active region - * @return a haplotype fully extended to encompass the active region - */ 
- @Requires({"haplotype != null", "activeRegionStart >= 0", "refWithPadding != null", "refWithPadding.length > 0"}) - @Ensures({"result != null", "result.getCigar() != null"}) - private Haplotype extendPartialHaplotype( final Haplotype haplotype, final int activeRegionStart, final byte[] refWithPadding ) { - final Cigar cigar = haplotype.getCigar(); - final Cigar newCigar = new Cigar(); - byte[] newHaplotypeBases = haplotype.getBases(); - int refPos = activeRegionStart; - int hapPos = 0; - for( int iii = 0; iii < cigar.getCigarElements().size(); iii++ ) { - final CigarElement ce = cigar.getCigarElement(iii); - switch (ce.getOperator()) { - case M: - refPos += ce.getLength(); - hapPos += ce.getLength(); - newCigar.add(ce); - break; - case I: - hapPos += ce.getLength(); - newCigar.add(ce); - break; - case D: - if( iii == 0 || iii == cigar.getCigarElements().size() - 1 ) { - newHaplotypeBases = ArrayUtils.addAll( Arrays.copyOfRange(newHaplotypeBases, 0, hapPos), - ArrayUtils.addAll(Arrays.copyOfRange(refWithPadding, refPos, refPos + ce.getLength()), - Arrays.copyOfRange(newHaplotypeBases, hapPos, newHaplotypeBases.length))); - hapPos += ce.getLength(); - refPos += ce.getLength(); - newCigar.add(new CigarElement(ce.getLength(), CigarOperator.M)); - } else { - refPos += ce.getLength(); - newCigar.add(ce); - } - break; - default: - throw new IllegalStateException("Unsupported cigar operator detected: " + ce.getOperator()); - } - } - final Haplotype returnHaplotype = new Haplotype(newHaplotypeBases, haplotype.isReference()); - returnHaplotype.setCigar( newCigar ); - return returnHaplotype; - } - - /** - * We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal - * @param c the cigar to test - * @return true if we should skip over this path - */ - @Requires("c != null") - private boolean pathIsTooDivergentFromReference( final Cigar c ) { - for( final CigarElement ce : c.getCigarElements() ) { - if( 
ce.getOperator().equals(CigarOperator.N) ) { - return true; - } - } - return false; - } - - /** - * Left align the given cigar sequentially. This is needed because AlignmentUtils doesn't accept cigars with more than one indel in them. - * This is a target of future work to incorporate and generalize into AlignmentUtils for use by others. - * @param cigar the cigar to left align - * @param refSeq the reference byte array - * @param readSeq the read byte array - * @param refIndex 0-based alignment start position on ref - * @param readIndex 0-based alignment start position on read - * @return the left-aligned cigar - */ - @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"}) - protected Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { - final Cigar cigarToReturn = new Cigar(); - Cigar cigarToAlign = new Cigar(); - for (int i = 0; i < cigar.numCigarElements(); i++) { - final CigarElement ce = cigar.getCigarElement(i); - if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) { - cigarToAlign.add(ce); - final Cigar leftAligned = AlignmentUtils.leftAlignSingleIndel(cigarToAlign, refSeq, readSeq, refIndex, readIndex, false); - for ( final CigarElement toAdd : leftAligned.getCigarElements() ) { cigarToReturn.add(toAdd); } - refIndex += cigarToAlign.getReferenceLength(); - readIndex += cigarToAlign.getReadLength(); - cigarToAlign = new Cigar(); - } else { - cigarToAlign.add(ce); - } - } - if( !cigarToAlign.isEmpty() ) { - for( final CigarElement toAdd : cigarToAlign.getCigarElements() ) { - cigarToReturn.add(toAdd); - } - } - - final Cigar result = AlignmentUtils.consolidateCigar(cigarToReturn); - if( result.getReferenceLength() != cigar.getReferenceLength() ) - throw new IllegalStateException("leftAlignCigarSequentially failed to produce a valid CIGAR. Reference lengths differ. 
Initial cigar " + cigar + " left aligned into " + result); - return result; - } - - /** - * Take a haplotype which was generated by injecting an allele into a string of bases and run SW against the reference to determine the variants on the haplotype. - * Unfortunately since this haplotype didn't come from the assembly graph you can't straightforwardly use the bubble traversal algorithm to get this information. - * This is a target for future work as we rewrite the HaplotypeCaller to be more bubble-caller based. - * @param haplotype the candidate haplotype - * @param ref the reference bases to align against - * @param haplotypeList the current list of haplotypes - * @param activeRegionStart the start of the active region in the reference byte array - * @param activeRegionStop the stop of the active region in the reference byte array - * @param FORCE_INCLUSION_FOR_GGA_MODE if true will include in the list even if it already exists - * @return true if the candidate haplotype was successfully incorporated into the haplotype list - */ - @Requires({"ref != null", "ref.length >= activeRegionStop - activeRegionStart"}) - private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final List haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) { - if( haplotype == null ) { return false; } - - final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS ); - haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() ); - - if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments - return false; - } - - haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) ); - - final int hapStart = 
ReadUtils.getReadCoordinateForReferenceCoordinate(haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true); - int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true ); - if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) { - hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal - } - byte[] newHaplotypeBases; - // extend partial haplotypes to contain the full active region sequence - if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { - newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), - haplotype.getBases()), - ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) ); - } else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { - newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) ); - } else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { - newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) ); - } else { - newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop); - } - - final Haplotype h = new Haplotype( newHaplotypeBases ); - final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( 
ref, h.getBases(), SWParameterSet.STANDARD_NGS ); - - h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); - if ( haplotype.isArtificialHaplotype() ) { - h.setArtificialEvent(haplotype.getArtificialEvent()); - } - if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart || swConsensus2.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments - return false; - } - - h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0, true) ); - - if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) { - haplotypeList.add(h); - return true; - } else { - return false; - } + @Override + public String toString() { + return "DeBruijnAssembler{" + + "minKmer=" + minKmer + + '}'; } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 6ea543f25..33d1104bc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -68,6 +68,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; @@ -135,10 +136,14 @@ import java.util.*; 
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.LOCUS) @BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) -@ActiveRegionTraversalParameters(extension=200, maxRegion=300) +@ActiveRegionTraversalParameters(extension=100, maxRegion=300) @ReadFilters({HCMappingQualityFilter.class}) @Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible { + // ----------------------------------------------------------------------------------------------- + // general haplotype caller arguments + // ----------------------------------------------------------------------------------------------- + /** * A raw, unfiltered, highly sensitive callset in VCF format. */ @@ -185,64 +190,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; - /** - * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. - */ - @Advanced - @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) - public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; - - @Hidden - @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) - protected String keepRG = null; - - @Advanced - @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. 
Paths with <= X supporting kmers are pruned from the graph", required = false) - protected int MIN_PRUNE_FACTOR = 0; - - @Advanced - @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) - protected int gcpHMM = 10; - - @Advanced - @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) - protected int maxNumHaplotypesInPopulation = 25; - - @Advanced - @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false) - protected int minKmer = 11; - - /** - * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling - * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the - * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking - * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, - * and may make use of them in assembly and calling, where possible. 
- */ - @Hidden - @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) - protected boolean includeUnmappedReads = false; - - @Advanced - @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) - protected boolean USE_ALLELES_TRIGGER = false; - - @Advanced - @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false) - protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false; - - @Hidden - @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) - protected boolean justDetermineActiveRegions = false; - - @Hidden - @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) - protected boolean dontGenotype = false; - - @Hidden - @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) - protected boolean errorCorrectKmers = false; - /** * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. * dbSNP is not used in any way for the calculations themselves. 
@@ -282,10 +229,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); - @Advanced - @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) - protected boolean mergeVariantsViaLD = false; - /** * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. */ @@ -295,13 +238,139 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @ArgumentCollection private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); + // ----------------------------------------------------------------------------------------------- + // arguments to control internal behavior of the debruijn assembler + // ----------------------------------------------------------------------------------------------- + + @Advanced + @Argument(fullName="useDebruijnAssembler", shortName="useDebruijnAssembler", doc="If specified, we will use the old DeBruijn assembler. 
Depreciated as of 2.6", required = false) + protected boolean useDebruijnAssembler = false; + + @Advanced + @Argument(fullName="minKmerForDebruijnAssembler", shortName="minKmerForDebruijnAssembler", doc="Minimum kmer length to use in the debruijn assembly graph", required = false) + protected int minKmerForDebruijnAssembler = 11; + + @Advanced + @Argument(fullName="onlyUseKmerSizeForDebruijnAssembler", shortName="onlyUseKmerSizeForDebruijnAssembler", doc="If specified, we will only build kmer graphs with this kmer size in the debruijn", required = false) + protected int onlyUseKmerSizeForDebruijnAssembler = -1; + + // ----------------------------------------------------------------------------------------------- + // arguments to control internal behavior of the read threading assembler + // ----------------------------------------------------------------------------------------------- + + @Advanced + @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false) + protected List kmerSizes = Arrays.asList(10, 25); + + /** + * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype + * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the + * run of the haplotype caller we only take maxPathsPerSample * nSample paths from the graph, in order of their + * weights, no matter how many paths are possible to generate from the graph. Putting this number too low + * will result in dropping true variation because paths that include the real variant are not even considered. 
+ */ + @Advanced + @Argument(fullName="maxPathsPerSample", shortName="maxPathsPerSample", doc="Max number of paths to consider for the read threading assembler per sample.", required = false) + protected int maxPathsPerSample = 10; + + /** + * The minimum number of paths to advance forward for genotyping, regardless of the + * number of samples + */ + private final static int MIN_PATHS_PER_GRAPH = 128; + + @Hidden + @Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false) + protected boolean dontRecoverDanglingTails = false; + + // ----------------------------------------------------------------------------------------------- + // general advanced arguments to control haplotype caller behavior + // ----------------------------------------------------------------------------------------------- + + @Advanced + @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) + protected int MIN_PRUNE_FACTOR = 2; + + @Advanced + @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) + protected int gcpHMM = 10; + + /** + * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling + * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the + * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking + * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, + * and may make use of them in assembly and calling, where possible. 
+ */ + @Hidden + @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) + protected boolean includeUnmappedReads = false; + + @Advanced + @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) + protected boolean USE_ALLELES_TRIGGER = false; + + @Advanced + @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false) + protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false; + + /** + * The phredScaledGlobalReadMismappingRate reflects the average global mismapping rate of all reads, regardless of their + * mapping quality. This term affects the probability that a read originated from the reference haplotype, regardless of + * its edit distance from the reference, in that the read could have originated from the reference haplotype but + * from another location in the genome. Suppose a read has many mismatches from the reference, say like 5, but + * has a very high mapping quality of 60. Without this parameter, the read would contribute 5 * Q30 evidence + * in favor of its 5 mismatch haplotype compared to reference, potentially enough to make a call off that single + * read for all of these events. With this parameter set to Q30, though, the maximum evidence against the reference + * that this (and any) read could contribute against reference is Q30. 
+ * + * Set this term to any negative number to turn off the global mismapping rate + */ + @Advanced + @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false) + protected int phredScaledGlobalReadMismappingRate = 60; + + @Advanced + @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) + protected int maxNumHaplotypesInPopulation = 25; + + @Advanced + @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) + protected boolean mergeVariantsViaLD = false; + + // ----------------------------------------------------------------------------------------------- + // arguments for debugging / developing the haplotype caller + // ----------------------------------------------------------------------------------------------- + /** + * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. 
+ */ + @Hidden + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; + + @Hidden + @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) + protected String keepRG = null; + + @Hidden + @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) + protected boolean justDetermineActiveRegions = false; + + @Hidden + @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) + protected boolean dontGenotype = false; + + @Hidden + @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) + protected boolean errorCorrectKmers = false; + @Advanced @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) protected boolean DEBUG; - @Advanced + @Hidden @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) - protected int debugGraphTransformations = -1; + protected boolean debugGraphTransformations = false; @Hidden // TODO -- not currently useful @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) @@ -311,10 +380,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) protected boolean dontTrimActiveRegions = false; + @Hidden + @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) + protected boolean dontUseSoftClippedBases = false; + @Hidden @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) protected boolean allowCyclesInKmerGraphToGeneratePaths = false; + // ----------------------------------------------------------------------------------------------- + // done with Haplotype 
caller parameters + // ----------------------------------------------------------------------------------------------- // the UG engines private UnifiedGenotyperEngine UG_engine = null; @@ -344,12 +420,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // the maximum extent into the full active region extension that we're willing to go in genotyping our events private final static int MAX_GENOTYPING_ACTIVE_REGION_EXTENSION = 25; + private ActiveRegionTrimmer trimmer = null; + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument // bases with quality less than or equal to this value are trimmed off the tails of the reads private static final byte MIN_TAIL_QUALITY = 20; + // the minimum length of a read we'd consider using for genotyping + private final static int MIN_READ_LENGTH = 10; + private List samplesList = new ArrayList(); private final static double LOG_ONE_HALF = -Math.log10(2.0); private final static double LOG_ONE_THIRD = -Math.log10(3.0); @@ -373,6 +454,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // get all of the unique sample names Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); samplesList.addAll( samples ); + final int nSamples = samples.size(); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); @@ -428,14 +510,36 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - // setup the assembler - assemblyEngine = new 
DeBruijnAssembler(DEBUG, debugGraphTransformations, minKmer, allowCyclesInKmerGraphToGeneratePaths); + // create and setup the assembler + final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH); + assemblyEngine = useDebruijnAssembler + ? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler) + : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes); + assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); + assemblyEngine.setDebug(DEBUG); + assemblyEngine.setDebugGraphTransformations(debugGraphTransformations); + assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths); + assemblyEngine.setRecoverDanglingTails(!dontRecoverDanglingTails); + if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1); - likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); + // setup the likelihood calculation engine + if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1; + + // configure the global mismapping rate + final double log10GlobalReadMismappingRate; + if ( phredScaledGlobalReadMismappingRate < 0 ) { + log10GlobalReadMismappingRate = - Double.MAX_VALUE; + } else { + log10GlobalReadMismappingRate = QualityUtils.qualToErrorProbLog10(phredScaledGlobalReadMismappingRate); + logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units"); + } + + // create our likelihood calculation engine + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate ); final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? 
new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); @@ -443,6 +547,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if ( bamWriter != null ) haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); + + trimmer = new ActiveRegionTrimmer(DEBUG, PADDING_AROUND_SNPS_FOR_CALLING, PADDING_AROUND_OTHERS_FOR_CALLING, + MAX_GENOTYPING_ACTIVE_REGION_EXTENSION, getToolkit().getGenomeLocParser()); } //--------------------------------------------------------------------------------------------------------------- @@ -564,7 +671,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); // abort early if something is out of the acceptable range - if( assemblyResult.haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! + if( ! assemblyResult.isVariationPresent() ) { return 1; } // only the reference haplotype remains so nothing else to do! 
if (dontGenotype) return 1; // user requested we not proceed // filter out reads from genotyping which fail mapping quality based criteria @@ -613,12 +720,18 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final ActiveRegion regionForGenotyping; final byte[] fullReferenceWithPadding; final GenomeLoc paddedReferenceLoc; + final boolean variationPresent; - private AssemblyResult(List haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc) { + private AssemblyResult(List haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc, boolean variationPresent) { this.haplotypes = haplotypes; this.regionForGenotyping = regionForGenotyping; this.fullReferenceWithPadding = fullReferenceWithPadding; this.paddedReferenceLoc = paddedReferenceLoc; + this.variationPresent = variationPresent; + } + + public boolean isVariationPresent() { + return variationPresent && haplotypes.size() > 1; } } @@ -644,63 +757,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if ( ! 
dontTrimActiveRegions ) { return trimActiveRegion(activeRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc); } else { - // we don't want to or cannot create a trimmed active region, so go ahead and use the old one - return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc); + // we don't want to trim active regions, so go ahead and use the old one + return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true); } } - /** - * Trim down the active region to just enough to properly genotype the events among the haplotypes - * - * This function merely creates the region, but it doesn't populate the reads back into the region - * - * @param region our full active region - * @param haplotypes the list of haplotypes we've created from assembly - * @param ref the reference bases over the full padded location - * @param refLoc the span of the reference bases - * @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully - */ - private ActiveRegion createTrimmedRegion(final ActiveRegion region, final List haplotypes, final byte[] ref, final GenomeLoc refLoc) { - EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG); - final TreeSet allContexts = EventMap.getAllVariantContexts(haplotypes); - final GenomeLocParser parser = getToolkit().getGenomeLocParser(); - - if ( allContexts.isEmpty() ) // no variants, so just return the current region - return null; - - final List withinActiveRegion = new LinkedList(); - int pad = PADDING_AROUND_SNPS_FOR_CALLING; - GenomeLoc trimLoc = null; - for ( final VariantContext vc : allContexts ) { - final GenomeLoc vcLoc = parser.createGenomeLoc(vc); - if ( region.getLocation().overlapsP(vcLoc) ) { - if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding - pad = PADDING_AROUND_OTHERS_FOR_CALLING; - trimLoc = trimLoc == null ? 
vcLoc : trimLoc.endpointSpan(vcLoc); - withinActiveRegion.add(vc); - } - } - - // we don't actually have anything in the region after removing variants that don't overlap the region's full location - if ( trimLoc == null ) return null; - - final GenomeLoc maxSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(region.getLocation(), MAX_GENOTYPING_ACTIVE_REGION_EXTENSION); - final GenomeLoc idealSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(trimLoc, pad); - final GenomeLoc finalSpan = maxSpan.intersect(idealSpan); - - final ActiveRegion trimmedRegion = region.trim(finalSpan); - if ( DEBUG ) { - logger.info("events : " + withinActiveRegion); - logger.info("trimLoc : " + trimLoc); - logger.info("pad : " + pad); - logger.info("idealSpan : " + idealSpan); - logger.info("maxSpan : " + maxSpan); - logger.info("finalSpan : " + finalSpan); - logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size()); - } - return trimmedRegion; - } - /** * Trim down the active region to just enough to properly genotype the events among the haplotypes * @@ -709,17 +770,24 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * @param fullReferenceWithPadding the reference bases over the full padded location * @param paddedReferenceLoc the span of the reference bases * @return an AssemblyResult containing the trimmed active region with all of the reads we should use - * trimmed down as well, and a revised set of haplotypes. If trimming failed this function - * may choose to use the originalActiveRegion without modification + * trimmed down as well, and a revised set of haplotypes. If trimming down the active region results + * in only the reference haplotype over the non-extended active region, returns null. 
*/ private AssemblyResult trimActiveRegion(final ActiveRegion originalActiveRegion, final List haplotypes, final byte[] fullReferenceWithPadding, final GenomeLoc paddedReferenceLoc) { - final ActiveRegion trimmedActiveRegion = createTrimmedRegion(originalActiveRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc); + if ( DEBUG ) logger.info("Trimming active region " + originalActiveRegion + " with " + haplotypes.size() + " haplotypes"); - if ( trimmedActiveRegion == null ) - return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc); + EventMap.buildEventMapsForHaplotypes(haplotypes, fullReferenceWithPadding, paddedReferenceLoc, DEBUG); + final TreeSet allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypes); + final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalActiveRegion, allVariantsWithinFullActiveRegion); + + if ( trimmedActiveRegion == null ) { + // there were no variants found within the active region itself, so return the untrimmed result flagged as having no variation + if ( DEBUG ) logger.info("No variation found within the active region, skipping the region :-)"); + return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, false); + } // trim down the haplotypes final Set haplotypeSet = new HashSet(haplotypes.size()); @@ -738,8 +806,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() ); + if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " + trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); if ( DEBUG ) { - logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); for (
final Haplotype remaining: trimmedHaplotypes ) { logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar()); } @@ -757,7 +825,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem trimmedActiveRegion.clearReads(); trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads)); - return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc); + return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true); } /** @@ -821,15 +889,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { GATKSAMRecord clippedRead = useLowQualityBasesForAssembly ? postAdapterRead : ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); - // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches - // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't - // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion - // TODO -- w.r.t. the reference. 
What really needs to happen is that kmers that occur before the - // TODO -- reference haplotype start must be removed - clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); - - // uncomment to remove hard clips from consideration at all - //clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); + if ( dontUseSoftClippedBases ) { + // hard clip the soft-clipped bases so they are removed from consideration entirely + clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); + } else { + // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches + // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't + // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion + // TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the + // TODO -- reference haplotype start must be removed + clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); + } clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() ); if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { @@ -843,13 +913,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { - final List readsToRemove = new ArrayList(); + final List readsToRemove = new ArrayList<>(); +// logger.info("Filtering non-passing regions: n incoming " + activeRegion.getReads().size()); for( final GATKSAMRecord rec : activeRegion.getReads() ) { - if( rec.getReadLength() < 10 || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { + if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG !=
null && !rec.getReadGroup().getId().equals(keepRG)) ) { readsToRemove.add(rec); +// logger.info("\tremoving read " + rec + " len " + rec.getReadLength()); } } activeRegion.removeAll( readsToRemove ); +// logger.info("Filtered non-passing regions: n remaining " + activeRegion.getReads().size()); return readsToRemove; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java index a7194f85f..aad8407dd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java @@ -46,9 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; +import java.util.*; /** * generic utility class that counts kmers @@ -97,6 +95,20 @@ public class KMerCounter { return countsByKMer.values(); } + /** + * Get kmers that have minCount or greater in this counter + * @param minCount only return kmers with count >= this value + * @return a non-null collection of kmers + */ + public Collection getKmersWithCountsAtLeast(final int minCount) { + final List result = new LinkedList(); + for ( final CountedKmer countedKmer : getCountedKmers() ) { + if ( countedKmer.count >= minCount ) + result.add(countedKmer.kmer); + } + return result; + } + /** * Remove all current counts, resetting the counter to an empty state */ diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java index 9b0e1ac0a..745d4de06 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java @@ -149,6 +149,14 @@ public class Kmer 
{ return bases; } + /** + * Get a string representation of the bases of this kmer + * @return a non-null string + */ + public String baseString() { + return new String(bases()); + } + /** * The length of this kmer * @return an integer >= 0 diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 8697833a6..fbd9b29d5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -69,19 +69,33 @@ public class LikelihoodCalculationEngine { private static final double LOG_ONE_HALF = -Math.log10(2.0); private final byte constantGCP; + private final double log10globalReadMismappingRate; private final boolean DEBUG; private final PairHMM pairHMM; - private final int minReadLength = 20; /** * The expected rate of random sequencing errors for a read originating from its true haplotype. * * For example, if this is 0.01, then we'd expect 1 error per 100 bp. */ - private final double EXPECTED_ERROR_RATE_PER_BASE = 0.02; - - public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) { + private final static double EXPECTED_ERROR_RATE_PER_BASE = 0.02; + /** + * Create a new LikelihoodCalculationEngine using provided parameters and hmm to do its calculations + * + * @param constantGCP the gap continuation penalty to use with the PairHMM + * @param debug should we emit debugging information during the calculation? + * @param hmmType the type of the HMM to use + * @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. 
A value of + * -3 means that the chance that a read doesn't actually belong at this + * location in the genome is 1 in 1000. The effect of this parameter is + * to cap the maximum likelihood difference between the reference haplotype + * and the best alternative haplotype by -3 log units. So if the best + * haplotype is at -10 and this parameter has a value of -3 then even if the + * reference haplotype gets a score of -100 from the pairhmm it will be + * assigned a likelihood of -13. + */ + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate ) { switch (hmmType) { case EXACT: pairHMM = new Log10PairHMM(true); @@ -98,6 +112,11 @@ public class LikelihoodCalculationEngine { this.constantGCP = constantGCP; DEBUG = debug; + this.log10globalReadMismappingRate = log10globalReadMismappingRate; + } + + public LikelihoodCalculationEngine() { + this((byte)10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3); } /** @@ -134,7 +153,6 @@ public class LikelihoodCalculationEngine { // Add likelihoods for each sample's reads to our stratifiedReadMap final Map stratifiedReadMap = new HashMap(); for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { - //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); @@ -152,17 +170,16 @@ public class LikelihoodCalculationEngine { private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) { // first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time) final int numHaplotypes = haplotypes.size(); - final Map alleleVersions = new HashMap(numHaplotypes); + final Map 
alleleVersions = new HashMap<>(numHaplotypes); + Allele refAllele = null; for ( final Haplotype haplotype : haplotypes ) { - alleleVersions.put(haplotype, Allele.create(haplotype, true)); + final Allele allele = Allele.create(haplotype, true); + alleleVersions.put(haplotype, allele); + if ( haplotype.isReference() ) refAllele = allele; } final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); for( final GATKSAMRecord read : reads ) { - if ( read.getReadLength() < minReadLength ) - // don't consider any reads that have a read length < the minimum - continue; - final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read @@ -177,14 +194,34 @@ public class LikelihoodCalculationEngine { readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); } + // keep track of the reference likelihood and the best non-ref likelihood + double refLog10l = Double.NEGATIVE_INFINITY; + double bestNonReflog10L = Double.NEGATIVE_INFINITY; + + // iterate over all haplotypes, calculating the likelihood of the read for each haplotype for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { final Haplotype haplotype = haplotypes.get(jjj); final boolean isFirstHaplotype = jjj == 0; final double log10l = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype); + if ( haplotype.isNonReference() ) + bestNonReflog10L = Math.max(bestNonReflog10L, log10l); + else + refLog10l = log10l; + perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); } + + // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global + // mismapping rate. 
This protects us from the case where the assembly has produced haplotypes + // that are very divergent from reference, but are supported by only one read. In effect + // we are capping how badly the reference can score for any read by the chance that the read + // itself just doesn't belong here + final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; + if ( refLog10l < (worstRefLog10Allowed) ) { + perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed); + } } return perReadAlleleLikelihoodMap; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index 4c0483ad6..20b005b40 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -46,28 +46,388 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet; +import org.broadinstitute.variant.variantcontext.Allele; import
org.broadinstitute.variant.variantcontext.VariantContext; +import java.io.File; import java.io.PrintStream; -import java.util.List; +import java.util.*; /** - * Created by IntelliJ IDEA. + * Abstract base class for all HaplotypeCaller assemblers + * * User: ebanks * Date: Mar 14, 2011 */ public abstract class LocalAssemblyEngine { - public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8; + private final static Logger logger = Logger.getLogger(LocalAssemblyEngine.class); + + public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8; + private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30; + + protected final int numBestHaplotypesPerGraph; + + protected boolean debug = false; + protected boolean allowCyclesInKmerGraphToGeneratePaths = false; + protected boolean debugGraphTransformations = false; + protected boolean recoverDanglingTails = true; - protected PrintStream graphWriter = null; protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE; protected int pruneFactor = 2; protected boolean errorCorrectKmers = false; - protected LocalAssemblyEngine() { } + private PrintStream graphWriter = null; + + /** + * Create a new LocalAssemblyEngine with all default parameters, ready for use + * @param numBestHaplotypesPerGraph the number of haplotypes to generate for each assembled graph + */ + protected LocalAssemblyEngine(final int numBestHaplotypesPerGraph) { + if ( numBestHaplotypesPerGraph < 1 ) throw new IllegalArgumentException("numBestHaplotypesPerGraph should be >= 1 but got " + numBestHaplotypesPerGraph); + this.numBestHaplotypesPerGraph = numBestHaplotypesPerGraph; + } + + /** + * Main subclass function: given reads and a reference haplotype give us graphs to use for constructing + * non-reference haplotypes. 
+ * + * @param reads the reads we're going to assemble + * @param refHaplotype the reference haplotype + * @return a non-null list of graphs + */ + protected abstract List assemble(List reads, Haplotype refHaplotype); + + /** + * Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads + * @param activeRegion ActiveRegion object holding the reads which are to be used during assembly + * @param refHaplotype reference haplotype object + * @param fullReferenceWithPadding byte array holding the reference sequence with padding + * @param refLoc GenomeLoc object corresponding to the reference sequence with padding + * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode + * @return a non-empty list of all the haplotypes that are produced during assembly + */ + public List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List activeAllelesToGenotype) { + if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); } + if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); } + if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); } + if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } + + // create the graphs by calling our subclass assemble method + final List graphs = assemble(activeRegion.getReads(), refHaplotype); + + // do some QC on the graphs + for ( final SeqGraph graph : graphs ) { sanityCheckGraph(graph, refHaplotype); } + + // print the graphs if the appropriate debug option has been turned on + if ( graphWriter != null ) { printGraphs(graphs); } + + // find the best paths in the graphs and return them as haplotypes + return
findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); + } + + @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"}) + @Ensures({"result.contains(refHaplotype)"}) + protected List findBestPaths(final List graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow) { + // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes + final Set returnHaplotypes = new LinkedHashSet(); + refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart()); + final Cigar c = new Cigar(); + c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M)); + refHaplotype.setCigar(c); + returnHaplotypes.add( refHaplotype ); + + final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); + final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength(); + + // for GGA mode, add the desired allele into the haplotype + for( final VariantContext compVC : activeAllelesToGenotype ) { + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); + addHaplotypeForGGA( insertedRefHaplotype, refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true ); + } + } + + for( final SeqGraph graph : graphs ) { + final SeqVertex source = graph.getReferenceSourceVertex(); + final SeqVertex sink = graph.getReferenceSinkVertex(); + if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ 
graph); + + final KBestPaths pathFinder = new KBestPaths(allowCyclesInKmerGraphToGeneratePaths); + for ( final Path path : pathFinder.getKBestPaths(graph, numBestHaplotypesPerGraph, source, sink) ) { +// logger.info("Found path " + path); + Haplotype h = new Haplotype( path.getBases() ); + if( !returnHaplotypes.contains(h) ) { + final Cigar cigar = path.calculateCigar(refHaplotype.getBases()); + + if ( cigar == null ) { + // couldn't produce a meaningful alignment of haplotype to reference, fail quietly + continue; + } else if( cigar.isEmpty() ) { + throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength()); + } else if ( pathIsTooDivergentFromReference(cigar) || cigar.getReferenceLength() < MIN_HAPLOTYPE_REFERENCE_LENGTH ) { + // N cigar elements means that a bubble was too divergent from the reference so skip over this path + continue; + } else if( cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // SW failure + throw new IllegalStateException("Smith-Waterman alignment failure.
Cigar = " + cigar + " with reference length " + + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength() + + " ref = " + refHaplotype + " path " + new String(path.getBases())); + } + + h.setCigar(cigar); + h.setAlignmentStartHapwrtRef(activeRegionStart); + h.setScore(path.getScore()); + returnHaplotypes.add(h); + + if ( debug ) + logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize()); + + // for GGA mode, add the desired allele into the haplotype if it isn't already present + if( !activeAllelesToGenotype.isEmpty() ) { + final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place + for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present + final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); + + // This if statement used to additionally have: + // "|| !vcOnHaplotype.hasSameAllelesAs(compVC)" + // but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto + // a haplotype that already contains a 1bp insertion (so practically it is reference but + // falls into the bin for the 1bp deletion because we keep track of the artificial alleles). 
+ if( vcOnHaplotype == null ) { + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + addHaplotypeForGGA( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ); + } + } + } + } + } + } + } + + // add genome locs to the haplotypes + for ( final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow); + + if ( returnHaplotypes.size() < returnHaplotypes.size() ) + logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); + + if( debug ) { + if( returnHaplotypes.size() > 1 ) { + logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); + } else { + logger.info("Found only the reference haplotype in the assembly graph."); + } + for( final Haplotype h : returnHaplotypes ) { + logger.info( h.toString() ); + logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() + " ref " + h.isReference()); + } + } + + return new ArrayList(returnHaplotypes); + } + + /** + * We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal + * @param c the cigar to test + * @return true if we should skip over this path + */ + @Requires("c != null") + private boolean pathIsTooDivergentFromReference( final Cigar c ) { + for( final CigarElement ce : c.getCigarElements() ) { + if( ce.getOperator().equals(CigarOperator.N) ) { + return true; + } + } + return false; + } + + /** + * Take a haplotype which was generated by injecting an allele into a string of bases and run SW against the reference to determine the variants on the haplotype. 
+ * Unfortunately since this haplotype didn't come from the assembly graph you can't straightforwardly use the bubble traversal algorithm to get this information. + * This is a target for future work as we rewrite the HaplotypeCaller to be more bubble-caller based. + * @param haplotype the candidate haplotype + * @param ref the reference bases to align against + * @param haplotypeList the current list of haplotypes + * @param activeRegionStart the start of the active region in the reference byte array + * @param activeRegionStop the stop of the active region in the reference byte array + * @param FORCE_INCLUSION_FOR_GGA_MODE if true will include in the list even if it already exists + * @return true if the candidate haplotype was successfully incorporated into the haplotype list + */ + @Requires({"ref != null", "ref.length >= activeRegionStop - activeRegionStart"}) + private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final Set haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) { + if( haplotype == null ) { return false; } + + final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS ); + haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() ); + + if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments + return false; + } + + haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) ); + + final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate(haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true); + int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( 
haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true ); + if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) { + hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal + } + byte[] newHaplotypeBases; + // extend partial haplotypes to contain the full active region sequence + if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + newHaplotypeBases = ArrayUtils.addAll(ArrayUtils.addAll(ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), + haplotype.getBases()), + ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop)); + } else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) ); + } else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) ); + } else { + newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop); + } + + final Haplotype h = new Haplotype( newHaplotypeBases ); + final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS ); + + h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); + if ( haplotype.isArtificialHaplotype() ) { + h.setArtificialEvent(haplotype.getArtificialEvent()); + } + if( 
swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart || swConsensus2.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments + return false; + } + + h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0, true) ); + + if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) { + haplotypeList.add(h); + return true; + } else { + return false; + } + } + + protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) { + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); + + // TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm + // TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect + // TODO -- to anything from one that's actually has good support along the chain but just happens + // TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately + // TODO -- the pruning algorithm really should be an error correction algorithm that knows more + // TODO -- about the structure of the data and can differentiate between an infrequent path but + // TODO -- without evidence against it (such as occurs when a region is hard to get any reads through) + // TODO -- from a error with lots of weight going along another similar path + // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive + seqGraph.zipLinearChains(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); + + // now go through and prune the graph, removing vertices no longer connected to the reference chain + // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight + // edges to maintain graph connectivity. 
+ seqGraph.pruneGraph(pruneFactor); + seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); + + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor); + seqGraph.simplifyGraph(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor); + + // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can + // happen in cases where for example the reference somehow manages to acquire a cycle, or + // where the entire assembly collapses back into the reference sequence. + if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null ) + return null; + + seqGraph.removePathsNotConnectedToRef(); + seqGraph.simplifyGraph(); + if ( seqGraph.vertexSet().size() == 1 ) { + // we've perfectly assembled into a single reference haplotype, add a empty seq vertex to stop + // the code from blowing up. + // TODO -- ref properties should really be on the vertices, not the graph itself + final SeqVertex complete = seqGraph.vertexSet().iterator().next(); + final SeqVertex dummy = new SeqVertex(""); + seqGraph.addVertex(dummy); + seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); + } + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor); + + return seqGraph; + } + + /** + * Perform general QC on the graph to make sure something hasn't gone wrong during assembly + * @param graph the graph to check + * @param refHaplotype the reference haplotype + * @param + */ + private void sanityCheckGraph(final BaseGraph graph, final Haplotype refHaplotype) { + sanityCheckReferenceGraph(graph, refHaplotype); + } + + /** + * Make sure the reference sequence is properly represented in the provided graph + * + * @param graph the graph to check + * @param refHaplotype the reference haplotype + * @param + */ + private void sanityCheckReferenceGraph(final BaseGraph 
graph, final Haplotype refHaplotype) { + if( graph.getReferenceSourceVertex() == null ) { + throw new IllegalStateException("All reference graphs must have a reference source vertex."); + } + if( graph.getReferenceSinkVertex() == null ) { + throw new IllegalStateException("All reference graphs must have a reference sink vertex."); + } + if( !Arrays.equals(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true), refHaplotype.getBases()) ) { + throw new IllegalStateException("Mismatch between the reference haplotype and the reference assembly graph path. for graph " + graph + + " graph = " + new String(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true)) + + " haplotype = " + new String(refHaplotype.getBases()) + ); + } + } + + /** + * Print the generated graphs to the graphWriter + * @param graphs a non-null list of graphs to print out + */ + private void printGraphs(final List graphs) { + final int writeFirstGraphWithSizeSmallerThan = 50; + + graphWriter.println("digraph assemblyGraphs {"); + for( final SeqGraph graph : graphs ) { + if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { + logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); + continue; + } + + graph.printGraph(graphWriter, false, pruneFactor); + + if ( debugGraphTransformations ) + break; + } + + graphWriter.println("}"); + } + + // ----------------------------------------------------------------------------------------------- + // + // getter / setter routines for generic assembler properties + // + // ----------------------------------------------------------------------------------------------- public int getPruneFactor() { return pruneFactor; @@ -85,10 +445,6 @@ public abstract class LocalAssemblyEngine { this.errorCorrectKmers = errorCorrectKmers; } - public PrintStream getGraphWriter() { - return graphWriter; - } - public void 
setGraphWriter(PrintStream graphWriter) { this.graphWriter = graphWriter; } @@ -101,5 +457,35 @@ public abstract class LocalAssemblyEngine { this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; } - public abstract List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List activeAllelesToGenotype); + public boolean isDebug() { + return debug; + } + + public void setDebug(boolean debug) { + this.debug = debug; + } + + public boolean isAllowCyclesInKmerGraphToGeneratePaths() { + return allowCyclesInKmerGraphToGeneratePaths; + } + + public void setAllowCyclesInKmerGraphToGeneratePaths(boolean allowCyclesInKmerGraphToGeneratePaths) { + this.allowCyclesInKmerGraphToGeneratePaths = allowCyclesInKmerGraphToGeneratePaths; + } + + public boolean isDebugGraphTransformations() { + return debugGraphTransformations; + } + + public void setDebugGraphTransformations(boolean debugGraphTransformations) { + this.debugGraphTransformations = debugGraphTransformations; + } + + public boolean isRecoverDanglingTails() { + return recoverDanglingTails; + } + + public void setRecoverDanglingTails(boolean recoverDanglingTails) { + this.recoverDanglingTails = recoverDanglingTails; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java index be5a431c4..a6ef0d1c2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java @@ -76,12 +76,10 @@ public class BaseEdge { } /** - * Copy constructor - * - * @param toCopy + * Create a new copy of this BaseEdge */ - public BaseEdge(final BaseEdge toCopy) { - this(toCopy.isRef(), toCopy.getMultiplicity()); + public BaseEdge copy() { + return new BaseEdge(isRef(), 
getMultiplicity()); } /** @@ -92,6 +90,34 @@ public class BaseEdge { return multiplicity; } + /** + * Get the DOT format label for this edge, to be displayed when printing this edge to a DOT file + * @return a non-null string + */ + public String getDotLabel() { + return Integer.toString(getMultiplicity()); + } + + /** + * Increase the multiplicity of this edge by incr + * @param incr the change in this multiplicity, must be >= 0 + */ + public void incMultiplicity(final int incr) { + if ( incr < 0 ) throw new IllegalArgumentException("incr must be >= 0 but got " + incr); + multiplicity += incr; + } + + /** + * A special assessor that returns the multiplicity that should be used by pruning algorithm + * + * Can be overloaded by subclasses + * + * @return the multiplicity value that should be used for pruning + */ + public int getPruningMultiplicity() { + return getMultiplicity(); + } + /** * Set the multiplicity of this edge to value * @param value an integer >= 0 @@ -117,23 +143,6 @@ public class BaseEdge { this.isRef = isRef; } - /** - * Does this and edge have the same source and target vertices in graph? - * - * @param graph the graph containing both this and edge - * @param edge our comparator edge - * @param - * @return true if we have the same source and target vertices - */ - public boolean hasSameSourceAndTarget(final BaseGraph graph, final BaseEdge edge) { - return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge))); - } - - // For use when comparing edges across graphs! 
- public boolean seqEquals( final BaseGraph graph, final BaseEdge edge, final BaseGraph graph2 ) { - return (graph.getEdgeSource(this).seqEquals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).seqEquals(graph2.getEdgeTarget(edge))); - } - /** * Sorts a collection of BaseEdges in decreasing order of weight, so that the most * heavily weighted is at the start of the list @@ -187,4 +196,12 @@ public class BaseEdge { if ( edge == null ) throw new IllegalArgumentException("edge cannot be null"); return new BaseEdge(isRef() || edge.isRef(), Math.max(getMultiplicity(), edge.getMultiplicity())); } + + @Override + public String toString() { + return "BaseEdge{" + + "multiplicity=" + multiplicity + + ", isRef=" + isRef + + '}'; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index 7ce57e2e7..8938af7c2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -66,34 +66,16 @@ import java.util.*; * Date: 2/6/13 */ @Invariant("!this.isAllowingMultipleEdges()") -public class BaseGraph extends DefaultDirectedGraph { +public class BaseGraph extends DefaultDirectedGraph { protected final static Logger logger = Logger.getLogger(BaseGraph.class); private final int kmerSize; - /** - * Construct an empty BaseGraph - */ - public BaseGraph() { - this(11); - } - - /** - * Edge factory that creates non-reference multiplicity 1 edges - * @param the new of our vertices - */ - private static class MyEdgeFactory implements EdgeFactory { - @Override - public BaseEdge createEdge(T sourceVertex, T targetVertex) { - return new BaseEdge(false, 1); - } - } - /** * Construct a DeBruijnGraph with kmerSize * @param kmerSize */ - public BaseGraph(final int kmerSize) { - super(new 
MyEdgeFactory()); + public BaseGraph(final int kmerSize, final EdgeFactory edgeFactory) { + super(edgeFactory); if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize); this.kmerSize = kmerSize; @@ -111,7 +93,7 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph getSources() { - final Set set = new LinkedHashSet(); - for ( final T v : vertexSet() ) + public Set getSources() { + final Set set = new LinkedHashSet(); + for ( final V v : vertexSet() ) if ( isSource(v) ) set.add(v); return set; @@ -153,9 +135,9 @@ public class BaseGraph extends DefaultDirectedGraph getSinks() { - final Set set = new LinkedHashSet(); - for ( final T v : vertexSet() ) + public Set getSinks() { + final Set set = new LinkedHashSet(); + for ( final V v : vertexSet() ) if ( isSink(v) ) set.add(v); return set; @@ -167,7 +149,7 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph vertices) { - for ( final T v : vertices ) + public void addVertices(final Collection vertices) { + for ( final V v : vertices ) addVertex(v); } @@ -349,8 +341,12 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph outgoingVerticesOf(final T v) { - final Set s = new LinkedHashSet(); - for ( final BaseEdge e : outgoingEdgesOf(v) ) { + public Set outgoingVerticesOf(final V v) { + final Set s = new LinkedHashSet(); + for ( final E e : outgoingEdgesOf(v) ) { s.add(getEdgeTarget(e)); } return s; @@ -384,9 +380,9 @@ public class BaseGraph extends DefaultDirectedGraph v */ - public Set incomingVerticesOf(final T v) { - final Set s = new 
LinkedHashSet(); - for ( final BaseEdge e : incomingEdgesOf(v) ) { + public Set incomingVerticesOf(final V v) { + final Set s = new LinkedHashSet(); + for ( final E e : incomingEdgesOf(v) ) { s.add(getEdgeSource(e)); } return s; @@ -413,15 +409,16 @@ public class BaseGraph extends DefaultDirectedGraph " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); + for( final E edge : edgeSet() ) { + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getDotLabel() + "\"];"); if( edge.isRef() ) { graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); } } - for( final T v : vertexSet() ) { - graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + "\",shape=box]"); + for( final V v : vertexSet() ) { +// graphWriter.println("\t" + v.toString() + " [label=\"" + v + "\",shape=box]"); + graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + v.additionalInfo() + "\",shape=box]"); } if ( writeHeader ) @@ -439,10 +436,10 @@ public class BaseGraph extends DefaultDirectedGraph edgesToCheck = new HashSet(); + final Set edgesToCheck = new HashSet(); edgesToCheck.addAll(incomingEdgesOf(getReferenceSourceVertex())); while( !edgesToCheck.isEmpty() ) { - final BaseEdge e = edgesToCheck.iterator().next(); + final E e = edgesToCheck.iterator().next(); if( !e.isRef() ) { edgesToCheck.addAll( incomingEdgesOf(getEdgeSource(e)) ); removeEdge(e); @@ -452,7 +449,7 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph edgesToRemove = new ArrayList(); - for( final BaseEdge e : edgeSet() ) { - if( 
e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor + final List edgesToRemove = new ArrayList<>(); + for( final E e : edgeSet() ) { + if( e.getPruningMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor edgesToRemove.add(e); } } @@ -480,13 +477,25 @@ public class BaseGraph extends DefaultDirectedGraph pruner = new LowWeightChainPruner<>(pruneFactor); + pruner.pruneLowWeightChains(this); + } + /** * Remove all vertices in the graph that have in and out degree of 0 */ protected void removeSingletonOrphanVertices() { // Run through the graph and clean up singular orphaned nodes - final List verticesToRemove = new LinkedList(); - for( final T v : vertexSet() ) { + final List verticesToRemove = new LinkedList<>(); + for( final V v : vertexSet() ) { if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) { verticesToRemove.add(v); } @@ -499,11 +508,11 @@ public class BaseGraph extends DefaultDirectedGraph toRemove = new HashSet(vertexSet()); + final HashSet toRemove = new HashSet<>(vertexSet()); - final T refV = getReferenceSourceVertex(); + final V refV = getReferenceSourceVertex(); if ( refV != null ) { - for ( final T v : new BaseGraphIterator(this, refV, true, true) ) { + for ( final V v : new BaseGraphIterator<>(this, refV, true, true) ) { toRemove.remove(v); } } @@ -524,22 +533,31 @@ public class BaseGraph extends DefaultDirectedGraph onPathFromRefSource = new HashSet(vertexSet().size()); - for ( final T v : new BaseGraphIterator(this, getReferenceSourceVertex(), false, true) ) { + final Set onPathFromRefSource = new HashSet<>(vertexSet().size()); + for ( final V v : new BaseGraphIterator<>(this, getReferenceSourceVertex(), false, true) ) { onPathFromRefSource.add(v); } // get the set of vertices we can reach by going backward from the ref sink - final Set onPathFromRefSink = new HashSet(vertexSet().size()); - 
for ( final T v : new BaseGraphIterator(this, getReferenceSinkVertex(), true, false) ) { + final Set onPathFromRefSink = new HashSet<>(vertexSet().size()); + for ( final V v : new BaseGraphIterator<>(this, getReferenceSinkVertex(), true, false) ) { onPathFromRefSink.add(v); } // we want to remove anything that's not in both the sink and source sets - final Set verticesToRemove = new HashSet(vertexSet()); + final Set verticesToRemove = new HashSet<>(vertexSet()); onPathFromRefSource.retainAll(onPathFromRefSink); verticesToRemove.removeAll(onPathFromRefSource); removeAllVertices(verticesToRemove); + + // simple santity checks that this algorithm is working. + if ( getSinks().size() > 1 ) { + throw new IllegalStateException("Should have eliminated all but the reference sink, but found " + getSinks()); + } + + if ( getSources().size() > 1 ) { + throw new IllegalStateException("Should have eliminated all but the reference source, but found " + getSources()); + } } /** @@ -555,11 +573,11 @@ public class BaseGraph extends DefaultDirectedGraph the type of the nodes in those graphs * @return true if g1 and g2 are equals */ - public static boolean graphEquals(final BaseGraph g1, BaseGraph g2) { + public static boolean graphEquals(final BaseGraph g1, BaseGraph g2) { final Set vertices1 = g1.vertexSet(); final Set vertices2 = g2.vertexSet(); - final Set edges1 = g1.edgeSet(); - final Set edges2 = g2.edgeSet(); + final Set edges1 = g1.edgeSet(); + final Set edges2 = g2.edgeSet(); if ( vertices1.size() != vertices2.size() || edges1.size() != edges2.size() ) return false; @@ -571,29 +589,35 @@ public class BaseGraph extends DefaultDirectedGraph graph2 ) { + return (this.getEdgeSource(edge1).seqEquals(graph2.getEdgeSource(edge2))) && (this.getEdgeTarget(edge1).seqEquals(graph2.getEdgeTarget(edge2))); + } + + /** * Get the incoming edge of v. 
Requires that there be only one such edge or throws an error * @param v our vertex * @return the single incoming edge to v, or null if none exists */ - public BaseEdge incomingEdgeOf(final T v) { + public E incomingEdgeOf(final V v) { return getSingletonEdge(incomingEdgesOf(v)); } @@ -602,7 +626,7 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph edges) { + private E getSingletonEdge(final Collection edges) { if ( edges.size() > 1 ) throw new IllegalArgumentException("Cannot get a single incoming edge for a vertex with multiple incoming edges " + edges); return edges.isEmpty() ? null : edges.iterator().next(); } @@ -625,12 +649,19 @@ public class BaseGraph extends DefaultDirectedGraph { +public final class DeBruijnGraph extends BaseGraph { + /** + * Edge factory that creates non-reference multiplicity 1 edges + */ + private static class MyEdgeFactory implements EdgeFactory { + @Override + public BaseEdge createEdge(DeBruijnVertex sourceVertex, DeBruijnVertex targetVertex) { + return new BaseEdge(false, 1); + } + } + /** * Create an empty DeBruijnGraph with default kmer size */ public DeBruijnGraph() { - super(); + this(11); } /** @@ -71,7 +82,7 @@ public final class DeBruijnGraph extends BaseGraph { * @param kmerSize kmer size, must be >= 1 */ public DeBruijnGraph(int kmerSize) { - super(kmerSize); + super(kmerSize, new MyEdgeFactory()); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java index c240949d9..4d9441efe 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java @@ -54,7 +54,7 @@ import com.google.java.contract.Ensures; * User: ebanks, mdepristo * Date: Mar 23, 2011 */ -public final class 
DeBruijnVertex extends BaseVertex { +public class DeBruijnVertex extends BaseVertex { private final static byte[][] sufficesAsByteArray = new byte[256][]; static { for ( int i = 0; i < sufficesAsByteArray.length; i++ ) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java index 30c5be190..4aa6047a9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.collections.PrimitivePair; import java.util.ArrayList; import java.util.Collection; @@ -60,7 +61,7 @@ import java.util.List; * Date: 3/25/13 * Time: 9:42 PM */ -final class GraphUtils { +final public class GraphUtils { private GraphUtils() {} /** @@ -135,4 +136,49 @@ final class GraphUtils { return min; } + /** + * Find the ending position of the longest uniquely matching + * run of bases of kmer in seq. + * + * for example, if seq = ACGT and kmer is NAC, this function returns 1,2 as we have the following + * match: + * + * 0123 + * .ACGT + * NAC.. 
+ * + * @param seq a non-null sequence of bytes + * @param kmer a non-null kmer + * @return the ending position and length where kmer matches uniquely in sequence, or null if no + * unique longest match can be found + */ + public static PrimitivePair.Int findLongestUniqueSuffixMatch(final byte[] seq, final byte[] kmer) { + int longestPos = -1; + int length = 0; + boolean foundDup = false; + + for ( int i = 0; i < seq.length; i++ ) { + final int matchSize = longestSuffixMatch(seq, kmer, i); + if ( matchSize > length ) { + longestPos = i; + length = matchSize; + foundDup = false; + } else if ( matchSize == length ) { + foundDup = true; + } + } + + return foundDup ? null : new PrimitivePair.Int(longestPos, length); + } + + private static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) { + for ( int len = 1; len <= kmer.length; len++ ) { + final int seqI = seqStart - len + 1; + final int kmerI = kmer.length - len; + if ( seqI < 0 || seq[seqI] != kmer[kmerI] ) { + return len - 1; + } + } + return kmer.length; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java index 466148588..3ba85dd92 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java @@ -59,7 +59,7 @@ import java.util.*; * User: ebanks, rpoplin, mdepristo * Date: Mar 23, 2011 */ -public class KBestPaths { +public class KBestPaths { private final boolean allowCycles; /** @@ -93,7 +93,7 @@ public class KBestPaths { /** * @see #getKBestPaths(BaseGraph, int) retriving the best 1000 paths */ - public List> getKBestPaths( final BaseGraph graph ) { + public List> getKBestPaths( final BaseGraph graph ) { return getKBestPaths(graph, 1000); } @@ -101,28 +101,28 @@ public class 
KBestPaths { * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) retriving the first 1000 paths * starting from all source vertices and ending with all sink vertices */ - public List> getKBestPaths( final BaseGraph graph, final int k ) { + public List> getKBestPaths( final BaseGraph graph, final int k ) { return getKBestPaths(graph, k, graph.getSources(), graph.getSinks()); } /** * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000 */ - public List> getKBestPaths( final BaseGraph graph, final Set sources, final Set sinks ) { + public List> getKBestPaths( final BaseGraph graph, final Set sources, final Set sinks ) { return getKBestPaths(graph, 1000, sources, sinks); } /** * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000 */ - public List> getKBestPaths( final BaseGraph graph, final T source, final T sink ) { + public List> getKBestPaths( final BaseGraph graph, final T source, final T sink ) { return getKBestPaths(graph, 1000, source, sink); } /** * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with singleton source and sink sets */ - public List> getKBestPaths( final BaseGraph graph, final int k, final T source, final T sink ) { + public List> getKBestPaths( final BaseGraph graph, final int k, final T source, final T sink ) { return getKBestPaths(graph, k, Collections.singleton(source), Collections.singleton(sink)); } @@ -136,20 +136,20 @@ public class KBestPaths { * @return a list with at most k top-scoring paths from the graph */ @Ensures({"result != null", "result.size() <= k"}) - public List> getKBestPaths( final BaseGraph graph, final int k, final Set sources, final Set sinks ) { + public List> getKBestPaths( final BaseGraph graph, final int k, final Set sources, final Set sinks ) { if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); } // a min max queue that will collect the best k paths - final MinMaxPriorityQueue> 
bestPaths = MinMaxPriorityQueue.orderedBy(new PathComparatorTotalScore()).maximumSize(k).create(); + final MinMaxPriorityQueue> bestPaths = MinMaxPriorityQueue.orderedBy(new PathComparatorTotalScore()).maximumSize(k).create(); // run a DFS for best paths for ( final T source : sources ) { - final Path startingPath = new Path(source, graph); + final Path startingPath = new Path(source, graph); findBestPaths(startingPath, sinks, bestPaths, new MyInt()); } // the MinMaxPriorityQueue iterator returns items in an arbitrary order, so we need to sort the final result - final List> toReturn = new ArrayList>(bestPaths); + final List> toReturn = new ArrayList>(bestPaths); Collections.sort(toReturn, new PathComparatorTotalScore()); return toReturn; } @@ -161,21 +161,21 @@ public class KBestPaths { * @param bestPaths a path to collect completed paths. * @param n used to limit the search by tracking the number of vertices visited across all paths */ - private void findBestPaths( final Path path, final Set sinks, final Collection> bestPaths, final MyInt n ) { + private void findBestPaths( final Path path, final Set sinks, final Collection> bestPaths, final MyInt n ) { if ( sinks.contains(path.getLastVertex())) { bestPaths.add(path); } else if( n.val > 10000 ) { // do nothing, just return, as we've done too much work already } else { // recursively run DFS - final ArrayList edgeArrayList = new ArrayList(path.getOutgoingEdgesOfLastVertex()); + final ArrayList edgeArrayList = new ArrayList(path.getOutgoingEdgesOfLastVertex()); Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator()); - for ( final BaseEdge edge : edgeArrayList ) { + for ( final E edge : edgeArrayList ) { final T target = path.getGraph().getEdgeTarget(edge); // make sure the edge is not already in the path final boolean alreadyVisited = allowCycles ? path.containsEdge(edge) : path.containsVertex(target); if ( ! 
alreadyVisited ) { - final Path newPath = new Path(path, edge); + final Path newPath = new Path(path, edge); n.val++; findBestPaths(newPath, sinks, bestPaths, n); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java new file mode 100644 index 000000000..7327b5736 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java @@ -0,0 +1,170 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import java.util.*; + +/** + * Prunes low-weight linear chains from a BaseGraph. + * Removes all chains from the graph where every edge in the path is a non-reference edge with pruning multiplicity < pruneFactor + * + * Unlike pruneGraph, this function will remove only linear chains in the graph where all edges have weight < pruneFactor. 
+ * + * For example, A -[1]> B -[1]> C -[1]> D would be removed with pruneFactor 2 + * but A -[1]> B -[2]> C -[1]> D would not be because the linear chain includes an edge with weight >= 2 + * + * User: depristo + * Date: 5/2/13 + * Time: 10:38 AM + */ +public class LowWeightChainPruner { + private final int pruneFactor; + + public LowWeightChainPruner(int pruneFactor) { + if ( pruneFactor < 0 ) throw new IllegalArgumentException("pruneFactor must be >= 0 but got " + pruneFactor); + this.pruneFactor = pruneFactor; + } + + /** + * Prune graph in place: remove every edge that is not on a linear chain that must be kept, then call removeSingletonOrphanVertices(); a no-op when pruneFactor is 0 + * @param graph the graph to prune, cannot be null + */ + public void pruneLowWeightChains(final BaseGraph graph) { + if ( graph == null ) throw new IllegalArgumentException("Graph cannot be null"); + + if ( pruneFactor > 0 ) { + final Set edgesToKeep = new LinkedHashSet<>(); + + for ( final Path linearChain : getLinearChains(graph) ) { + if( mustBeKeep(linearChain, pruneFactor) ) { + // we must keep edges in any path that contains a reference edge or an edge with weight >= pruneFactor + edgesToKeep.addAll(linearChain.getEdges()); + } + } + + // we want to remove all edges not in the keep set + final Set edgesToRemove = new HashSet<>(graph.edgeSet()); + edgesToRemove.removeAll(edgesToKeep); + graph.removeAllEdges(edgesToRemove); + + graph.removeSingletonOrphanVertices(); + } + } + + /** + * Decide whether a linear chain must be kept: true as soon as one edge is a reference edge or has pruning multiplicity >= pruneFactor + * @return true if the chain must be kept, false if all of its edges may be pruned + */ + private boolean mustBeKeep(final Path path, final int pruneFactor) { + for ( final E edge : path.getEdges() ) { + if ( edge.getPruningMultiplicity() >= pruneFactor || edge.isRef() ) + return true; + } + return false; + } + + /** + * Get all of the linear chains in graph + * + * A linear chain is a series of vertices that starts from either a source or a vertex with in-degree or + * out-degree > 1 and extends along outgoing edges through each following vertex for as long as that + * vertex has in-degree <= 1 and out-degree == 1 (the first vertex that fails this test ends the chain). 
+ * + * @param graph the graph + * @return a non-null collection of paths in graph + */ + protected final Collection> getLinearChains(final BaseGraph graph) { + final Set chainStarts = new LinkedHashSet<>(); + + for ( final V v : graph.vertexSet() ) { + // we want a list of all chain start vertices. These are all vertices with in or out + // degree > 1, or all source vertices that have at least one outgoing edge. + final int outDegree = graph.outDegreeOf(v); + final int inDegree = graph.inDegreeOf(v); + if ( outDegree > 1 || inDegree > 1 || (inDegree == 0 && outDegree > 0)) // don't add isolated vertices + chainStarts.add(v); + } + + // build the chains only after all starts are collected; chainStarts is a set, so each start is processed once + final List> linearChains = new LinkedList<>(); + for ( final V chainStart : chainStarts ) { + for ( final E outEdge : graph.outgoingEdgesOf(chainStart) ) { + // these chains are composed of the starts + their next vertices + linearChains.add(extendLinearChain(new Path<>(new Path<>(chainStart, graph), outEdge))); + } + } + + return linearChains; + } + + /** + * Recursively extend path while its last vertex has in-degree <= 1 and out-degree == 1, stopping before a vertex would be revisited + * @param path the path to extend + * @return a fully extended linear path + */ + protected final Path extendLinearChain(final Path path) { + final V last = path.getLastVertex(); + final Set outEdges = path.getGraph().outgoingEdgesOf(last); + + final int outDegree = outEdges.size(); + final int inDegree = path.getGraph().inDegreeOf(last); + + if ( outDegree != 1 || inDegree > 1 ) { + // last is a sink, a branch point (out-degree > 1) or a merge point (in-degree > 1), so we are done with the linear path + return path; + } else { + final V next = path.getGraph().getEdgeTarget(outEdges.iterator().next()); + if ( path.containsVertex(next) ) { + // following the only outgoing edge would revisit a vertex already in the path, so stop here + return path; + } else { + // we now know that last has outdegree == 1, so we keep extending the chain + return extendLinearChain(new Path<>(path, outEdges.iterator().next())); + } + } + } +} diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java new file mode 100644 index 000000000..c1937e5c8 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java @@ -0,0 +1,123 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +/** + * edge class for connecting nodes in the graph that tracks some per-sample information + * + * This class extends BaseEdge with the additional functionality of tracking the maximum + * multiplicity seen within any single sample. 
The workflow for using this class is: + * + * MultiSampleEdge e = new MultiSampleEdge(ref, 1) + * e.incMultiplicity(1) // total is 2, per sample is 2, max per sample is 1 + * e.getPruningMultiplicity() // = 1 + * e.flushSingleSampleMultiplicity() // total is 2, per sample is 0, max per sample is 2 + * e.getPruningMultiplicity() // = 2 + * e.incMultiplicity(3) // total is 5, per sample is 3, max per sample is 2 + * e.getPruningMultiplicity() // = 2 + * e.flushSingleSampleMultiplicity() // total is 5, per sample is 0, max per sample is 3 + * e.getPruningMultiplicity() // = 3 + */ +public class MultiSampleEdge extends BaseEdge { + private int maxSingleSampleMultiplicity, currentSingleSampleMultiplicity; + + /** + * Create a new MultiSampleEdge with weight multiplicity that, if isRef == true, indicates a path through the reference + * Both the current and the max single sample multiplicity start out equal to multiplicity + * @param isRef indicates whether this edge is a path through the reference + * @param multiplicity the number of observations of this edge in this sample + */ + public MultiSampleEdge(final boolean isRef, final int multiplicity) { + super(isRef, multiplicity); + maxSingleSampleMultiplicity = multiplicity; + currentSingleSampleMultiplicity = multiplicity; + } + + @Override + public MultiSampleEdge copy() { + return new MultiSampleEdge(isRef(), getMultiplicity()); // NOTE: per-sample counters are not copied; in the copy both start at getMultiplicity(). TODO -- should I copy values for other features? + } + + /** + * Update the max single sample multiplicity if the current single sample multiplicity exceeds it (the max is never lowered), and + * reset the current single sample multiplicity to 0. 
+ */ + public void flushSingleSampleMultiplicity() { + if ( currentSingleSampleMultiplicity > maxSingleSampleMultiplicity ) + maxSingleSampleMultiplicity = currentSingleSampleMultiplicity; + currentSingleSampleMultiplicity = 0; + } + + @Override + public void incMultiplicity(final int incr) { + super.incMultiplicity(incr); + currentSingleSampleMultiplicity += incr; + } + + @Override + public int getPruningMultiplicity() { + return getMaxSingleSampleMultiplicity(); + } + + @Override + public String getDotLabel() { + return super.getDotLabel() + "/" + getMaxSingleSampleMultiplicity(); + } + + /** + * Get the maximum multiplicity for this edge seen in any single sample, as recorded at construction or by the last flushSingleSampleMultiplicity() call + * @return an integer >= 0 + */ + public int getMaxSingleSampleMultiplicity() { + return maxSingleSampleMultiplicity; + } + + /** Get the multiplicity accumulated since construction or the last flush; only provided for testing purposes */ + protected int getCurrentSingleSampleMultiplicity() { + return currentSingleSampleMultiplicity; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index 47676a498..a07b98bb6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -52,8 +52,8 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.utils.smithwaterman.Parameters; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.smithwaterman.*; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import java.util.*; @@ -68,40 +68,39 @@ import java.util.*; * Time: 2:34 PM * */ -public class Path { - private final static int MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW = 20; 
+public class Path { + private final static String SW_PAD = "NNNNNNNNNN"; + private final static Logger logger = Logger.getLogger(Path.class); // the last vertex seen in the path private final T lastVertex; // the list of edges comprising the path - private Set edgesAsSet = null; - private final LinkedList edgesInOrder; + private Set edgesAsSet = null; + private final LinkedList edgesInOrder; // the scores for the path private final int totalScore; // the graph from which this path originated - private final BaseGraph graph; + private final BaseGraph graph; // used in the bubble state machine to apply Smith-Waterman to the bubble sequence // these values were chosen via optimization against the NA12878 knowledge base public static final Parameters NEW_SW_PARAMETERS = new Parameters(20.0, -15.0, -26.0, -1.1); - private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes(); - /** * Create a new Path containing no edges and starting at initialVertex * @param initialVertex the starting vertex of the path * @param graph the graph this path with follow through */ - public Path(final T initialVertex, final BaseGraph graph) { + public Path(final T initialVertex, final BaseGraph graph) { if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null"); if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); if ( ! 
graph.containsVertex(initialVertex) ) throw new IllegalArgumentException("Vertex " + initialVertex + " must be part of graph " + graph); lastVertex = initialVertex; - edgesInOrder = new LinkedList(); + edgesInOrder = new LinkedList(); totalScore = 0; this.graph = graph; } @@ -109,10 +108,10 @@ public class Path { /** * Convenience constructor for testing that creates a path through vertices in graph */ - protected static Path makePath(final List vertices, final BaseGraph graph) { - Path path = new Path(vertices.get(0), graph); + protected static Path makePath(final List vertices, final BaseGraph graph) { + Path path = new Path(vertices.get(0), graph); for ( int i = 1; i < vertices.size(); i++ ) - path = new Path(path, graph.getEdge(path.lastVertex, vertices.get(i))); + path = new Path(path, graph.getEdge(path.lastVertex, vertices.get(i))); return path; } @@ -122,7 +121,7 @@ public class Path { * @param p the path to extend * @param edge the edge to extend path by */ - public Path(final Path p, final BaseEdge edge) { + public Path(final Path p, final E edge) { if ( p == null ) throw new IllegalArgumentException("Path cannot be null"); if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null"); if ( ! 
p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't"); @@ -130,7 +129,7 @@ public class Path { graph = p.graph; lastVertex = p.graph.getEdgeTarget(edge); - edgesInOrder = new LinkedList(p.getEdges()); + edgesInOrder = new LinkedList(p.getEdges()); edgesInOrder.add(edge); totalScore = p.totalScore + edge.getMultiplicity(); } @@ -139,7 +138,7 @@ public class Path { * Get the collection of edges leaving the last vertex of this path * @return a non-null collection */ - public Collection getOutgoingEdgesOfLastVertex() { + public Collection getOutgoingEdgesOfLastVertex() { return getGraph().outgoingEdgesOf(getLastVertex()); } @@ -148,12 +147,12 @@ public class Path { * @param edge the given edge to test * @return true if the edge is found in this path */ - public boolean containsEdge( final BaseEdge edge ) { + public boolean containsEdge( final E edge ) { if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } if ( edgesInOrder.isEmpty() ) return false; // initialize contains cache if necessary - if ( edgesAsSet == null ) edgesAsSet = new HashSet(edgesInOrder); + if ( edgesAsSet == null ) edgesAsSet = new HashSet(edgesInOrder); return edgesAsSet.contains(edge); } @@ -175,7 +174,7 @@ public class Path { * @param path the other path we might be the same as * @return true if this and path are the same */ - protected boolean pathsAreTheSame(Path path) { + protected boolean pathsAreTheSame(Path path) { return totalScore == path.totalScore && edgesInOrder.equals(path.edgesInOrder); } @@ -199,7 +198,7 @@ public class Path { * @return a non-null graph */ @Ensures("result != null") - public BaseGraph getGraph() { + public BaseGraph getGraph() { return graph; } @@ -208,7 +207,7 @@ public class Path { * @return a non-null list of edges */ @Ensures("result != null") - public List getEdges() { return edgesInOrder; } + public List getEdges() { return edgesInOrder; } /** * Get the list 
of vertices in this path in order defined by the edges of the path @@ -221,7 +220,7 @@ public class Path { else { final LinkedList vertices = new LinkedList(); boolean first = true; - for ( final BaseEdge e : getEdges() ) { + for ( final E e : getEdges() ) { if ( first ) { vertices.add(graph.getEdgeSource(e)); first = false; @@ -246,6 +245,14 @@ public class Path { @Ensures("result != null") public T getLastVertex() { return lastVertex; } + /** + * Get the first vertex in this path without modifying the path; a path with no edges starts (and ends) at its last vertex + * @return a non-null vertex + */ + public T getFirstVertex() { + return edgesInOrder.isEmpty() ? lastVertex : getGraph().getEdgeSource(edgesInOrder.getFirst()); + } + /** * The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes * @return non-null sequence of bases corresponding to this path @@ -255,174 +262,114 @@ public class Path { if( getEdges().isEmpty() ) { return graph.getAdditionalSequence(lastVertex); } byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.getFirst())); - for( final BaseEdge e : edgesInOrder ) { + for( final E e : edgesInOrder ) { bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e))); } return bases; } /** - * Calculate the cigar string for this path using a bubble traversal of the assembly graph and running a Smith-Waterman alignment on each bubble - * @return non-null Cigar string with reference length equal to the refHaplotype's reference length + * Calculate the cigar elements for this path against the reference sequence + * + * @param refSeq the reference sequence that all of the bases in this path should align to + * @return a Cigar mapping this path to refSeq, or null if no reasonable alignment could be found */ - @Ensures("result != null") - public Cigar calculateCigar() { - final Cigar cigar = new Cigar(); - // special case for paths that start on reference but not at the reference source node - if( edgesInOrder.getFirst().isRef() && 
!graph.isRefSource(edgesInOrder.getFirst()) ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(null, null, graph.getEdgeSource(edgesInOrder.getFirst())).getCigarElements() ) { - cigar.add(ce); - } + public Cigar calculateCigar(final byte[] refSeq) { + if ( getBases().length == 0 ) { + // horrible edge case from the unit tests, where this path has no bases + return new Cigar(Arrays.asList(new CigarElement(refSeq.length, CigarOperator.D))); } - // reset the bubble state machine - final BubbleStateMachine bsm = new BubbleStateMachine(cigar); + final byte[] bases = getBases(); + final Cigar nonStandard; - for( final BaseEdge e : getEdges() ) { - if ( e.hasSameSourceAndTarget(graph, edgesInOrder.getFirst()) ) { - advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null ); - } - advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e ); + final String paddedRef = SW_PAD + new String(refSeq) + SW_PAD; + final String paddedPath = SW_PAD + new String(bases) + SW_PAD; + final SmithWaterman alignment = new SWPairwiseAlignment( paddedRef.getBytes(), paddedPath.getBytes(), NEW_SW_PARAMETERS ); + + if ( isSWFailure(alignment) ) + return null; + + // cut off the padding bases + final int baseStart = SW_PAD.length(); + final int baseEnd = paddedPath.length() - SW_PAD.length() - 1; // -1 because it's inclusive + nonStandard = AlignmentUtils.trimCigarByBases(alignment.getCigar(), baseStart, baseEnd); + + if ( nonStandard.getReferenceLength() != refSeq.length ) { + nonStandard.add(new CigarElement(refSeq.length - nonStandard.getReferenceLength(), CigarOperator.D)); } - // special case for paths that don't end on reference - if( bsm.inBubble ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) { - bsm.cigar.add(ce); - } - } else if( edgesInOrder.getLast().isRef() && !graph.isRefSink(edgesInOrder.getLast()) ) { // special case for paths that end of the reference but haven't 
completed the entire reference circuit - for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, graph.getEdgeTarget(edgesInOrder.getLast()), null).getCigarElements() ) { - bsm.cigar.add(ce); - } - } - - return AlignmentUtils.consolidateCigar(bsm.cigar); + // finally, return the cigar with all indels left aligned + return leftAlignCigarSequentially(nonStandard, refSeq, getBases(), 0, 0); } /** - * Advance the bubble state machine by incorporating the next node in the path. - * @param bsm the current bubble state machine - * @param node the node to be incorporated - * @param e the edge which generated this node in the path + * Make sure that the SW didn't fail in some terrible way, and throw exception if it did */ - @Requires({"bsm != null", "graph != null", "node != null"}) - private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final T node, final BaseEdge e ) { - if( graph.isReferenceNode( node ) ) { - if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else - if( e !=null && !e.isRef() ) { - if( graph.referencePathExists( graph.getEdgeSource(e), node) ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(null, graph.getEdgeSource(e), node).getCigarElements() ) { - bsm.cigar.add(ce); - } - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); - } else if ( graph.getEdgeSource(e).equals(graph.getEdgeTarget(e)) ) { // alt edge at ref node points to itself - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.I) ); - } else { - bsm.inBubble = true; - bsm.bubbleBytes = null; - bsm.lastSeenReferenceNode = graph.getEdgeSource(e); - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } - } else { - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); - } - } else if( bsm.lastSeenReferenceNode != null && 
!graph.referencePathExists( bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } else { // close the bubble and use a local SW to determine the Cigar string - for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) { - bsm.cigar.add(ce); - } - bsm.inBubble = false; - bsm.bubbleBytes = null; - bsm.lastSeenReferenceNode = null; - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); - } - } else { // non-ref vertex - if( bsm.inBubble ) { // just keep accumulating until we get back to the reference path - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } else { // open up a bubble - bsm.inBubble = true; - bsm.bubbleBytes = null; - bsm.lastSeenReferenceNode = (e != null ? graph.getEdgeSource(e) : null ); - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } + private boolean isSWFailure(final SmithWaterman alignment) { + // check that the alignment starts at the first base, which it should given the padding + if ( alignment.getAlignmentStart2wrt1() > 0 ) { + return true; +// throw new IllegalStateException("SW failure ref " + paddedRef + " vs. " + paddedPath + " should always start at 0, but got " + alignment.getAlignmentStart2wrt1() + " with cigar " + alignment.getCigar()); } + + // check that we aren't getting any S operators (which would be very bad downstream) + for ( final CigarElement ce : alignment.getCigar().getCigarElements() ) { + if ( ce.getOperator() == CigarOperator.S ) + return true; + // soft clips at the end of the alignment are really insertions +// throw new IllegalStateException("SW failure ref " + paddedRef + " vs. 
" + paddedPath + " should never contain S operators but got cigar " + alignment.getCigar()); + } + + return false; } /** - * Now that we have a completed bubble run a Smith-Waterman alignment to determine the cigar string for this bubble - * @param bubbleBytes the bytes that comprise the alternate allele path in this bubble - * @param fromVertex the vertex that marks the beginning of the reference path in this bubble (null indicates ref source vertex) - * @param toVertex the vertex that marks the end of the reference path in this bubble (null indicates ref sink vertex) - * @return the cigar string generated by running a SW alignment between the reference and alternate paths in this bubble + * Left align the given cigar sequentially. This is needed because AlignmentUtils doesn't accept cigars with more than one indel in them. + * This is a target of future work to incorporate and generalize into AlignmentUtils for use by others. + * @param cigar the cigar to left align + * @param refSeq the reference byte array + * @param readSeq the read byte array + * @param refIndex 0-based alignment start position on ref + * @param readIndex 0-based alignment start position on read + * @return the left-aligned cigar */ - @Requires({"graph != null"}) - @Ensures({"result != null"}) - private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final T fromVertex, final T toVertex ) { - final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? 
graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null); - - final Cigar returnCigar = new Cigar(); - - // add padding to anchor ref/alt bases in the SW matrix - byte[] padding = STARTING_SW_ANCHOR_BYTES; - boolean goodAlignment = false; - SWPairwiseAlignment swConsensus = null; - while( !goodAlignment && padding.length < 1000 ) { - padding = ArrayUtils.addAll(padding, padding); // double the size of the padding each time - final byte[] reference = ArrayUtils.addAll( ArrayUtils.addAll(padding, refBytes), padding ); - final byte[] alternate = ArrayUtils.addAll( ArrayUtils.addAll(padding, bubbleBytes), padding ); - swConsensus = new SWPairwiseAlignment( reference, alternate, NEW_SW_PARAMETERS ); - if( swConsensus.getAlignmentStart2wrt1() == 0 && !swConsensus.getCigar().toString().contains("S") && swConsensus.getCigar().getReferenceLength() == reference.length ) { - goodAlignment = true; + @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"}) + protected static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { + final Cigar cigarToReturn = new Cigar(); + Cigar cigarToAlign = new Cigar(); + for (int i = 0; i < cigar.numCigarElements(); i++) { + final CigarElement ce = cigar.getCigarElement(i); + if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) { + cigarToAlign.add(ce); + final Cigar leftAligned = AlignmentUtils.leftAlignSingleIndel(cigarToAlign, refSeq, readSeq, refIndex, readIndex, false); + for ( final CigarElement toAdd : leftAligned.getCigarElements() ) { cigarToReturn.add(toAdd); } + refIndex += cigarToAlign.getReferenceLength(); + readIndex += cigarToAlign.getReadLength(); + cigarToAlign = new Cigar(); + } else { + cigarToAlign.add(ce); } } - if( !goodAlignment ) { - returnCigar.add(new CigarElement(1, CigarOperator.N)); - return returnCigar; - } - - final Cigar swCigar = 
swConsensus.getCigar(); - if( swCigar.numCigarElements() > MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW ) { // this bubble is too divergent from the reference - returnCigar.add(new CigarElement(1, CigarOperator.N)); - } else { - for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) { - // now we need to remove the padding from the cigar string - int length = swCigar.getCigarElement(iii).getLength(); - if( iii == 0 ) { length -= padding.length; } - if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; } - if( length > 0 ) { - returnCigar.add(new CigarElement(length, swCigar.getCigarElement(iii).getOperator())); - } - } - if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) { - throw new IllegalStateException("SmithWaterman cigar failure: " + (refBytes == null ? "-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar()); + if( !cigarToAlign.isEmpty() ) { + for( final CigarElement toAdd : cigarToAlign.getCigarElements() ) { + cigarToReturn.add(toAdd); } } - return returnCigar; + final Cigar result = AlignmentUtils.consolidateCigar(cigarToReturn); + if( result.getReferenceLength() != cigar.getReferenceLength() ) + throw new IllegalStateException("leftAlignCigarSequentially failed to produce a valid CIGAR. Reference lengths differ. 
Initial cigar " + cigar + " left aligned into " + result); + return result; } - // class to keep track of the bubble state machine - private static class BubbleStateMachine { - public boolean inBubble = false; - public byte[] bubbleBytes = null; - public T lastSeenReferenceNode = null; - public Cigar cigar = null; - - public BubbleStateMachine( final Cigar initialCigar ) { - inBubble = false; - bubbleBytes = null; - lastSeenReferenceNode = null; - cigar = initialCigar; - } - } /** * Tests that this and other have the same score and vertices in the same order with the same seq * @param other the other path to consider. Cannot be null * @return true if this and path are equal, false otherwise */ - public boolean equalScoreAndSequence(final Path other) { + public boolean equalScoreAndSequence(final Path other) { if ( other == null ) throw new IllegalArgumentException("other cannot be null"); return getScore() == other.getScore() && equalSequence(other); } @@ -432,7 +379,7 @@ public class Path { * @param other the other path to consider. 
Cannot be null * @return true if this and path are equal, false otherwise */ - public boolean equalSequence(final Path other) { + public boolean equalSequence(final Path other) { final List mine = getVertices(); final List yours = other.getVertices(); if ( mine.size() == yours.size() ) { // hehehe diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index bb4b26257..20edcb39b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.jgrapht.EdgeFactory; import java.io.File; import java.util.HashSet; @@ -61,7 +62,17 @@ import java.util.Set; * @author: depristo * @since 03/2013 */ -public final class SeqGraph extends BaseGraph { +public final class SeqGraph extends BaseGraph { + /** + * Edge factory that creates non-reference multiplicity 1 edges + */ + private static class MyEdgeFactory implements EdgeFactory { + @Override + public BaseEdge createEdge(SeqVertex sourceVertex, SeqVertex targetVertex) { + return new BaseEdge(false, 1); + } + } + private final static boolean PRINT_SIMPLIFY_GRAPHS = false; /** @@ -82,7 +93,7 @@ public final class SeqGraph extends BaseGraph { * Construct an empty SeqGraph */ public SeqGraph() { - super(); + this(11); } /** @@ -94,7 +105,7 @@ public final class SeqGraph extends BaseGraph { * @param kmer kmer */ public SeqGraph(final int kmer) { - super(kmer); + super(kmer, new MyEdgeFactory()); } /** @@ -154,7 +165,6 @@ public final class SeqGraph extends BaseGraph { didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); if ( PRINT_SIMPLIFY_GRAPHS ) 
printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0); - didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); didSomeWork |= zipLinearChains(); return didSomeWork; } @@ -289,8 +299,8 @@ public final class SeqGraph extends BaseGraph { final BaseEdge inc = new BaseEdge(false, sharedWeightAmongEdges); // template to make .add function call easy // update the incoming and outgoing edges to point to the new vertex - for( final BaseEdge edge : outEdges ) { addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge).add(inc)); } - for( final BaseEdge edge : inEdges ) { addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge).add(inc)); } + for( final BaseEdge edge : outEdges ) { addEdge(addedVertex, getEdgeTarget(edge), edge.copy().add(inc)); } + for( final BaseEdge edge : inEdges ) { addEdge(getEdgeSource(edge), addedVertex, edge.copy().add(inc)); } removeAllVertices(linearChain); return true; @@ -505,40 +515,4 @@ public final class SeqGraph extends BaseGraph { } } } - - /** - * Merge headless configurations: - * - * Performs the transformation: - * - * { x + S_i + y -> Z } - * - * goes to: - * - * { x -> S_i -> y -> Z } - * - * for all nodes that match this configuration. - * - * Differs from the diamond transform in that no top node is required - */ - protected class MergeHeadlessIncomingSources extends VertexBasedTransformer { - @Override - boolean tryToTransform(final SeqVertex bottom) { - final Set incoming = incomingVerticesOf(bottom); - if ( incoming.size() <= 1 ) - return false; - - for ( final SeqVertex inc : incoming ) - if ( ! 
isSource(inc) || outDegreeOf(inc) > 1 ) - return false; - - if ( dontModifyGraphEvenIfPossible() ) return true; - - final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, incoming); - if (splitter.meetsMinMergableSequenceForPrefix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES)) - return splitter.splitAndUpdate(null, bottom); - else - return false; - } - } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java index 1c53f2332..0babd8d56 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java @@ -88,13 +88,13 @@ public class SharedSequenceMerger { for ( final SeqVertex prev : prevs ) { for ( final BaseEdge prevIn : graph.incomingEdgesOf(prev) ) { - graph.addEdge(graph.getEdgeSource(prevIn), newV, new BaseEdge(prevIn)); + graph.addEdge(graph.getEdgeSource(prevIn), newV, prevIn.copy()); edgesToRemove.add(prevIn); } } for ( final BaseEdge e : graph.outgoingEdgesOf(v) ) { - graph.addEdge(newV, graph.getEdgeTarget(e), new BaseEdge(e)); + graph.addEdge(newV, graph.getEdgeTarget(e), e.copy()); } graph.removeAllVertices(prevs); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java index f6ee4c3c3..205d0027a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java @@ -209,7 +209,7 @@ public class 
SharedVertexSequenceSplitter { splitGraph.addEdge(remaining, suffixV, fromMid); } else { // prefix + suffix completely explain this node - splitGraph.addOrUpdateEdge(prefixV, suffixV, new BaseEdge(toMid).add(fromMid)); + splitGraph.addOrUpdateEdge(prefixV, suffixV, toMid.copy().add(fromMid)); } } } @@ -323,7 +323,7 @@ public class SharedVertexSequenceSplitter { } else { // schedule edge for removal, and return a freshly allocated one for our graph to use edgesToRemove.add(e); - return new BaseEdge(e); + return e.copy(); } } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java new file mode 100644 index 000000000..814b3b9a7 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java @@ -0,0 +1,118 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnVertex; +import org.broadinstitute.sting.utils.Utils; + +import java.util.LinkedList; +import java.util.List; + +/** + * A DeBruijnVertex that supports multiple copies of the same kmer + * + * This is implemented through the same mechanism as SeqVertex, where each + * created MultiDeBruijnVertex has a unique id assigned upon creation. Two + * MultiDeBruijnVertex are equal iff they have the same ID + * + * User: depristo + * Date: 4/17/13 + * Time: 3:20 PM + */ +final class MultiDeBruijnVertex extends DeBruijnVertex { + private final static boolean KEEP_TRACK_OF_READS = false; + private static int idCounter = 0; + + private final List reads = new LinkedList(); + private int id = idCounter++; // TODO -- potential race condition problem here + + /** + * Create a new MultiDeBruijnVertex with kmer sequence + * @param sequence the kmer sequence + */ + MultiDeBruijnVertex(byte[] sequence) { + super(sequence); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + MultiDeBruijnVertex that = (MultiDeBruijnVertex) o; + + return id == that.id; + } + + @Override + public String toString() { + return "MultiDeBruijnVertex_id_" + id + "_seq_" + getSequenceString(); + } + + /** + * Add name information to this vertex for debugging + * + * This information will be captured as a list of strings, and displayed in DOT if this + * graph is written out to disk + * + * This functionality is only enabled when KEEP_TRACK_OF_READS is true + * + * @param name a non-null string + */ + protected void addRead(final String name) { + if ( name == null ) throw new IllegalArgumentException("name cannot be null"); + if ( KEEP_TRACK_OF_READS ) reads.add(name); + } + + @Override + public int hashCode() { return id; } + + @Override + public String 
additionalInfo() { + return KEEP_TRACK_OF_READS ? (! reads.contains("ref") ? "__" + Utils.join(",", reads) : "") : ""; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java new file mode 100644 index 000000000..db0ce0880 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -0,0 +1,162 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + +public class ReadThreadingAssembler extends LocalAssemblyEngine { + private final static Logger logger = Logger.getLogger(ReadThreadingAssembler.class); + + private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128; + + /** The min and max kmer sizes to try when building the graph. */ + private final List kmerSizes; + private final int maxAllowedPathsForReadThreadingAssembler; + + private boolean requireReasonableNumberOfPaths = false; + protected boolean removePathsNotConnectedToRef = true; + private boolean justReturnRawGraph = false; + + /** for testing only */ + public ReadThreadingAssembler() { + this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25)); + } + + public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List kmerSizes) { + super(maxAllowedPathsForReadThreadingAssembler); + this.kmerSizes = kmerSizes; + this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler; + } + + /** for testing purposes */ + protected void setJustReturnRawGraph(boolean justReturnRawGraph) { + this.justReturnRawGraph = justReturnRawGraph; + } + + @Override + public List assemble( final List reads, final Haplotype refHaplotype) { + final List graphs = new LinkedList<>(); + + for ( final int kmerSize : kmerSizes ) { + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly); + + // add the reference sequence to the graph + 
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true); + + // Next pull kmers out of every read and throw them on the graph + for( final GATKSAMRecord read : reads ) { + rtgraph.addRead(read); + } + + // actually build the read threading graph + rtgraph.buildGraphIfNecessary(); + if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.0.raw_readthreading_graph.dot"), pruneFactor); + + // go through and prune all of the chains where all edges have <= pruneFactor. This must occur + // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering + // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1 + rtgraph.pruneLowWeightChains(pruneFactor); + + // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if + // we can recover them by merging some N bases from the chain back into the reference uniquely, for + // N < kmerSize + if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(); + + // remove all heading and trailing paths + if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); + + if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"), pruneFactor); + + final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); + + // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph + if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph); + + if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); + if ( debugGraphTransformations ) initialSeqGraph.printGraph(new File("sequenceGraph.0.2.initial_seqgraph.dot"), pruneFactor); + initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction + + final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph); + if ( seqGraph != null ) { + if ( ! 
requireReasonableNumberOfPaths || reasonableNumberOfPaths(seqGraph) ) { + graphs.add(seqGraph); + } + } + } + + return graphs; + } + + /** + * Did we find a reasonable number of paths in this graph? + * @param graph + * @return + */ + private boolean reasonableNumberOfPaths(final SeqGraph graph) { + final KBestPaths pathFinder = new KBestPaths(false); + final List> allPaths = pathFinder.getKBestPaths(graph, 100000); + logger.info("Found " + allPaths.size() + " paths through " + graph + " with maximum " + maxAllowedPathsForReadThreadingAssembler); + return allPaths.size() <= maxAllowedPathsForReadThreadingAssembler; + } + + @Override + public String toString() { + return "ReadThreadingAssembler{" + + "kmerSizes=" + kmerSizes + + '}'; + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java new file mode 100644 index 000000000..6e9223afb --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -0,0 +1,640 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.collections.PrimitivePair; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.jgrapht.EdgeFactory; + +import java.io.File; +import java.util.*; + +public class ReadThreadingGraph extends BaseGraph { + /** + * Edge factory that creates non-reference multiplicity 1 edges + */ + private static class MyEdgeFactory implements EdgeFactory { + @Override + public MultiSampleEdge createEdge(MultiDeBruijnVertex sourceVertex, MultiDeBruijnVertex targetVertex) { + return new MultiSampleEdge(false, 1); + } + } + + private final static Logger logger = Logger.getLogger(ReadThreadingGraph.class); + + private final static String ANONYMOUS_SAMPLE = "XXX_UNNAMED_XXX"; + private final static boolean WRITE_GRAPH = false; + private final static boolean DEBUG_NON_UNIQUE_CALC = false; + + /** for debugging info printing */ + private static int counter = 0; + + /** 
we require at least this many bases to be uniquely matching to merge a dangling tail */ + private final static int MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL = 5; + + /** + * Sequences added for read threading before we've actually built the graph + */ + private final Map> pending = new LinkedHashMap>(); + + /** + * A set of non-unique kmers that cannot be used as merge points in the graph + */ + private Set nonUniqueKmers; + + /** + * A map from kmers -> their corresponding vertex in the graph + */ + private Map uniqueKmers = new LinkedHashMap(); + + /** + * + */ + final int kmerSize; + final boolean debugGraphTransformations; + final byte minBaseQualityToUseInAssembly; + + protected boolean increaseCountsBackwards = true; + protected boolean increaseCountsThroughBranches = false; // this may increase the branches without bounds + + // -------------------------------------------------------------------------------- + // state variables, initialized in resetToInitialState() + // -------------------------------------------------------------------------------- + private Kmer refSource; + private boolean alreadyBuilt; + byte[] refSeq; + MultiDeBruijnVertex[] refKmers; + + public ReadThreadingGraph() { + this(25, false, (byte)6); + } + + public ReadThreadingGraph(final int kmerSize) { + this(kmerSize, false, (byte)6); + } + + /** + * Create a new ReadThreadingAssembler using kmerSize for matching + * @param kmerSize must be >= 1 + */ + protected ReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly) { + super(kmerSize, new MyEdgeFactory()); + + if ( kmerSize < 1 ) throw new IllegalArgumentException("bad minkKmerSize " + kmerSize); + this.kmerSize = kmerSize; + this.debugGraphTransformations = debugGraphTransformations; + this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; + + resetToInitialState(); + } + + /** + * Reset this assembler to its initial state, so we can create another 
assembly with a different set of reads + */ + private void resetToInitialState() { + pending.clear(); + nonUniqueKmers = null; + uniqueKmers.clear(); + refSource = null; + alreadyBuilt = false; + refSeq = null; + refKmers = null; + } + + /** + * Add the all bases in sequence to the graph + * @param sequence a non-null sequence + * @param isRef is this the reference sequence? + */ + protected void addSequence(final byte[] sequence, final boolean isRef) { + addSequence("anonymous", sequence, null, isRef); + } + + /** + * Add all bases in sequence to this graph + * + * @see #addSequence(String, String, byte[], int, int, int[], boolean) for full information + */ + public void addSequence(final String seqName, final byte[] sequence, final int[] counts, final boolean isRef) { + addSequence(seqName, ANONYMOUS_SAMPLE, sequence, 0, sequence.length, counts, isRef); + } + + /** + * Add bases in sequence to this graph + * + * @param seqName a useful seqName for this read, for debugging purposes + * @param sequence non-null sequence of bases + * @param counts a vector of counts for each bases, indicating how many times that base was observed in the sequence. + * This allows us to support reduced reads in the ReadThreadingAssembler. Can be null, meaning that + * each base is only observed once. If not null, must have length == sequence.length. + * @param start the first base offset in sequence that we should use for constructing the graph using this sequence, inclusive + * @param stop the last base offset in sequence that we should use for constructing the graph using this sequence, exclusive + * @param isRef is this the reference sequence. 
+ */ + public void addSequence(final String seqName, final String sampleName, final byte[] sequence, final int start, final int stop, final int[] counts, final boolean isRef) { + // note that argument testing is taken care of in SequenceForKmers + if ( alreadyBuilt ) throw new IllegalStateException("Graph already built"); + + // get the list of sequences for this sample + List sampleSequences = pending.get(sampleName); + if ( sampleSequences == null ) { // need to create + sampleSequences = new LinkedList<>(); + pending.put(sampleName, sampleSequences); + } + + // add the new sequence to the list of sequences for sample + sampleSequences.add(new SequenceForKmers(seqName, sequence, start, stop, counts, isRef)); + } + + /** + * Return a count appropriate for a kmer starting at kmerStart in sequence for kmers + * + * @param seqForKmers a non-null sequence for kmers object + * @param kmerStart the position where the kmer starts in sequence + * @return a count for a kmer from start -> start + kmerSize in seqForKmers + */ + private int getCountGivenKmerStart(final SequenceForKmers seqForKmers, final int kmerStart) { + return seqForKmers.getCount(kmerStart + kmerSize - 1); + } + + /** + * Thread sequence seqForKmers through the current graph, updating the graph as appropriate + * @param seqForKmers a non-null sequence + */ + private void threadSequence(final SequenceForKmers seqForKmers) { + final Pair startingInfo = findStart(seqForKmers); + if ( startingInfo == null ) + return; + + final MultiDeBruijnVertex startingVertex = startingInfo.getFirst(); + final int uniqueStartPos = startingInfo.getSecond(); + + // increase the counts of all edges incoming into the starting vertex supported by going back in sequence + if ( increaseCountsBackwards ) + increaseCountsInMatchedKmers(seqForKmers, startingVertex, startingVertex.getSequence(), kmerSize - 2); + + if ( debugGraphTransformations ) startingVertex.addRead(seqForKmers.name); + + // keep track of information about the 
reference kmers for merging dangling tails + if ( seqForKmers.isRef ) { + if ( refSource != null ) throw new IllegalStateException("Found two refSources! prev " + refSource + " new is " + startingVertex); + refSource = new Kmer(seqForKmers.sequence, seqForKmers.start, kmerSize); + refSeq = seqForKmers.sequence; + refKmers = new MultiDeBruijnVertex[refSeq.length]; + for ( int i = 0; i < kmerSize; i++ ) refKmers[i] = null; + } + + // loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate + MultiDeBruijnVertex vertex = startingVertex; + for ( int i = uniqueStartPos + 1; i <= seqForKmers.stop - kmerSize; i++ ) { + final int count = getCountGivenKmerStart(seqForKmers, i); + + vertex = extendChainByOne(vertex, seqForKmers.sequence, i, count, seqForKmers.isRef); + if ( debugGraphTransformations ) vertex.addRead(seqForKmers.name); + + // keep track of the reference kmers for merging dangling tails + if ( seqForKmers.isRef ) refKmers[i + kmerSize - 1] = vertex; + } + } + + /** + * Attempt to attach vertex with out-degree == 0 to the graph by finding a unique matching kmer to the reference + * @param vertex the vertex to recover + */ + protected int recoverDanglingChain(final MultiDeBruijnVertex vertex) { + if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0"); + + final byte[] kmer = vertex.getSequence(); + if ( ! nonUniqueKmers.contains(new Kmer(kmer)) ) { + // don't attempt to fix non-unique kmers! 
+ final MultiDeBruijnVertex uniqueMergePoint = danglingTailMergePoint(kmer); + if ( uniqueMergePoint != null ) { + addEdge(vertex, uniqueMergePoint, new MultiSampleEdge(false, 1)); + return 1; + } + } + + return 0; + } + + /** + * Find a unique merge point for kmer in the reference sequence + * @param kmer the full kmer of the dangling tail + * @return a vertex appropriate to merge kmer into, or null if none could be found + */ + private MultiDeBruijnVertex danglingTailMergePoint(final byte[] kmer) { + final PrimitivePair.Int endAndLength = GraphUtils.findLongestUniqueSuffixMatch(refSeq, kmer); + if ( endAndLength != null && endAndLength.second >= MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL && endAndLength.first + 1 < refKmers.length) { + final int len = endAndLength.second; + final MultiDeBruijnVertex mergePoint = refKmers[endAndLength.first + 1]; +// logger.info("recoverDanglingChain of kmer " + new String(kmer) + " merged to " + mergePoint + " with match size " + len); + final Set nonUniquesAtLength = determineKmerSizeAndNonUniques(len, len).nonUniques; + final Kmer matchedKmer = new Kmer(kmer, kmer.length - len, len); + if ( nonUniquesAtLength.contains(matchedKmer) ) { +// logger.info("Rejecting merge " + new String(kmer) + " because match kmer " + matchedKmer + " isn't unique across all reads"); + return null; + } else { + return mergePoint; + } + } + + return null; + } + + /** + * Build the read threaded assembly graph if it hasn't already been constructed from the sequences that have + * been added to the graph. 
+ */ + public void buildGraphIfNecessary() { + if ( alreadyBuilt ) return; + + // determine the kmer size we'll uses, and capture the set of nonUniques for that kmer size + final NonUniqueResult result = determineKmerSizeAndNonUniques(kmerSize, kmerSize); + nonUniqueKmers = result.nonUniques; + + if ( DEBUG_NON_UNIQUE_CALC ) { + logger.info("using " + kmerSize + " kmer size for this assembly with the following non-uniques"); + } + + // go through the pending sequences, and add them to the graph + for ( final List sequencesForSample : pending.values() ) { + for ( final SequenceForKmers sequenceForKmers : sequencesForSample ) { + threadSequence(sequenceForKmers); + if ( WRITE_GRAPH ) printGraph(new File("threading." + counter++ + "." + sequenceForKmers.name.replace(" ", "_") + ".dot"), 0); + } + + // flush the single sample edge values from the graph + for ( final MultiSampleEdge e : edgeSet() ) e.flushSingleSampleMultiplicity(); + } + + // clear + pending.clear(); + alreadyBuilt = true; + } + + public void recoverDanglingTails() { + if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built"); + + int attempted = 0; + int nRecovered = 0; + for ( final MultiDeBruijnVertex v : vertexSet() ) { + if ( outDegreeOf(v) == 0 && ! isRefNodeAndRefSink(v) ) { + attempted++; + nRecovered += recoverDanglingChain(v); + } + } + //logger.info("Recovered " + nRecovered + " of " + attempted + " dangling tails"); + } + + /** structure that keeps track of the non-unique kmers for a given kmer size */ + private static class NonUniqueResult { + final Set nonUniques; + final int kmerSize; + + private NonUniqueResult(Set nonUniques, int kmerSize) { + this.nonUniques = nonUniques; + this.kmerSize = kmerSize; + } + } + + /** + * Compute the smallest kmer size >= minKmerSize and <= maxKmerSize that has no non-unique kmers + * among all sequences added to the current graph. 
Will always return a result for maxKmerSize if + * all smaller kmers had non-unique kmers. + * + * @param minKmerSize the minimum kmer size to consider when constructing the graph + * @param maxKmerSize the maximum kmer size to consider + * @return a non-null NonUniqueResult + */ + protected NonUniqueResult determineKmerSizeAndNonUniques(final int minKmerSize, final int maxKmerSize) { + final Collection withNonUniques = getAllPendingSequences(); + final Set nonUniqueKmers = new HashSet(); + + // go through the sequences and determine which kmers aren't unique within each read + int kmerSize = minKmerSize; + for ( ; kmerSize <= maxKmerSize; kmerSize++) { + // clear out set of non-unique kmers + nonUniqueKmers.clear(); + + // loop over all sequences that have non-unique kmers in them from the previous iterator + final Iterator it = withNonUniques.iterator(); + while ( it.hasNext() ) { + final SequenceForKmers sequenceForKmers = it.next(); + + // determine the non-unique kmers for this sequence + final Collection nonUniquesFromSeq = determineNonUniqueKmers(sequenceForKmers, kmerSize); + if ( nonUniquesFromSeq.isEmpty() ) { + // remove this sequence from future consideration + it.remove(); + } else { + // keep track of the non-uniques for this kmerSize, and keep it in the list of sequences that have non-uniques + nonUniqueKmers.addAll(nonUniquesFromSeq); + } + } + + if ( nonUniqueKmers.isEmpty() ) + // this kmerSize produces no non-unique sequences, so go ahead and use it for our assembly + break; + } + + // necessary because the loop breaks with kmerSize = max + 1 + return new NonUniqueResult(nonUniqueKmers, Math.min(kmerSize, maxKmerSize)); + } + + /** + * Get the collection of all sequences for kmers across all samples in no particular order + * @return non-null Collection + */ + private Collection getAllPendingSequences() { + final LinkedList result = new LinkedList(); + for ( final List oneSampleWorth : pending.values() ) result.addAll(oneSampleWorth); + return 
result; + } + + /** + * Get the collection of non-unique kmers from sequence for kmer size kmerSize + * @param seqForKmers a sequence to get kmers from + * @param kmerSize the size of the kmers + * @return a non-null collection of non-unique kmers in sequence + */ + private Collection determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) { + // count up occurrences of kmers within each read + final KMerCounter counter = new KMerCounter(kmerSize); + for ( int i = 0; i <= seqForKmers.stop - kmerSize; i++ ) { + final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize); + counter.addKmer(kmer, 1); + } + + return counter.getKmersWithCountsAtLeast(2); + } + + /** + * Convert this kmer graph to a simple sequence graph. + * + * Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer + * graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence + * + * @return a newly allocated SequenceGraph + */ + // TODO -- should override base class method + public SeqGraph convertToSequenceGraph() { + buildGraphIfNecessary(); + + final SeqGraph seqGraph = new SeqGraph(kmerSize); + final Map vertexMap = new HashMap(); + + // create all of the equivalent seq graph vertices + for ( final MultiDeBruijnVertex dv : vertexSet() ) { + final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv))); + sv.setAdditionalInfo(dv.additionalInfo()); + vertexMap.put(dv, sv); + seqGraph.addVertex(sv); + } + + // walk through the nodes and connect them to their equivalent seq vertices + for( final MultiSampleEdge e : edgeSet() ) { + final SeqVertex seqInV = vertexMap.get(getEdgeSource(e)); + final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e)); + //logger.info("Adding edge " + seqInV + " -> " + seqOutV); + seqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity())); + } + + return seqGraph; + } + + private void increaseCountsInMatchedKmers(final SequenceForKmers 
seqForKmers, + final MultiDeBruijnVertex vertex, + final byte[] originalKmer, + final int offset) { + if ( offset == -1 ) return; + + for ( final MultiSampleEdge edge : incomingEdgesOf(vertex) ) { + final MultiDeBruijnVertex prev = getEdgeSource(edge); + final byte suffix = prev.getSuffix(); + final byte seqBase = originalKmer[offset]; +// logger.warn(String.format("Increasing counts for %s -> %s via %s at %d with suffix %s vs. %s", +// prev, vertex, edge, offset, (char)suffix, (char)seqBase)); + if ( suffix == seqBase && (increaseCountsThroughBranches || inDegreeOf(vertex) == 1) ) { + edge.incMultiplicity(seqForKmers.getCount(offset)); + increaseCountsInMatchedKmers(seqForKmers, prev, originalKmer, offset-1); + } + } + } + + /** + * Find vertex and its position in seqForKmers where we should start assembling seqForKmers + * + * @param seqForKmers the sequence we want to thread into the graph + * @return a pair of the starting vertex and its position in seqForKmer + */ + private Pair findStart(final SequenceForKmers seqForKmers) { + final int uniqueStartPos = seqForKmers.isRef ? 
0 : findUniqueStartPosition(seqForKmers.sequence, seqForKmers.start, seqForKmers.stop); + + if ( uniqueStartPos == -1 ) + return null; + + return getOrCreateKmerVertex(seqForKmers.sequence, uniqueStartPos, true); + } + + /** + * Find a starting point in sequence that begins a unique kmer among all kmers in the graph + * @param sequence the sequence of bases + * @param start the first base to use in sequence + * @param stop the last base to use in sequence + * @return the index into sequence that begins a unique kmer of size kmerSize, or -1 if none could be found + */ + private int findUniqueStartPosition(final byte[] sequence, final int start, final int stop) { + for ( int i = start; i < stop - kmerSize; i++ ) { + final Kmer kmer1 = new Kmer(sequence, i, kmerSize); + if ( uniqueKmers.containsKey(kmer1) ) + return i; + } + return -1; + } + + /** + * Get the vertex for the kmer in sequence starting at start + * @param sequence the sequence + * @param start the position of the kmer start + * @param allowRefSource if true, we will allow matches to the kmer that represents the reference starting kmer + * @return a non-null vertex + */ + private Pair getOrCreateKmerVertex(final byte[] sequence, final int start, final boolean allowRefSource) { + final Kmer kmer = new Kmer(sequence, start, kmerSize); + final MultiDeBruijnVertex vertex = getUniqueKmerVertex(kmer, allowRefSource); + if ( vertex != null ) { + return new Pair<>(vertex, start); + } else { + return new Pair<>(createVertex(kmer), start); + } + } + + /** + * Get the unique vertex for kmer, or null if not possible. + * + * @param allowRefSource if true, we will allow kmer to match the reference source vertex + * @return a vertex for kmer, or null if it's not unique + */ + private MultiDeBruijnVertex getUniqueKmerVertex(final Kmer kmer, final boolean allowRefSource) { + if ( ! allowRefSource && kmer.equals(refSource) ) return null; + return uniqueKmers.get(kmer); + } + + /** + * Create a new vertex for kmer. 
Add it to the uniqueKmers map if appropriate. + * + * kmer must not have a entry in unique kmers, or an error will be thrown + * + * @param kmer the kmer we want to create a vertex for + * @return the non-null created vertex + */ + private MultiDeBruijnVertex createVertex(final Kmer kmer) { + final MultiDeBruijnVertex newVertex = new MultiDeBruijnVertex(kmer.bases()); + final int prevSize = vertexSet().size(); + addVertex(newVertex); + + // make sure we aren't adding duplicates (would be a bug) + if ( vertexSet().size() != prevSize + 1) throw new IllegalStateException("Adding vertex " + newVertex + " to graph didn't increase the graph size"); + + // add the vertex to the unique kmer map, if it is in fact unique + if ( ! nonUniqueKmers.contains(kmer) && ! uniqueKmers.containsKey(kmer) ) // TODO -- not sure this last test is necessary + uniqueKmers.put(kmer, newVertex); + + return newVertex; + } + + /** + * Workhorse routine of the assembler. Given a sequence whose last vertex is anchored in the graph, extend + * the graph one bp according to the bases in sequence. + * + * @param prevVertex a non-null vertex where sequence was last anchored in the graph + * @param sequence the sequence we're threading through the graph + * @param kmerStart the start of the current kmer in graph we'd like to add + * @param count the number of observations of this kmer in graph (can be > 1 for reduced reads) + * @param isRef is this the reference sequence? 
+ * @return a non-null vertex connecting prevVertex to in the graph based on sequence + */ + private MultiDeBruijnVertex extendChainByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final int count, final boolean isRef) { + final Set outgoingEdges = outgoingEdgesOf(prevVertex); + + final int nextPos = kmerStart + kmerSize - 1; + for ( final MultiSampleEdge outgoingEdge : outgoingEdges ) { + final MultiDeBruijnVertex target = getEdgeTarget(outgoingEdge); + if ( target.getSuffix() == sequence[nextPos] ) { + // we've got a match in the chain, so simply increase the count of the edge by 1 and continue + outgoingEdge.incMultiplicity(count); + return target; + } + } + + // none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in + final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize); + MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false); + + if ( isRef && uniqueMergeVertex != null ) + throw new IllegalStateException("Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex); + + // either use our unique merge vertex, or create a new one in the chain + final MultiDeBruijnVertex nextVertex = uniqueMergeVertex == null ? 
createVertex(kmer) : uniqueMergeVertex; + addEdge(prevVertex, nextVertex, new MultiSampleEdge(isRef, count)); + return nextVertex; + } + + /** + * Get the start and stop positions (exclusive) of the longest stretch of high quality bases + * in read + * + * @param read a non-null read + * @return the start and stop for high quality bases in read, or null if none exist + */ + protected void addRead(final GATKSAMRecord read) { + final byte[] sequence = read.getReadBases(); + final byte[] qualities = read.getBaseQualities(); + final int[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced + + int lastGood = -1; // the index of the last good base we've seen + for( int end = 0; end <= sequence.length; end++ ) { + if ( end == sequence.length || qualities[end] < minBaseQualityToUseInAssembly ) { + // the first good base is at lastGood, can be -1 if last base was bad + final int start = lastGood; + // the stop base is end - 1 (if we're not at the end of the sequence) + final int stop = end == sequence.length ? sequence.length : end; + final int len = stop - start + 1; + + if ( start != -1 && len >= kmerSize ) { + // if the sequence is long enough to get some value out of, add it to the graph + final String name = read.getReadName() + "_" + start + "_" + end; + addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, stop, reducedReadCounts, false); + } + + lastGood = -1; // reset the last good base + } else if ( lastGood == -1 ) { + lastGood = end; // we're at a good base, the last good one is us + } + } + } + + /** + * Get the set of non-unique kmers in this graph. 
For debugging purposes + * @return a non-null set of kmers + */ + protected Set getNonUniqueKmers() { + return nonUniqueKmers; + } + + @Override + public String toString() { + return "ReadThreadingAssembler{" + + "kmerSize=" + kmerSize + + '}'; + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java new file mode 100644 index 000000000..a4bc0c1c8 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java @@ -0,0 +1,93 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +/** + * Keeps track of the information needed to add a sequence to the read threading assembly graph + * + * User: depristo + * Date: 4/18/13 + * Time: 8:59 AM + * To change this template use File | Settings | File Templates. 
/**
 * Immutable holder for one sequence that is to be threaded into the read threading assembly graph:
 * its name, its bases, the start/stop bounds of the part to use, optional per-base observation counts,
 * and whether it is the reference sequence.
 */
final class SequenceForKmers {
    final String name;
    final byte[] sequence;
    final int start, stop;
    final private int[] counts;
    final boolean isRef;

    /**
     * Create a new sequence for creating kmers
     *
     * @param name the name of this sequence
     * @param sequence the bases; must not be null
     * @param start the start bound within sequence; must be >= 0
     * @param stop the stop bound within sequence; must be >= start
     * @param counts per-base observation counts, or null when every base counts once; when non-null
     *               it must have the same length as sequence
     * @param ref true if this is the reference sequence
     * @throws IllegalArgumentException if any of the constraints above is violated
     */
    SequenceForKmers(final String name, byte[] sequence, int start, int stop, int[] counts, boolean ref) {
        if ( start < 0 ) throw new IllegalArgumentException("Invalid start " + start);
        if ( stop < start ) throw new IllegalArgumentException("Invalid stop " + stop);
        if ( sequence == null ) throw new IllegalArgumentException("Sequence is null ");
        if ( counts != null && counts.length != sequence.length ) throw new IllegalArgumentException("Sequence and counts don't have the same length " + sequence.length + " vs " + counts.length);

        this.name = name;
        this.sequence = sequence;
        this.start = start;
        this.stop = stop;
        this.isRef = ref;
        this.counts = counts;
    }

    /**
     * Get the number of observations of the kmer starting at i in this sequence
     *
     * Can be > 1 because sequence may be a reduced read and therefore count as N observations
     *
     * @param i the offset into sequence for the start of the kmer; must be a valid index into sequence
     * @return 1 when this sequence has no counts, otherwise the count recorded at offset i
     * @throws ArrayIndexOutOfBoundsException if i is negative or not less than the sequence length
     */
    public int getCount(final int i) {
        // reject i == sequence.length as well: it is not a valid offset, whether or not counts are present
        if ( i < 0 || i >= sequence.length ) throw new ArrayIndexOutOfBoundsException("i must be >= 0 and < " + sequence.length + " but got " + i);
        return counts == null ? 1 : counts[i];
    }
}
(refIndel2 - indelSize2) * 2 : refIndel2 * 2), CigarOperator.M)); - expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); - - Cigar givenCigar = new Cigar(); - givenCigar.add(new CigarElement(refString.length() + refIndel1/2, CigarOperator.M)); - givenCigar.add(new CigarElement(indelSize1, (indelOp1 > 0 ? CigarOperator.I : CigarOperator.D))); - givenCigar.add(new CigarElement((indelOp1 < 0 ? (refIndel1/2 - indelSize1) : refIndel1/2) + refString.length() + refIndel2/2 * 2, CigarOperator.M)); - givenCigar.add(new CigarElement(indelSize2 * 2, (indelOp2 > 0 ? CigarOperator.I : CigarOperator.D))); - givenCigar.add(new CigarElement((indelOp2 < 0 ? (refIndel2/2 - indelSize2) * 2 : refIndel2/2 * 2) + refString.length(), CigarOperator.M)); - - String theRef = preRefString + refString + Utils.dupString(indelString1, refIndel1) + refString + Utils.dupString(indelString2, refIndel2) + refString + postRefString; - String theRead = refString + Utils.dupString(indelString1, refIndel1 + indelOp1 * indelSize1) + refString + Utils.dupString(indelString2, refIndel2 + indelOp2 * indelSize2) + refString; - - Cigar calculatedCigar = new DeBruijnAssembler().leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0); - Assert.assertEquals(AlignmentUtils.consolidateCigar(calculatedCigar).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar strings do not match!"); - } - } - } - } - } - - @Test(enabled = true) - public void testLeftAlignCigarSequentiallyAdjacentID() { - final String ref = "GTCTCTCTCTCTCTCTCTATATATATATATATATTT"; - final String hap = "GTCTCTCTCTCTCTCTCTCTCTATATATATATATTT"; - final Cigar originalCigar = TextCigarCodec.getSingleton().decode("18M4I12M4D2M"); - - final Cigar result = new DeBruijnAssembler().leftAlignCigarSequentially(originalCigar, ref.getBytes(), hap.getBytes(), 0, 0); - logger.warn("Result is " + result); - 
Assert.assertEquals(originalCigar.getReferenceLength(), result.getReferenceLength(), "Reference lengths are different"); - } - private static class MockBuilder extends DeBruijnGraphBuilder { public final List addedPairs = new LinkedList(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 9d4c52798..d6c6a4f33 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "0bf5ae740bf9bd14c8d60d7849c45eb3"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fc11b553fbf16beac0da04a69f419365"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "7d2cc5c4ece386beedf6b07dfbe5bf26"); + "90cbcc7e959eb591fb7c5e12d65e0e40"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "a17856f709b546eaed486841d78248d2"); + "50894abb9d156bf480881cb5cb2a8a7d"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index d5e163a88..15516d090 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -80,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "2e10ab97afd4492c2a153b85871a2c2d"); + HCTest(CEUTRIO_BAM, "", "37e462379de17bc6c8aeeed6e9735dd3"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "affed81386dfe60e0b0d4e7e0525918f"); + HCTest(NA12878_BAM, "", "983a0d122714d4aa0ff7af20cc686703"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "e2d32d0dce2c5502a8e877f6bbb65a10"); + "dbbc884a975587d8e7255ce47b58f438"); } @Test @@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "125e91ebe43108b2b514c58a9b6d3a4f"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "ce602282e80cca6d4272f940e20e90c3"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -149,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "2d295ce36066d9d8d9ee9c67e6e2cbd1"); + HCTestNearbySmallIntervals(NA12878_BAM, "", 
"09335c01d2e90714af7f4c91156da0b1"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -159,14 +159,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b34ddc93a7b9919e05da499508f44dd9")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("153d2251de7d22f423cd282b1505fbc0")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("98a78b9f58ab197b827ef2ce3ab043d3")); executeTest("HCTestStructuralIndels: ", spec); } @@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("0c29e4049908ec47a3159dce33d477c3")); + Arrays.asList("6e6ef6e0326bee6d20d9fd37349fdb8c")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -196,7 +196,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void 
testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("3306889b8d0735ce575bee281c1b8846")); + Arrays.asList("5e535983b2f7e5fb6c84fecffa092324")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java index c049121a3..9b08e8214 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java @@ -1,48 +1,48 @@ /* - * By downloading the PROGRAM you agree to the following terms of use: - * - * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY - * - * This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). - * - * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and - * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. - * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: - * - * 1. 
DEFINITIONS - * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. - * - * 2. LICENSE - * 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. - * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. - * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. - * 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. - * - * 3. OWNERSHIP OF INTELLECTUAL PROPERTY - * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. - * Copyright 2012 Broad Institute, Inc. - * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. - * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. - * - * 4. INDEMNIFICATION - * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. - * - * 5. NO REPRESENTATIONS OR WARRANTIES - * THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. - * IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. - * - * 6. ASSIGNMENT - * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. - * - * 7. MISCELLANEOUS - * 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. - * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. - * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. - * 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. - * 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. - * 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. - * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
- */ +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; @@ -50,6 +50,9 @@ import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.Test; +import java.util.HashSet; +import java.util.Set; + public class KMerCounterCaseFixUnitTest extends BaseTest { @Test public void testMyData() { @@ -76,6 +79,18 @@ public class KMerCounterCaseFixUnitTest extends BaseTest { testCounting(counter, "NNC", 0); Assert.assertNotNull(counter.toString()); + + assertCounts(counter, 5); + assertCounts(counter, 4, "ATG"); + assertCounts(counter, 3, "ATG", "ACC"); + assertCounts(counter, 2, "ATG", "ACC", "AAA"); + assertCounts(counter, 1, "ATG", "ACC", "AAA", "CTG", "NNA", "CCC"); + } + + private void assertCounts(final KMerCounter counter, final int minCount, final String ... 
expecteds) { + final Set expected = new HashSet(); + for ( final String one : expecteds ) expected.add(new Kmer(one)); + Assert.assertEquals(new HashSet(counter.getKmersWithCountsAtLeast(minCount)), expected); } private void testCounting(final KMerCounter counter, final String in, final int expectedCount) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java new file mode 100644 index 000000000..a517e1cb1 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java @@ -0,0 +1,280 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.collections.PrimitivePair; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +public class LocalAssemblyEngineUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + private IndexedFastaSequenceFile seq; + private SAMFileHeader header; + + @BeforeClass + public void setup() throws FileNotFoundException { + seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + genomeLocParser = new GenomeLocParser(seq); + header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + } + + private enum Assembler {DEBRUIJN_ASSEMBLER, READ_THREADING_ASSEMBLER} + private LocalAssemblyEngine createAssembler(final Assembler type) { + switch ( type ) { + case DEBRUIJN_ASSEMBLER: 
return new DeBruijnAssembler(); + case READ_THREADING_ASSEMBLER: return new ReadThreadingAssembler(); + default: throw new IllegalStateException("Unexpected " + type); + } + } + + @DataProvider(name = "AssembleIntervalsData") + public Object[][] makeAssembleIntervalsData() { + List tests = new ArrayList(); + + final String contig = "20"; + final int start = 10000000; + final int end = 10100000; + final int windowSize = 100; + final int stepSize = 200; + final int nReadsToUse = 5; + + for ( final Assembler assembler : Assembler.values() ) { + for ( int startI = start; startI < end; startI += stepSize) { + final int endI = startI + windowSize; + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI); + tests.add(new Object[]{assembler, refLoc, nReadsToUse}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @DataProvider(name = "AssembleIntervalsWithVariantData") + public Object[][] makeAssembleIntervalsWithVariantData() { + List tests = new ArrayList(); + + final String contig = "20"; + final int start = 10000000; + final int end = 10001000; + final int windowSize = 100; + final int stepSize = 200; + final int variantStepSize = 1; + final int nReadsToUse = 5; + + for ( final Assembler assembler : Assembler.values() ) { + for ( int startI = start; startI < end; startI += stepSize) { + final int endI = startI + windowSize; + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI); + for ( int variantStart = windowSize / 2 - 10; variantStart < windowSize / 2 + 10; variantStart += variantStepSize ) { + tests.add(new Object[]{assembler, refLoc, nReadsToUse, variantStart}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AssembleIntervalsData") + public void testAssembleRef(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse) { + final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); + + final List reads = new 
LinkedList(); + for ( int i = 0; i < nReadsToUse; i++ ) { + final byte[] bases = refBases.clone(); + final byte[] quals = Utils.dupBytes((byte) 30, refBases.length); + final String cigar = refBases.length + "M"; + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, loc.getContig(), loc.getContigIndex(), loc.getStart(), bases, quals, cigar); + reads.add(read); + } + + // TODO -- generalize to all assemblers + final Haplotype refHaplotype = new Haplotype(refBases, true); + final List haplotypes = assemble(assembler, refBases, loc, reads); + Assert.assertEquals(haplotypes, Collections.singletonList(refHaplotype)); + } + + @Test(dataProvider = "AssembleIntervalsWithVariantData") + public void testAssembleRefAndSNP(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) { + final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); + final Allele refBase = Allele.create(refBases[variantSite], true); + final Allele altBase = Allele.create((byte)(refBase.getBases()[0] == 'A' ? 
'C' : 'A'), false); + final VariantContextBuilder vcb = new VariantContextBuilder("x", loc.getContig(), variantSite, variantSite, Arrays.asList(refBase, altBase)); + testAssemblyWithVariant(assembler, refBases, loc, nReadsToUse, vcb.make()); + } + + @Test(dataProvider = "AssembleIntervalsWithVariantData") + public void testAssembleRefAndDeletion(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) { + final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); + for ( int deletionLength = 1; deletionLength < 10; deletionLength++ ) { + final Allele refBase = Allele.create(new String(refBases).substring(variantSite, variantSite + deletionLength + 1), true); + final Allele altBase = Allele.create(refBase.getBases()[0], false); + final VariantContextBuilder vcb = new VariantContextBuilder("x", loc.getContig(), variantSite, variantSite + deletionLength, Arrays.asList(refBase, altBase)); + testAssemblyWithVariant(assembler, refBases, loc, nReadsToUse, vcb.make()); + } + } + + @Test(dataProvider = "AssembleIntervalsWithVariantData") + public void testAssembleRefAndInsertion(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) { + final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); + for ( int insertionLength = 1; insertionLength < 10; insertionLength++ ) { + final Allele refBase = Allele.create(refBases[variantSite], false); + final Allele altBase = Allele.create(new String(refBases).substring(variantSite, variantSite + insertionLength + 1), true); + final VariantContextBuilder vcb = new VariantContextBuilder("x", loc.getContig(), variantSite, variantSite + insertionLength, Arrays.asList(refBase, altBase)); + testAssemblyWithVariant(assembler, refBases, loc, nReadsToUse, vcb.make()); + } + } + + private void testAssemblyWithVariant(final Assembler assembler, final byte[] refBases, final 
GenomeLoc loc, final int nReadsToUse, final VariantContext site) { + final String preRef = new String(refBases).substring(0, site.getStart()); + final String postRef = new String(refBases).substring(site.getEnd() + 1, refBases.length); + final byte[] altBases = (preRef + site.getAlternateAllele(0).getBaseString() + postRef).getBytes(); + +// logger.warn("ref " + new String(refBases)); +// logger.warn("alt " + new String(altBases)); + + final List reads = new LinkedList(); + for ( int i = 0; i < nReadsToUse; i++ ) { + final byte[] bases = altBases.clone(); + final byte[] quals = Utils.dupBytes((byte) 30, altBases.length); + final String cigar = altBases.length + "M"; + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, loc.getContig(), loc.getContigIndex(), loc.getStart(), bases, quals, cigar); + reads.add(read); + } + + final Haplotype refHaplotype = new Haplotype(refBases, true); + final Haplotype altHaplotype = new Haplotype(altBases, false); + final List haplotypes = assemble(assembler, refBases, loc, reads); + Assert.assertEquals(haplotypes, Arrays.asList(refHaplotype, altHaplotype)); + } + + + private List assemble(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final List reads) { + final Haplotype refHaplotype = new Haplotype(refBases, true); + final ActiveRegion activeRegion = new ActiveRegion(loc, null, true, genomeLocParser, 0); + activeRegion.addAll(reads); + final LocalAssemblyEngine engine = createAssembler(assembler); +// logger.warn("Assembling " + activeRegion + " with " + engine); + return engine.runLocalAssembly(activeRegion, refHaplotype, refBases, loc, Collections.emptyList()); + } + + @DataProvider(name = "SimpleAssemblyTestData") + public Object[][] makeSimpleAssemblyTestData() { + List tests = new ArrayList(); + + final String contig = "20"; + final int start = 10000000; + final int windowSize = 200; + final int end = start + windowSize; + + final Map edgeExcludesByAssembler = new 
EnumMap<>(Assembler.class); + edgeExcludesByAssembler.put(Assembler.DEBRUIJN_ASSEMBLER, 26); + edgeExcludesByAssembler.put(Assembler.READ_THREADING_ASSEMBLER, 25); // TODO -- decrease to zero when the edge calling problem is fixed + + final String ref = new String(seq.getSubsequenceAt(contig, start, end).getBases()); + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, start, end); + + for ( final Assembler assembler : Assembler.values() ) { + final int excludeVariantsWithXbp = edgeExcludesByAssembler.get(assembler); + for ( int snpPos = 0; snpPos < windowSize; snpPos++) { + if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) { + final byte[] altBases = ref.getBytes(); + altBases[snpPos] = 'N'; + final String alt = new String(altBases); + tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "SimpleAssemblyTestData") + public void testSimpleAssembly(final String name, final Assembler assembler, final GenomeLoc loc, final String ref, final String alt) { + final byte[] refBases = ref.getBytes(); + final byte[] altBases = alt.getBytes(); + + final List reads = new LinkedList<>(); + for ( int i = 0; i < 20; i++ ) { + final byte[] bases = altBases.clone(); + final byte[] quals = Utils.dupBytes((byte) 30, altBases.length); + final String cigar = altBases.length + "M"; + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, loc.getContig(), loc.getContigIndex(), loc.getStart(), bases, quals, cigar); + reads.add(read); + } + + final Haplotype refHaplotype = new Haplotype(refBases, true); + final Haplotype altHaplotype = new Haplotype(altBases, false); + final List haplotypes = assemble(assembler, refBases, loc, reads); + Assert.assertTrue(haplotypes.size() > 0, "Failed to find ref haplotype"); + Assert.assertEquals(haplotypes.get(0), refHaplotype); + + Assert.assertEquals(haplotypes.size(), 2, 
"Failed to find single alt haplotype"); + Assert.assertEquals(haplotypes.get(1), altHaplotype); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java index 7df6ee6c8..ea1d120b6 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java @@ -83,7 +83,10 @@ public class BaseEdgeUnitTest extends BaseTest { e.setMultiplicity(mult + 1); Assert.assertEquals(e.getMultiplicity(), mult + 1); - final BaseEdge copy = new BaseEdge(e); + e.incMultiplicity(2); + Assert.assertEquals(e.getMultiplicity(), mult + 3); + + final BaseEdge copy = e.copy(); Assert.assertEquals(copy.isRef(), e.isRef()); Assert.assertEquals(copy.getMultiplicity(), e.getMultiplicity()); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java index c829488ba..e57f5d6e0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java @@ -49,8 +49,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import scala.actors.threadpool.Arrays; import java.io.File; import java.util.*; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java index 8682ae5e4..cfed2f0b8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java @@ -137,12 +137,12 @@ public class CommonSuffixMergerUnitTest extends BaseTest { public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) { try { final Set haplotypes = new HashSet(); - final List> originalPaths = new KBestPaths().getKBestPaths(original); - for ( final Path path : originalPaths ) + final List> originalPaths = new KBestPaths().getKBestPaths(original); + for ( final Path path : originalPaths ) haplotypes.add(new String(path.getBases())); - final List> splitPaths = new KBestPaths().getKBestPaths(actual); - for ( final Path path : splitPaths ) { + final List> splitPaths = new KBestPaths().getKBestPaths(actual); + for ( final Path path : splitPaths ) { final String h = new String(path.getBases()); Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java index 1ed20e5f4..9703d76cb 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java @@ -154,16 +154,16 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { original.addEdge(v3, v4, new BaseEdge(false, 34)); original.addEdge(v4, v2, new BaseEdge(false, 42)); - original.printGraph(new 
File("testSplitInfiniteCycleFailure.dot"), 0); +// original.printGraph(new File("testSplitInfiniteCycleFailure.dot"), 0); final SeqGraph graph = (SeqGraph)original.clone(); final boolean success = new CommonSuffixSplitter().split(graph, v2); Assert.assertTrue(success); for ( final SeqVertex v : graph.vertexSet() ) { - graph.printGraph(new File("testSplitInfiniteCycleFailure.first_split.dot"), 0); +// graph.printGraph(new File("testSplitInfiniteCycleFailure.first_split.dot"), 0); final boolean success2 = new CommonSuffixSplitter().split((SeqGraph)graph.clone(), v); - if ( success2 ) graph.printGraph(new File("testSplitInfiniteCycleFailure.fail.dot"), 0); +// if ( success2 ) graph.printGraph(new File("testSplitInfiniteCycleFailure.fail.dot"), 0); Assert.assertFalse(success2, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtilsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtilsUnitTest.java new file mode 100644 index 000000000..01a6b5dbb --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtilsUnitTest.java @@ -0,0 +1,120 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.collections.PrimitivePair; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class GraphUtilsUnitTest extends BaseTest { + @DataProvider(name = "findLongestUniqueMatchData") + public Object[][] makefindLongestUniqueMatchData() { + List tests = new ArrayList(); + + { // test all edge conditions + final String ref = "ACGT"; + for ( int start = 0; start < ref.length(); start++ ) { + for ( int end = start + 1; end <= ref.length(); end++ ) { + final String kmer = ref.substring(start, end); + tests.add(new Object[]{ref, kmer, end - 1, end - start}); + tests.add(new Object[]{ref, "N" + kmer, end - 1, end - start}); + tests.add(new Object[]{ref, "NN" + kmer, end - 1, end - start}); + tests.add(new Object[]{ref, kmer + "N", -1, 0}); + tests.add(new Object[]{ref, kmer + "NN", -1, 0}); + } + } + } + + { // multiple matches + final String ref = "AACCGGTT"; + for ( final String alt : Arrays.asList("A", "C", "G", "T") ) + tests.add(new Object[]{ref, alt, -1, 0}); + tests.add(new 
Object[]{ref, "AA", 1, 2}); + tests.add(new Object[]{ref, "CC", 3, 2}); + tests.add(new Object[]{ref, "GG", 5, 2}); + tests.add(new Object[]{ref, "TT", 7, 2}); + } + + { // complex matches that have unique substrings of lots of parts of kmer in the ref + final String ref = "ACGTACGTACGT"; + tests.add(new Object[]{ref, "ACGT", -1, 0}); + tests.add(new Object[]{ref, "TACGT", -1, 0}); + tests.add(new Object[]{ref, "GTACGT", -1, 0}); + tests.add(new Object[]{ref, "CGTACGT", -1, 0}); + tests.add(new Object[]{ref, "ACGTACGT", -1, 0}); + tests.add(new Object[]{ref, "TACGTACGT", 11, 9}); + tests.add(new Object[]{ref, "NTACGTACGT", 11, 9}); + tests.add(new Object[]{ref, "GTACGTACGT", 11, 10}); + tests.add(new Object[]{ref, "NGTACGTACGT", 11, 10}); + tests.add(new Object[]{ref, "CGTACGTACGT", 11, 11}); + } + + return tests.toArray(new Object[][]{}); + } + + /** + * Checks GraphUtils.findLongestUniqueSuffixMatch against the expected match position and length + */ + @Test(dataProvider = "findLongestUniqueMatchData") + public void testfindLongestUniqueMatch(final String seq, final String kmer, final int start, final int length) { + // a start of -1 means no unique match is expected, so the result must be null + final PrimitivePair.Int actual = GraphUtils.findLongestUniqueSuffixMatch(seq.getBytes(), kmer.getBytes()); + if ( start == -1 ) + Assert.assertNull(actual); + else { + Assert.assertNotNull(actual); + Assert.assertEquals(actual.first, start); + Assert.assertEquals(actual.second, length); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java index d1bae74b2..d6709672a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java @@ -114,7 +114,7 @@ public class 
KBestPathsUnitTest extends BaseTest { if ( addCycle ) graph.addEdge(middleBottom, middleBottom); // enumerate all possible paths - final List> paths = new KBestPaths(allowCycles).getKBestPaths(graph, starts, ends); + final List> paths = new KBestPaths(allowCycles).getKBestPaths(graph, starts, ends); final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * (addCycle && allowCycles ? 2 : 1) * nEndNodes; Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths"); @@ -127,7 +127,7 @@ public class KBestPathsUnitTest extends BaseTest { // get the best path, and make sure it's the same as our optimal path overall final Path best = paths.get(0); - final List> justOne = new KBestPaths(allowCycles).getKBestPaths(graph, 1, starts, ends); + final List> justOne = new KBestPaths(allowCycles).getKBestPaths(graph, 1, starts, ends); Assert.assertEquals(justOne.size(), 1); Assert.assertTrue(justOne.get(0).pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0)); } @@ -147,7 +147,7 @@ public class KBestPathsUnitTest extends BaseTest { graph.addEdges(v4, v2); // enumerate all possible paths - final List> paths = new KBestPaths(false).getKBestPaths(graph, v1, v5); + final List> paths = new KBestPaths(false).getKBestPaths(graph, v1, v5); Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths"); } @@ -163,7 +163,7 @@ public class KBestPathsUnitTest extends BaseTest { graph.addEdges(v1, v2, v3, v3); // enumerate all possible paths - final List> paths = new KBestPaths(false).getKBestPaths(graph, v1, v3); + final List> paths = new KBestPaths(false).getKBestPaths(graph, v1, v3); Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths"); } @@ -201,9 +201,9 @@ public class KBestPathsUnitTest extends BaseTest { graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); // Construct the test path - Path path = new Path(v, graph); - path = new 
Path(path, graph.getEdge(v, v2Alt)); - path = new Path(path, graph.getEdge(v2Alt, v3)); + Path path = new Path(v, graph); + path = new Path(path, graph.getEdge(v, v2Alt)); + path = new Path(path, graph.getEdge(v2Alt, v3)); // Construct the actual cigar string implied by the test path Cigar expectedCigar = new Cigar(); @@ -219,7 +219,8 @@ public class KBestPathsUnitTest extends BaseTest { } expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M)); - Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); + final String ref = preRef + v2Ref.getSequenceString() + postRef; + Assert.assertEquals(path.calculateCigar(ref.getBytes()).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); } @DataProvider(name = "GetBasesData") @@ -251,9 +252,9 @@ public class KBestPathsUnitTest extends BaseTest { } // enumerate all possible paths - final List> paths = new KBestPaths().getKBestPaths(graph); + final List> paths = new KBestPaths().getKBestPaths(graph); Assert.assertEquals(paths.size(), 1); - final Path path = paths.get(0); + final Path path = paths.get(0); Assert.assertEquals(new String(path.getBases()), Utils.join("", frags), "Path doesn't have the expected sequence"); } @@ -296,6 +297,8 @@ public class KBestPathsUnitTest extends BaseTest { SeqVertex v7 = new SeqVertex(postRef); SeqVertex postV = new SeqVertex(postAltOption); + final String ref = preRef + v2Ref.getSequenceString() + midRef1 + v4Ref.getSequenceString() + midRef2 + v6Ref.getSequenceString() + postRef; + graph.addVertex(preV); graph.addVertex(v); graph.addVertex(v2Ref); @@ -324,18 +327,18 @@ public class KBestPathsUnitTest extends BaseTest { graph.addEdge(v7, postV, new BaseEdge(false, 1)); // Construct the test path - Path path = new Path( (offRefBeginning ? preV : v), graph); + Path path = new Path( (offRefBeginning ? 
preV : v), graph); if( offRefBeginning ) { - path = new Path(path, graph.getEdge(preV, v)); + path = new Path(path, graph.getEdge(preV, v)); } - path = new Path(path, graph.getEdge(v, v2Alt)); - path = new Path(path, graph.getEdge(v2Alt, v3)); - path = new Path(path, graph.getEdge(v3, v4Ref)); - path = new Path(path, graph.getEdge(v4Ref, v5)); - path = new Path(path, graph.getEdge(v5, v6Alt)); - path = new Path(path, graph.getEdge(v6Alt, v7)); + path = new Path(path, graph.getEdge(v, v2Alt)); + path = new Path(path, graph.getEdge(v2Alt, v3)); + path = new Path(path, graph.getEdge(v3, v4Ref)); + path = new Path(path, graph.getEdge(v4Ref, v5)); + path = new Path(path, graph.getEdge(v5, v6Alt)); + path = new Path(path, graph.getEdge(v6Alt, v7)); if( offRefEnding ) { - path = new Path(path, graph.getEdge(v7,postV)); + path = new Path(path, graph.getEdge(v7,postV)); } // Construct the actual cigar string implied by the test path @@ -373,7 +376,9 @@ public class KBestPathsUnitTest extends BaseTest { expectedCigar.add(new CigarElement(postAltOption.length(), CigarOperator.I)); } - Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); + Assert.assertEquals(path.calculateCigar(ref.getBytes()).toString(), + AlignmentUtils.consolidateCigar(expectedCigar).toString(), + "Cigar string mismatch: ref = " + ref + " alt " + new String(path.getBases())); } @Test(enabled = !DEBUG) @@ -389,43 +394,46 @@ public class KBestPathsUnitTest extends BaseTest { graph.addEdges(new BaseEdge(true, 1), top, ref, bot); graph.addEdges(new BaseEdge(false, 1), top, alt, bot); - final KBestPaths pathFinder = new KBestPaths(); - final List> paths = pathFinder.getKBestPaths(graph, top, bot); + final KBestPaths pathFinder = new KBestPaths(); + final List> paths = pathFinder.getKBestPaths(graph, top, bot); Assert.assertEquals(paths.size(), 2); - final Path refPath = paths.get(0); - final Path altPath = paths.get(1); + 
final Path refPath = paths.get(0); + final Path altPath = paths.get(1); - Assert.assertEquals(refPath.calculateCigar().toString(), "10M"); - Assert.assertEquals(altPath.calculateCigar().toString(), "1M3I5M3D1M"); + final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString(); + Assert.assertEquals(refPath.calculateCigar(refString.getBytes()).toString(), "10M"); + Assert.assertEquals(altPath.calculateCigar(refString.getBytes()).toString(), "1M3I5M3D1M"); } @Test(enabled = !DEBUG) public void testHardSWPath() { // Construct the assembly graph SeqGraph graph = new SeqGraph(); - final SeqVertex top = new SeqVertex( "NNN"); - final SeqVertex bot = new SeqVertex( "NNN"); + final SeqVertex top = new SeqVertex( "NNN" ); + final SeqVertex bot = new SeqVertex( "NNN" ); final SeqVertex alt = new SeqVertex( "ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); final SeqVertex ref = new SeqVertex( "TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); graph.addVertices(top, bot, alt, ref); graph.addEdges(new BaseEdge(true, 1), top, ref, bot); graph.addEdges(new BaseEdge(false, 1), top, alt, bot); - final KBestPaths pathFinder = new KBestPaths(); - final List> paths = pathFinder.getKBestPaths(graph, top, bot); + final KBestPaths pathFinder = new KBestPaths(); + final List> paths = pathFinder.getKBestPaths(graph, top, bot); Assert.assertEquals(paths.size(), 2); - final Path refPath = paths.get(0); - final Path altPath = paths.get(1); + final Path refPath = paths.get(0); + final Path altPath = paths.get(1); - logger.warn("RefPath : " + refPath + " cigar " + refPath.calculateCigar()); - logger.warn("AltPath : " + altPath + " cigar " + altPath.calculateCigar()); + final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString(); - Assert.assertEquals(refPath.calculateCigar().toString(), "51M"); - Assert.assertEquals(altPath.calculateCigar().toString(), "3M6I48M"); + logger.warn("RefPath : " + refPath + " 
cigar " + refPath.calculateCigar(refString.getBytes())); + logger.warn("AltPath : " + altPath + " cigar " + altPath.calculateCigar(refString.getBytes())); + + Assert.assertEquals(refPath.calculateCigar(refString.getBytes()).toString(), "51M"); + Assert.assertEquals(altPath.calculateCigar(refString.getBytes()).toString(), "3M6I48M"); } // ----------------------------------------------------------------- @@ -466,30 +474,87 @@ public class KBestPathsUnitTest extends BaseTest { // Construct the assembly graph SeqGraph graph = new SeqGraph(); - SeqVertex top = new SeqVertex(""); + final int padSize = 0; + SeqVertex top = new SeqVertex(Utils.dupString("N", padSize)); SeqVertex ref = new SeqVertex(prefix + refMid + end); SeqVertex alt = new SeqVertex(prefix + altMid + end); - SeqVertex bot = new SeqVertex(""); + SeqVertex bot = new SeqVertex(Utils.dupString("N", padSize)); graph.addVertices(top, ref, alt, bot); graph.addEdges(new BaseEdge(true, 1), top, ref, bot); graph.addEdges(new BaseEdge(false, 1), top, alt, bot); // Construct the test path - Path path = Path.makePath(Arrays.asList(top, alt, bot), graph); + Path path = Path.makePath(Arrays.asList(top, alt, bot), graph); Cigar expected = new Cigar(); + expected.add(new CigarElement(padSize, CigarOperator.M)); if ( ! prefix.equals("") ) expected.add(new CigarElement(prefix.length(), CigarOperator.M)); for ( final CigarElement elt : TextCigarCodec.getSingleton().decode(midCigar).getCigarElements() ) expected.add(elt); if ( ! end.equals("") ) expected.add(new CigarElement(end.length(), CigarOperator.M)); + expected.add(new CigarElement(padSize, CigarOperator.M)); expected = AlignmentUtils.consolidateCigar(expected); - final Cigar pathCigar = path.calculateCigar(); + final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString(); + final Cigar pathCigar = path.calculateCigar(refString.getBytes()); logger.warn("diffs: " + ref + " vs. 
" + alt + " cigar " + midCigar); logger.warn("Path " + path + " with cigar " + pathCigar); logger.warn("Expected cigar " + expected); - Assert.assertEquals(pathCigar, expected, "Cigar mismatch"); + Assert.assertEquals(pathCigar, expected, "Cigar mismatch: ref = " + refString + " vs alt = " + new String(path.getBases())); + } + + @Test(enabled = !DEBUG) + public void testLeftAlignCigarSequentially() { + String preRefString = "GATCGATCGATC"; + String postRefString = "TTT"; + String refString = "ATCGAGGAGAGCGCCCCG"; + String indelString1 = "X"; + String indelString2 = "YZ"; + int refIndel1 = 10; + int refIndel2 = 12; + + for ( final int indelSize1 : Arrays.asList(1, 2, 3, 4) ) { + for ( final int indelOp1 : Arrays.asList(1, -1) ) { + for ( final int indelSize2 : Arrays.asList(1, 2, 3, 4) ) { + for ( final int indelOp2 : Arrays.asList(1, -1) ) { + + Cigar expectedCigar = new Cigar(); + expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); + expectedCigar.add(new CigarElement(indelSize1, (indelOp1 > 0 ? CigarOperator.I : CigarOperator.D))); + expectedCigar.add(new CigarElement((indelOp1 < 0 ? refIndel1 - indelSize1 : refIndel1), CigarOperator.M)); + expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); + expectedCigar.add(new CigarElement(indelSize2 * 2, (indelOp2 > 0 ? CigarOperator.I : CigarOperator.D))); + expectedCigar.add(new CigarElement((indelOp2 < 0 ? (refIndel2 - indelSize2) * 2 : refIndel2 * 2), CigarOperator.M)); + expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M)); + + Cigar givenCigar = new Cigar(); + givenCigar.add(new CigarElement(refString.length() + refIndel1/2, CigarOperator.M)); + givenCigar.add(new CigarElement(indelSize1, (indelOp1 > 0 ? CigarOperator.I : CigarOperator.D))); + givenCigar.add(new CigarElement((indelOp1 < 0 ? (refIndel1/2 - indelSize1) : refIndel1/2) + refString.length() + refIndel2/2 * 2, CigarOperator.M)); + givenCigar.add(new CigarElement(indelSize2 * 2, (indelOp2 > 0 ? 
CigarOperator.I : CigarOperator.D))); + givenCigar.add(new CigarElement((indelOp2 < 0 ? (refIndel2/2 - indelSize2) * 2 : refIndel2/2 * 2) + refString.length(), CigarOperator.M)); + + String theRef = preRefString + refString + Utils.dupString(indelString1, refIndel1) + refString + Utils.dupString(indelString2, refIndel2) + refString + postRefString; + String theRead = refString + Utils.dupString(indelString1, refIndel1 + indelOp1 * indelSize1) + refString + Utils.dupString(indelString2, refIndel2 + indelOp2 * indelSize2) + refString; + + Cigar calculatedCigar = Path.leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0); + Assert.assertEquals(AlignmentUtils.consolidateCigar(calculatedCigar).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar strings do not match!"); + } + } + } + } + } + + @Test(enabled = true) + public void testLeftAlignCigarSequentiallyAdjacentID() { + final String ref = "GTCTCTCTCTCTCTCTCTATATATATATATATATTT"; + final String hap = "GTCTCTCTCTCTCTCTCTCTCTATATATATATATTT"; + final Cigar originalCigar = TextCigarCodec.getSingleton().decode("18M4I12M4D2M"); + + final Cigar result = Path.leftAlignCigarSequentially(originalCigar, ref.getBytes(), hap.getBytes(), 0, 0); + logger.warn("Result is " + result); + Assert.assertEquals(originalCigar.getReferenceLength(), result.getReferenceLength(), "Reference lengths are different"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java new file mode 100644 index 000000000..06d81499c --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java @@ -0,0 +1,163 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* 
BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class LowWeightChainPrunerUnitTest extends BaseTest { + @DataProvider(name = "pruneChainsData") + public Object[][] makePruneChainsData() { + List tests = new ArrayList<>(); + + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("C"); + final SeqVertex v3 = new SeqVertex("G"); + final SeqVertex v4 = new SeqVertex("T"); + final SeqVertex v5 = new SeqVertex("AA"); + final SeqVertex v6 = new SeqVertex("CC"); + + for ( final int edgeWeight : Arrays.asList(1, 2, 3) ) { + for ( final int pruneFactor : Arrays.asList(1, 2, 3, 4) ) { + for ( final boolean isRef : Arrays.asList(true, false)) { + { // just an isolated chain + final int nExpected = edgeWeight < pruneFactor && ! isRef ? 3 : 0; + SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3); + graph.addEdges(new BaseEdge(isRef, edgeWeight), v1, v2, v3); + tests.add(new Object[]{"combinatorial", graph, pruneFactor, nExpected > 0 ? 
Collections.emptySet() : graph.vertexSet()}); + } + } + } + } + + { // connects to ref chain + SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3); + graph.addVertices(v4, v5); + graph.addEdges(new BaseEdge(true, 1), v4, v5); + graph.addEdges(new BaseEdge(false, 1), v4, v1, v2, v3, v5); + tests.add(new Object[]{"bad internal branch", graph, 2, new HashSet<>(Arrays.asList(v4, v5))}); + } + + { // has bad cycle + SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3, v4); + graph.addEdges(new BaseEdge(false, 1), v4, v1, v2, v3, v1); + // note that we'll remove v4 because it's low weight + tests.add(new Object[]{"has bad cycle", graph, 2, Collections.emptySet()}); + } + + { // has good cycle + SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3, v4); + graph.addEdges(new BaseEdge(false, 3), v4, v1, v2, v3, v1); + // note that we'll remove v4 because it's low weight + tests.add(new Object[]{"has good cycle", graph, 2, graph.vertexSet()}); + } + + { // has branch + SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3, v4, v5, v6); + graph.addEdges(new BaseEdge(false, 1), v1, v2, v3, v4, v6); + graph.addEdges(new BaseEdge(false, 1), v1, v2, v3, v5, v6); + tests.add(new Object[]{"has two bad branches", graph, 2, Collections.emptySet()}); + } + + { // middle vertex above threshold => no one can be removed + SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3, v4, v5); + graph.addEdges(new BaseEdge(false, 1), v1, v2); + graph.addEdges(new BaseEdge(false, 3), v2, v3); + graph.addEdges(new BaseEdge(false, 1), v3, v4, v5); + tests.add(new Object[]{"middle vertex above factor", graph, 2, graph.vertexSet()}); + } + + { // the branching node has value > pruneFactor + SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3, v4, v5, v6); + graph.addEdges(new BaseEdge(false, 3), v1, v2); + graph.addEdges(new BaseEdge(false, 3), v2, v3); + graph.addEdges(new BaseEdge(false, 1), v3, v4, v6); + graph.addEdges(new 
BaseEdge(false, 3), v2, v5, v6); + tests.add(new Object[]{"branch node greater than pruneFactor", graph, 2, graph.vertexSet()}); + } + + { // A single isolated chain with weights all below pruning should be pruned + SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3, v4, v5); + graph.addEdges(new BaseEdge(false, 1), v1, v2, v3); + graph.addEdges(new BaseEdge(false, 5), v4, v5); + tests.add(new Object[]{"isolated chain", graph, 2, new LinkedHashSet<>(Arrays.asList(v4, v5))}); + } + + { // A chain with weights all below pruning should be pruned, even if it connects to another good chain + SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3, v4, v5, v6); + graph.addEdges(new BaseEdge(false, 1), v1, v2, v3, v5); + graph.addEdges(new BaseEdge(false, 5), v4, v5, v6); + tests.add(new Object[]{"bad chain branching into good one", graph, 2, new HashSet<>(Arrays.asList(v4, v5, v6))}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "pruneChainsData", enabled = true) + public void testPruneChains(final String name, final SeqGraph graph, final int pruneFactor, final Set remainingVertices) { + final Set copy = new HashSet<>(remainingVertices); +// graph.printGraph(new File("in.dot"), 0); + final LowWeightChainPruner pruner = new LowWeightChainPruner<>(pruneFactor); + pruner.pruneLowWeightChains(graph); +// graph.printGraph(new File("out.dot"), 0); + Assert.assertEquals(graph.vertexSet(), copy); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java new file mode 100644 index 000000000..f11be6635 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java @@ -0,0 +1,103 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - 
SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class MultiSampleEdgeUnitTest extends BaseTest { + @DataProvider(name = "MultiplicityData") + public Object[][] makeMultiplicityData() { + List tests = new ArrayList(); + + final List countsPerSample = Arrays.asList(0, 1, 2, 3, 4, 5); + for ( final int nSamples : Arrays.asList(1, 2, 3, 4, 5)) { + for ( final List perm : Utils.makePermutations(countsPerSample, nSamples, false) ) { + tests.add(new Object[]{perm}); + } + } + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "MultiplicityData") + public void testMultiplicity(final List countsPerSample) { + final MultiSampleEdge edge = new MultiSampleEdge(false, 0); + Assert.assertEquals(edge.getMultiplicity(), 0); + Assert.assertEquals(edge.getPruningMultiplicity(), 0); + + int total = 0; + for ( int i = 0; 
i < countsPerSample.size(); i++ ) { + int countForSample = 0; + for ( int count = 0; count < countsPerSample.get(i); count++ ) { + edge.incMultiplicity(1); + total++; + countForSample++; + Assert.assertEquals(edge.getMultiplicity(), total); + Assert.assertEquals(edge.getCurrentSingleSampleMultiplicity(), countForSample); + } + edge.flushSingleSampleMultiplicity(); + } + + final int max = MathUtils.arrayMax(ArrayUtils.toPrimitive(countsPerSample.toArray(new Integer[countsPerSample.size()]))); + Assert.assertEquals(edge.getMultiplicity(), total); + Assert.assertEquals(edge.getPruningMultiplicity(), max); + Assert.assertEquals(edge.getMaxSingleSampleMultiplicity(), max); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java new file mode 100644 index 000000000..ee07bea33 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java @@ -0,0 +1,80 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/
+
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
+
+import net.sf.samtools.Cigar;
+import org.broadinstitute.sting.BaseTest;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+/**
+ * Unit tests for {@code Path.calculateCigar} using haplotypes that are far shorter
+ * than the reference they are aligned against.
+ *
+ * Each test places the whole haplotype in a single {@link SeqVertex}, wraps that
+ * vertex in a {@link Path} and asks the path for its cigar against the reference.
+ */
+public class PathUnitTest extends BaseTest {
+    /**
+     * The haplotype is 81 bases while the reference is several hundred bases long.
+     * The test expects {@code calculateCigar} to return {@code null} for this input
+     * instead of throwing.
+     */
+    @Test(enabled = true)
+    public void testAlignReallyLongDeletion() {
+        final String ref = "ATGGTGGCTCATACCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGAACATCACCTGAGGCCAGGAGTTCAAAACCAGCCTGGCTAACATAGCAAAACCCCATCTCTAATGAAAATACAAAAATTAGCTGGGTGTGGTGGTGTCCGCCTGTAGTCCCAGCTACTCAGGAGACTAAGGCATGAGAATCACTTGAACCCAGGATGCAGAGGCTGTAGTGAGCCGAGATTGCACCACGGCTGCACTCCAGCCTGGGCAACAGAGCGAGACTCTGTCTCAAATAAAATAGCGTAACGTAACATAACATAACATAACATAACATAACATAACATAACATAACATAACATAACATAACACAACAACAAAATAAAATAACATAAATCATGTTGTTAGGAAAAAAATCAGTTATGCAGCTACATGCTATTTACAAGAGATATACCTTAAAATATAAGACACAGAGGCCGGGCGCGGTAGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAAGCGGATCATGAGGTCAGGAGATCGAGACCATCC";
+        final String hap = "ATGGTGGCTCATACCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAAGCGGATCATGAGGTCAGGAGATCGAGACCATCCT";
+
+        final SeqGraph graph = new SeqGraph();
+        final SeqVertex v = new SeqVertex(hap);
+        graph.addVertex(v);
+        final Path path = new Path(v, graph);
+        final Cigar cigar = path.calculateCigar(ref.getBytes());
+        Assert.assertNull(cigar, "Should have failed gracefully");
+    }
+
+    /**
+     * Same setup as {@link #testAlignReallyLongDeletion()} with a different
+     * reference/haplotype pair; here the alignment is expected to succeed and to be
+     * reported as the cigar {@code 48M419D30M}.
+     */
+    @Test(enabled = true)
+    public void testAlignReallyLongDeletion2() {
+        final String ref = "CGGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGCCAGGCTGGTCTTGAACTCCTGACCTCAGGTGATCCACTCGCCTCGGTCTCCCAAAGTGTTGGGATTACAGGCATGAACCACTGCACCTGGCCTAGTGTTTGGGAAAACTATACTAGGAAAAGAATAGTTGCTTTAAGTCATTCTTTGATTATTCTGAGAATTGGCATATAGCTGCCATTATAACCTACTTTTGCTAAATATAATAATAATAATCATTATTTTTATTTTTTGAGACAGGGTCTTGTTTTGTCACCCCGGCTGGAGTGAAGTGGCGCAATCTCGGCTCACTGCAACCTCCACCTCCGGGTGCAAGCAATTCTCCTGCCTCAGCCTCTTGAGTAGCTAGGATTACAGGCACAAGCCATCATGCCCAGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGTCAGGCTGGTCTTGAACTCCTGACCTCAGGT";
+        final String hap = "CGGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGTCAGGCTGGTCTTGAACTCCTGACCTCAGGT";
+
+        final SeqGraph graph = new SeqGraph();
+        final SeqVertex v = new SeqVertex(hap);
+        graph.addVertex(v);
+        final Path path = new Path(v, graph);
+        final Cigar cigar = path.calculateCigar(ref.getBytes());
+        // calculateCigar may return null (see testAlignReallyLongDeletion); fail with a
+        // readable message instead of a NullPointerException on cigar.toString()
+        Assert.assertNotNull(cigar, "Expected a cigar for the long deletion but calculateCigar returned null");
+        Assert.assertEquals(cigar.toString(), "48M419D30M");
+    }
+}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index bd2e3cc2c..c72f426be 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -280,16 +280,15 @@ public class SeqGraphUnitTest extends BaseTest { all.addEdges(pre2, top, middle2, bottom, tail2); final SeqGraph expected = new SeqGraph(); + SeqVertex newPre1 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "C"); + SeqVertex newPre2 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "G"); + final SeqVertex newTop = new SeqVertex("TA"); final SeqVertex newMiddle1 = new SeqVertex("G"); final SeqVertex newMiddle2 = new SeqVertex("T"); final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString()); - final SeqVertex newTop = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES)); - final SeqVertex newTopDown1 = new SeqVertex("G"); - final SeqVertex newTopDown2 = new SeqVertex("C"); - final SeqVertex newTopBottomMerged = new SeqVertex("TA"); - expected.addVertices(newTop, newTopDown1, newTopDown2, newTopBottomMerged, newMiddle1, newMiddle2, newBottom, tail1, tail2); - expected.addEdges(newTop, newTopDown1, newTopBottomMerged, newMiddle1, newBottom, tail1); - expected.addEdges(newTop, newTopDown2, newTopBottomMerged, newMiddle2, newBottom, tail2); + expected.addVertices(newPre1, newPre2, 
newTop, newMiddle1, newMiddle2, newBottom, tail1, tail2); + expected.addEdges(newPre1, newTop, newMiddle1, newBottom, tail1); + expected.addEdges(newPre2, newTop, newMiddle2, newBottom, tail2); tests.add(new Object[]{all.clone(), expected.clone()}); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java index 2df783b19..5bc13f884 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java @@ -227,8 +227,8 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { } final Set haplotypes = new HashSet(); - final List> originalPaths = new KBestPaths().getKBestPaths((SeqGraph)graph.clone()); - for ( final Path path : originalPaths ) + final List> originalPaths = new KBestPaths().getKBestPaths((SeqGraph)graph.clone()); + for ( final Path path : originalPaths ) haplotypes.add(new String(path.getBases())); final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); @@ -238,8 +238,8 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { splitter.updateGraph(top, bot); if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0); - final List> splitPaths = new KBestPaths().getKBestPaths(graph); - for ( final Path path : splitPaths ) { + final List> splitPaths = new KBestPaths().getKBestPaths(graph); + for ( final Path path : splitPaths ) { final String h = new String(path.getBases()); Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); } diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java new file mode 100644 index 000000000..8efb3d486 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java @@ -0,0 +1,213 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/
+
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
+
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
+import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.util.*;
+
+/**
+ * Unit tests that feed a reference sequence plus zero or more reads to a
+ * {@link ReadThreadingAssembler} and check the shape of the resulting {@link SeqGraph}:
+ * either a single linear vertex or a bubble with exactly two paths.
+ */
+public class ReadThreadingAssemblerUnitTest extends BaseTest {
+    /** When true, every test in this class is disabled and the assembled graph is written to test.dot. */
+    private final static boolean DEBUG = false;
+
+    /**
+     * Small harness around a {@link ReadThreadingAssembler} that is built with a single
+     * kmer size, set to return the raw graph, and given a prune factor of 0.
+     */
+    private static class TestAssembler {
+        final ReadThreadingAssembler assembler;
+
+        Haplotype refHaplotype;
+        final List<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
+
+        private TestAssembler(final int kmerSize) {
+            this.assembler = new ReadThreadingAssembler(100000, Arrays.asList(kmerSize));
+            assembler.setJustReturnRawGraph(true);
+            assembler.setPruneFactor(0);
+        }
+
+        /**
+         * Registers a sequence with the harness.
+         *
+         * @param bases the sequence bases
+         * @param isRef if true the bases become the reference haplotype; otherwise they are
+         *              wrapped in an artificial read (a byte array of 30s, presumably the base
+         *              qualities, and the string {@code <length>M}, presumably the cigar)
+         */
+        public void addSequence(final byte[] bases, final boolean isRef) {
+            if ( isRef ) {
+                refHaplotype = new Haplotype(bases, true);
+            } else {
+                final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte)30,bases.length), bases.length + "M");
+                reads.add(read);
+            }
+        }
+
+        /**
+         * Runs the assembler over the collected reads and reference.
+         *
+         * @return the first graph returned by the assembler
+         */
+        public SeqGraph assemble() {
+            assembler.removePathsNotConnectedToRef = false; // need to pass some of the tests
+            assembler.setDebugGraphTransformations(true);
+            final SeqGraph graph = assembler.assemble(reads, refHaplotype).get(0);
+            if ( DEBUG ) graph.printGraph(new File("test.dot"), 0);
+            return graph;
+        }
+    }
+
+    /**
+     * Assembles, simplifies the graph, and asserts that it consists of exactly one vertex
+     * whose sequence equals {@code seq}.
+     */
+    private void assertLinearGraph(final TestAssembler assembler, final String seq) {
+        final SeqGraph graph = assembler.assemble();
+        graph.simplifyGraph();
+        Assert.assertEquals(graph.vertexSet().size(), 1);
+        Assert.assertEquals(graph.vertexSet().iterator().next().getSequenceString(), seq);
+    }
+
+    /**
+     * Assembles, simplifies the graph, and asserts that it has exactly two paths whose
+     * bases spell {@code one} and {@code two} (in either order, each matched once).
+     */
+    private void assertSingleBubble(final TestAssembler assembler, final String one, final String two) {
+        final SeqGraph graph = assembler.assemble();
+        graph.simplifyGraph();
+        // NOTE(review): the type arguments of Path/KBestPaths were missing from the flattened
+        // patch text ("List> paths"); <SeqVertex,BaseEdge> is a reconstruction -- confirm
+        // against the declarations in the graphs package.
+        List<Path<SeqVertex,BaseEdge>> paths = new KBestPaths<SeqVertex,BaseEdge>().getKBestPaths(graph);
+        Assert.assertEquals(paths.size(), 2);
+        final Set<String> expected = new HashSet<String>(Arrays.asList(one, two));
+        for ( final Path<SeqVertex,BaseEdge> path : paths ) {
+            final String seq = new String(path.getBases());
+            Assert.assertTrue(expected.contains(seq));
+            // remove so that the same expected sequence cannot satisfy both paths
+            expected.remove(seq);
+        }
+    }
+
+    /** A reference alone should assemble into a single linear vertex. */
+    @Test(enabled = ! DEBUG)
+    public void testRefCreation() {
+        final String ref = "ACGTAACCGGTT";
+        final TestAssembler assembler = new TestAssembler(3);
+        assembler.addSequence(ref.getBytes(), true);
+        assertLinearGraph(assembler, ref);
+    }
+
+    /** A reference containing a repeated 3-mer (AAA) should still assemble linearly. */
+    @Test(enabled = ! DEBUG)
+    public void testRefNonUniqueCreation() {
+        final String ref = "GAAAAT";
+        final TestAssembler assembler = new TestAssembler(3);
+        assembler.addSequence(ref.getBytes(), true);
+        assertLinearGraph(assembler, ref);
+    }
+
+    /** A full-length read with one mismatch should produce a ref/alt bubble. */
+    @Test(enabled = ! DEBUG)
+    public void testRefAltCreation() {
+        final TestAssembler assembler = new TestAssembler(3);
+        final String ref = "ACAACTGA";
+        final String alt = "ACAGCTGA";
+        assembler.addSequence(ref.getBytes(), true);
+        assembler.addSequence(alt.getBytes(), false);
+        assertSingleBubble(assembler, ref, alt);
+    }
+
+    /** Two overlapping partial reads should together spell the full alternate path. */
+    @Test(enabled = ! DEBUG)
+    public void testPartialReadsCreation() {
+        final TestAssembler assembler = new TestAssembler(3);
+        final String ref = "ACAACTGA";
+        final String alt1 = "ACAGCT";
+        final String alt2 = "GCTGA";
+        assembler.addSequence(ref.getBytes(), true);
+        assembler.addSequence(alt1.getBytes(), false);
+        assembler.addSequence(alt2.getBytes(), false);
+        assertSingleBubble(assembler, ref, "ACAGCTGA");
+    }
+
+    /** A read that matches a suffix of the reference should not add any new path. */
+    @Test(enabled = ! DEBUG)
+    public void testStartInMiddle() {
+        final TestAssembler assembler = new TestAssembler(3);
+        final String ref = "CAAAATG";
+        final String read = "AAATG";
+        assembler.addSequence(ref.getBytes(), true);
+        assembler.addSequence(read.getBytes(), false);
+        assertLinearGraph(assembler, ref);
+    }
+
+    /** A read starting inside the reference and carrying a mismatch should create one bubble. */
+    @Test(enabled = ! DEBUG)
+    public void testStartInMiddleWithBubble() {
+        final TestAssembler assembler = new TestAssembler(3);
+        final String ref = "CAAAATGGGG";
+        final String read = "AAATCGGG";
+        assembler.addSequence(ref.getBytes(), true);
+        assembler.addSequence(read.getBytes(), false);
+        assertSingleBubble(assembler, ref, "CAAAATCGGG");
+    }
+
+    // NOTE(review): this test uses exactly the same ref, read and expectation as
+    // testStartInMiddleWithBubble -- confirm whether different inputs were intended.
+    @Test(enabled = ! DEBUG)
+    public void testNoGoodStarts() {
+        final TestAssembler assembler = new TestAssembler(3);
+        final String ref = "CAAAATGGGG";
+        final String read = "AAATCGGG";
+        assembler.addSequence(ref.getBytes(), true);
+        assembler.addSequence(read.getBytes(), false);
+        assertSingleBubble(assembler, ref, "CAAAATCGGG");
+    }
+
+
+    /** A read that overlaps the end of the reference and runs past it should yield one linear vertex. */
+    @Test(enabled = !DEBUG)
+    public void testCreateWithBasesBeforeRefSource() {
+        final TestAssembler assembler = new TestAssembler(3);
+        final String ref = "ACTG";
+        final String read = "CTGGGACT";
+        assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(ref), true);
+        assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(read), false);
+        assertLinearGraph(assembler, "ACTGGGACT");
+    }
+
+    /**
+     * Two identical reads carrying one 10-base deletion (written as '-' in the alignment
+     * strings) should give exactly two paths: the reference and the deletion haplotype.
+     */
+    @Test(enabled = !DEBUG)
+    public void testSingleIndelAsDoubleIndel3Reads() {
+        final TestAssembler assembler = new TestAssembler(25);
+        // The single indel spans two repetitive structures
+        final String ref   = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCTCTCTGTGTGTGTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG";
+        final String read1 = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCT----------GTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG";
+        final String read2 = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCT----------GTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG";
+        assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(ref), true);
+        assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(read1), false);
+        assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(read2), false);
+
+        final SeqGraph graph = assembler.assemble();
+        // NOTE(review): type arguments reconstructed as <SeqVertex,BaseEdge> -- confirm
+        final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<SeqVertex,BaseEdge>();
+        final List<Path<SeqVertex,BaseEdge>> paths = pathFinder.getKBestPaths(graph);
+        Assert.assertEquals(paths.size(), 2);
+        // the path whose length equals the reference length is taken to be the reference path
+        final byte[] refPath = paths.get(0).getBases().length == ref.length() ? paths.get(0).getBases() : paths.get(1).getBases();
+        final byte[] altPath = paths.get(0).getBases().length == ref.length() ? paths.get(1).getBases() : paths.get(0).getBases();
+        Assert.assertEquals(refPath, ReadThreadingGraphUnitTest.getBytes(ref));
+        Assert.assertEquals(altPath, ReadThreadingGraphUnitTest.getBytes(read1));
+    }
+}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java new file mode 100644 index 000000000..10c1cc00d --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java @@ -0,0 +1,191 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
+
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.util.*;
+
+/**
+ * Unit tests for {@link ReadThreadingGraph}: which kmers it reports as non-unique and
+ * which multiplicities it assigns to edges once reference and read sequences are added.
+ */
+public class ReadThreadingGraphUnitTest extends BaseTest {
+    /** When true, every test in this class is disabled and graph dumps to test.dot are enabled. */
+    private final static boolean DEBUG = false;
+
+    /**
+     * Converts an alignment-style string into bases by dropping every '-' gap character.
+     *
+     * @param alignment sequence text in which '-' marks a gap
+     * @return the bytes of the remaining characters, encoded with the platform default charset
+     */
+    public static byte[] getBytes(final String alignment) {
+        return alignment.replace("-","").getBytes();
+    }
+
+    /**
+     * Builds the graph if necessary and asserts that the set of base strings of its
+     * non-unique kmers equals {@code nonUniques}.
+     */
+    private void assertNonUniques(final ReadThreadingGraph assembler, String ... nonUniques) {
+        final Set<String> actual = new HashSet<>();
+        assembler.buildGraphIfNecessary();
+        for ( final Kmer kmer : assembler.getNonUniqueKmers() ) actual.add(kmer.baseString());
+        final Set<String> expected = new HashSet<>(Arrays.asList(nonUniques));
+        Assert.assertEquals(actual, expected);
+    }
+
+    /** The 3-mers ACA and CAC occur more than once in the reference and must be reported as non-unique. */
+    @Test(enabled = ! DEBUG)
+    public void testNonUniqueMiddle() {
+        final ReadThreadingGraph assembler = new ReadThreadingGraph(3);
+        final String ref   = "GACACACAGTCA";
+        final String read1 = "GACAC---GTCA";
+        final String read2 =   "CAC---GTCA";
+        assembler.addSequence(getBytes(ref), true);
+        assembler.addSequence(getBytes(read1), false);
+        assembler.addSequence(getBytes(read2), false);
+        assertNonUniques(assembler, "ACA", "CAC");
+    }
+
+    /** A kmer that is unique in the reference becomes non-unique once a read duplicates it. */
+    @Test(enabled = ! DEBUG)
+    public void testReadsCreateNonUnique() {
+        final ReadThreadingGraph assembler = new ReadThreadingGraph(3);
+        final String ref   = "GCAC--GTCA"; // CAC is unique
+        final String read1 = "GCACACGTCA"; // makes CAC non unique because it has a duplication
+        final String read2 =    "CACGTCA"; // shouldn't be allowed to match CAC as start
+        assembler.addSequence(getBytes(ref), true);
+        assembler.addSequence(getBytes(read1), false);
+        assembler.addSequence(getBytes(read2), false);
+//        assembler.convertToSequenceGraph().printGraph(new File("test.dot"), 0);
+
+        assertNonUniques(assembler, "CAC");
+        //assertSingleBubble(assembler, ref, "CAAAATCGGG");
+    }
+
+    /**
+     * Edges touching a vertex whose suffix is 'N' (the reference-only header) must have
+     * multiplicity 1; every other edge is covered by both the reference and the read and
+     * must have multiplicity 2.
+     */
+    @Test(enabled = ! DEBUG)
+    public void testCountingOfStartEdges() {
+        final ReadThreadingGraph assembler = new ReadThreadingGraph(3);
+        final String ref   = "NNNGTCAAA"; // ref has some bases before start
+        final String read1 =    "GTCAAA"; // starts at first non N base
+
+        assembler.addSequence(getBytes(ref), true);
+        assembler.addSequence(getBytes(read1), false);
+        assembler.buildGraphIfNecessary();
+//        assembler.printGraph(new File("test.dot"), 0);
+
+        for ( final MultiSampleEdge edge : assembler.edgeSet() ) {
+            final MultiDeBruijnVertex source = assembler.getEdgeSource(edge);
+            final MultiDeBruijnVertex target = assembler.getEdgeTarget(edge);
+            final boolean headerVertex = source.getSuffix() == 'N' || target.getSuffix() == 'N';
+            if ( headerVertex ) {
+                Assert.assertEquals(edge.getMultiplicity(), 1, "Bases in the unique reference header should have multiplicity of 1");
+            } else {
+                Assert.assertEquals(edge.getMultiplicity(), 2, "Should have multiplicity of 2 for any edge outside the ref header but got " + edge + " " + source + " -> " + target);
+            }
+        }
+    }
+
+    /**
+     * With {@code increaseCountsThroughBranches} enabled, checks the multiplicity of every
+     * edge by the sequence of its target vertex: 1 for the vertices listed in
+     * {@code oneCountVertices}, 3 for those in {@code threeCountVertices}, 2 otherwise.
+     */
+    @Test(enabled = !DEBUG)
+    public void testCountingOfStartEdgesWithMultiplePrefixes() {
+        final ReadThreadingGraph assembler = new ReadThreadingGraph(3);
+        assembler.increaseCountsThroughBranches = true;
+        final String ref  = "NNNGTCAXX"; // ref has some bases before start
+        final String alt1 = "NNNCTCAXX"; // alt1 has SNP right after N
+        final String read =     "TCAXX"; // starts right after SNP, but merges right before branch
+
+        assembler.addSequence(getBytes(ref), true);
+        assembler.addSequence(getBytes(alt1), false);
+        assembler.addSequence(getBytes(read), false);
+        assembler.buildGraphIfNecessary();
+        // only dump the graph when debugging, so a normal test run leaves no test.dot behind
+        if ( DEBUG ) assembler.printGraph(new File("test.dot"), 0);
+
+        final List<String> oneCountVertices = Arrays.asList("NNN", "NNG", "NNC", "NGT", "NCT");
+        final List<String> threeCountVertices = Arrays.asList("CAX", "AXX");
+
+        for ( final MultiSampleEdge edge : assembler.edgeSet() ) {
+            final MultiDeBruijnVertex source = assembler.getEdgeSource(edge);
+            final MultiDeBruijnVertex target = assembler.getEdgeTarget(edge);
+            final int expected = oneCountVertices.contains(target.getSequenceString()) ? 1 : (threeCountVertices.contains(target.getSequenceString()) ? 3 : 2);
+            Assert.assertEquals(edge.getMultiplicity(), expected, "Bases at edge " + edge + " from " + source + " to " + target + " has bad multiplicity");
+        }
+    }
+
+    // TODO -- update to use determineKmerSizeAndNonUniques directly
+//    @DataProvider(name = "KmerSizeData")
+//    public Object[][] makeKmerSizeDataProvider() {
+//        List tests = new ArrayList();
+//
+//        // this functionality can be adapted to provide input data for whatever you might want in your data
+//        tests.add(new Object[]{3, 3, 3, Arrays.asList("ACG"), Arrays.asList()});
+//        tests.add(new Object[]{3, 4, 3, Arrays.asList("CAGACG"), Arrays.asList()});
+//
+//        tests.add(new Object[]{3, 3, 3, Arrays.asList("AAAAC"), Arrays.asList("AAA")});
+//        tests.add(new Object[]{3, 4, 4, Arrays.asList("AAAAC"), Arrays.asList()});
+//        tests.add(new Object[]{3, 5, 4, Arrays.asList("AAAAC"), Arrays.asList()});
+//        tests.add(new Object[]{3, 4, 3, Arrays.asList("CAAA"), Arrays.asList()});
+//        tests.add(new Object[]{3, 4, 4, Arrays.asList("CAAAA"), Arrays.asList()});
+//        tests.add(new Object[]{3, 5, 4, Arrays.asList("CAAAA"), Arrays.asList()});
+//        tests.add(new Object[]{3, 5, 5, Arrays.asList("ACGAAAAACG"), Arrays.asList()});
+//
+//        for ( int maxSize = 3; maxSize < 20; maxSize++ ) {
+//            for ( int dupSize = 3; dupSize < 20; dupSize++ ) {
+//                final int expectedSize = Math.min(maxSize, dupSize);
+//                final String dup = Utils.dupString("C", dupSize);
+//                final List nonUnique = dupSize > maxSize ? Arrays.asList(Utils.dupString("C", maxSize)) : Collections.emptyList();
+//                tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("ACGT", "A" + dup + "GT"), nonUnique});
+//                tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("A" + dup + "GT", "ACGT"), nonUnique});
+//            }
+//        }
+//
+//        return tests.toArray(new Object[][]{});
+//    }
+//
+//    /**
+//     * Example testng test using MyDataProvider
+//     */
+//    @Test(dataProvider = "KmerSizeData")
+//    public void testDynamicKmerSizing(final int min, final int max, final int expectKmer, final List seqs, final List expectedNonUniques) {
+//        final ReadThreadingGraph assembler = new ReadThreadingGraph(min, max);
+//        for ( String seq : seqs ) assembler.addSequence(seq.getBytes(), false);
+//        assembler.buildGraphIfNecessary();
+//        Assert.assertEquals(assembler.getKmerSize(), expectKmer);
+//        assertNonUniques(assembler, expectedNonUniques.toArray(new String[]{}));
+//    }
+
+
+}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java new file mode 100644 index 000000000..7c3160c30 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java @@ -0,0 +1,80 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class SequenceForKmersUnitTest extends BaseTest { /* Unit tests for the fields and per-position counts exposed by SequenceForKmers */ + @Test + public void testNoCount() { + final byte[] seq = "ACGT".getBytes(); + final SequenceForKmers sk = new SequenceForKmers("foo", seq, 0, seq.length, null, true); /* null is passed where testWithCounts passes its counts array */ + Assert.assertEquals(sk.name, "foo"); + Assert.assertEquals(sk.sequence, seq); + Assert.assertEquals(sk.start, 0); + Assert.assertEquals(sk.stop, seq.length); + Assert.assertEquals(sk.isRef, true); + for ( int i = 0; i < seq.length; i++ ) + Assert.assertEquals(sk.getCount(i), 1); /* without a counts array every position is expected to report 1 */ + } + + @Test + public void testWithCounts() { + final int len = 256; + final int[] counts = new int[len]; + for ( int i = 0; i < len; i++ ) counts[i] = i; /* each position's count is its own index */ + final byte[] seq = Utils.dupBytes((byte)'A', len); + + final SequenceForKmers sk = new SequenceForKmers("foo", seq, 0, seq.length, counts, true); + + for ( int i = 0; i < seq.length; i++ ) + Assert.assertEquals(sk.getCount(i), i); /* getCount(i) is expected to return the value supplied for position i */ + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java 
b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 1daaaf1da..f9a4fcdbb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -438,7 +438,7 @@ public class TraverseActiveRegions extends TraversalEngine= 0 + */ + public static int longestCommonPrefix(final byte[] seq1, final byte[] seq2, final int maxLength) { + if ( seq1 == null ) throw new IllegalArgumentException("seq1 is null"); + if ( seq2 == null ) throw new IllegalArgumentException("seq2 is null"); + if ( maxLength < 0 ) throw new IllegalArgumentException("maxLength < 0 " + maxLength); + + final int end = Math.min(seq1.length, Math.min(seq2.length, maxLength)); + for ( int i = 0; i < end; i++ ) { + if ( seq1[i] != seq2[i] ) + return i; + } + return end; + } + + /** + * Get the length of the longest common suffix of seq1 and seq2 + * @param seq1 non-null byte array + * @param seq2 non-null byte array + * @param maxLength the maximum allowed length to return + * @return the length of the longest common suffix of seq1 and seq2, >= 0 + */ + public static int longestCommonSuffix(final byte[] seq1, final byte[] seq2, final int maxLength) { + if ( seq1 == null ) throw new IllegalArgumentException("seq1 is null"); + if ( seq2 == null ) throw new IllegalArgumentException("seq2 is null"); + if ( maxLength < 0 ) throw new IllegalArgumentException("maxLength < 0 " + maxLength); + + final int end = Math.min(seq1.length, Math.min(seq2.length, maxLength)); + for ( int i = 0; i < end; i++ ) { + if ( seq1[seq1.length - i - 1] != seq2[seq2.length - i - 1] ) + return i; + } + return end; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java index 890faa82a..78f81ec5e 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java @@ -45,7 +45,7 @@ import java.util.*; * Date: Mar 23, 2009 * Time: 1:54:54 PM */ -public final class SWPairwiseAlignment { +public final class SWPairwiseAlignment implements SmithWaterman { private int alignment_offset; // offset of s2 w/respect to s1 private Cigar alignmentCigar; @@ -57,7 +57,7 @@ public final class SWPairwiseAlignment { private static final int CLIP = 3; protected static boolean cutoff = false; - private static boolean DO_SOFTCLIP = true; + private boolean doSoftClipping = true; /** * The SW scoring matrix, stored for debugging purposes if keepScoringMatrix is true @@ -90,10 +90,23 @@ public final class SWPairwiseAlignment { * @param parameters the SW parameters to use */ public SWPairwiseAlignment(byte[] seq1, byte[] seq2, Parameters parameters) { - this.parameters = parameters; + this(parameters); align(seq1,seq2); } + /** + * Create a new SW pairwise aligner, without actually doing any alignment yet + * + * @param parameters the SW parameters to use + */ + protected SWPairwiseAlignment(Parameters parameters) { + this.parameters = parameters; + } + + protected void setDoSoftClipping(final boolean doSoftClipping) { + this.doSoftClipping = doSoftClipping; + } + /** * Create a new SW pairwise aligner * @@ -111,8 +124,10 @@ public final class SWPairwiseAlignment { this(seq1,seq2,SWParameterSet.ORIGINAL_DEFAULT); } + @Override public Cigar getCigar() { return alignmentCigar ; } + @Override public int getAlignmentStart2wrt1() { return alignment_offset; } public void align(final byte[] a, final byte[] b) { @@ -265,7 +280,7 @@ public final class SWPairwiseAlignment { List lce = new ArrayList(5); - if ( segment_length > 0 && DO_SOFTCLIP ) { + if ( segment_length > 0 && doSoftClipping ) { lce.add(makeElement(CLIP, segment_length)); segment_length = 0; } @@ -316,7 +331,7 @@ 
public final class SWPairwiseAlignment { // last 3 bases of the read overlap with/align to the ref), the cigar will be still 5M if // DO_SOFTCLIP is false or 2S3M if DO_SOFTCLIP is true. // The consumers need to check for the alignment offset and deal with it properly. - if (DO_SOFTCLIP ) { + if (doSoftClipping ) { lce.add(makeElement(state, segment_length)); if ( p2> 0 ) lce.add(makeElement(CLIP, p2)); alignment_offset = p1 ; @@ -360,7 +375,7 @@ public final class SWPairwiseAlignment { Cigar cigar = getCigar(); - if ( ! DO_SOFTCLIP ) { + if ( ! doSoftClipping ) { // we need to go through all the hassle below only if we do not do softclipping; // otherwise offset is never negative diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java new file mode 100644 index 000000000..44fd889c5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java @@ -0,0 +1,56 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.smithwaterman; + +import net.sf.samtools.Cigar; + +/** + * Generic interface for SmithWaterman calculations + * + * This interface allows clients to use a generic SmithWaterman variable, without propagating the specific + * implementation of SmithWaterman throughout their code: + * + * SmithWaterman sw = new SpecificSmithWatermanImplementation(ref, read, params) + * sw.getCigar() + * sw.getAlignmentStart2wrt1() + * + * User: depristo + * Date: 4/26/13 + * Time: 8:24 AM + */ +public interface SmithWaterman { + /** + * Get the cigar string for the alignment of this SmithWaterman class + * @return a non-null cigar + */ + public Cigar getCigar(); + + /** + * Get the starting position of the read sequence in the reference sequence + * @return a non-negative integer (>= 0) + */ + public int getAlignmentStart2wrt1(); +} diff --git a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java index 154b000ce..3c68b8753 100644 --- a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java @@ -29,6 +29,7 @@ import org.apache.commons.io.FileUtils; import org.broadinstitute.sting.utils.io.IOUtils; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; @@ -189,4 +190,30 @@ public class UtilsUnitTest extends BaseTest { final String sourceString = FileUtils.readFileToString(source); Assert.assertEquals(Utils.calcMD5(sourceString), sourceMD5); } + + @Test + public void testLongestCommonOps() { + for ( int 
prefixLen = 0; prefixLen < 20; prefixLen++ ) { + for ( int extraSeq1Len = 0; extraSeq1Len < 10; extraSeq1Len++ ) { + for ( int extraSeq2Len = 0; extraSeq2Len < 10; extraSeq2Len++ ) { + for ( int max = 0; max < 50; max++ ) { + final String prefix = Utils.dupString("A", prefixLen); + final int expected = Math.min(prefixLen, max); /* the shared run of A's, capped by max */ + + { + final String seq1 = prefix + Utils.dupString("C", extraSeq1Len); + final String seq2 = prefix + Utils.dupString("G", extraSeq2Len); /* independent tail length, so the two sequences can differ in length */ + Assert.assertEquals(Utils.longestCommonPrefix(seq1.getBytes(), seq2.getBytes(), max), expected, "LongestCommonPrefix failed: seq1 " + seq1 + " seq2 " + seq2 + " max " + max); + } + + { + final String seq1 = Utils.dupString("C", extraSeq1Len) + prefix; + final String seq2 = Utils.dupString("G", extraSeq2Len) + prefix; /* independent head length, so the two sequences can differ in length */ + Assert.assertEquals(Utils.longestCommonSuffix(seq1.getBytes(), seq2.getBytes(), max), expected, "longestCommonSuffix failed: seq1 " + seq1 + " seq2 " + seq2 + " max " + max); + } + } + } + } + } + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index ae7c1e01c..6ec4336b0 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -33,8 +33,10 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -86,6 +88,30 @@ public class ReadClipperUnitTest extends BaseTest { } } + @DataProvider(name = "ClippedReadLengthData") + public Object[][] makeClippedReadLengthData() { + List tests = new ArrayList(); + + // this functionality can be 
adapted to provide input data for whatever you might want in your data + final int originalReadLength = 50; + for ( int nToClip = 1; nToClip < originalReadLength - 1; nToClip++ ) { + tests.add(new Object[]{originalReadLength, nToClip}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ClippedReadLengthData", enabled = true) + public void testHardClipReadLengthIsRight(final int originalReadLength, final int nToClip) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(originalReadLength + "M"); + read.getReadLength(); // provoke the caching of the read length + final int expectedReadLength = originalReadLength - nToClip; + GATKSAMRecord clipped = ReadClipper.hardClipByReadCoordinates(read, 0, nToClip - 1); + Assert.assertEquals(clipped.getReadLength(), expectedReadLength, + String.format("Clipped read length %d with cigar %s not equal to the expected read length %d after clipping %d bases from the left from a %d bp read with cigar %s", + clipped.getReadLength(), clipped.getCigar(), expectedReadLength, nToClip, read.getReadLength(), read.getCigar())); + } + @Test(enabled = true) public void testHardClipByReferenceCoordinates() { for (Cigar cigar : cigarList) {