Merge branch 'master' of github.com:broadinstitute/gsa-unstable

This commit is contained in:
Menachem Fromer 2013-05-14 10:15:21 -04:00
commit de54223aed
88 changed files with 5919 additions and 1422 deletions

View File

@ -1031,6 +1031,7 @@
<delete dir="${staging.dir}"/>
<delete dir="${dist.dir}"/>
<delete dir="${pipelinetest.dir}"/>
<delete dir="${integration.tests.dir}"/>
</target>
<!-- Depend on this target if your target requires a clean working directory but you don't want to depend on clean directly -->
@ -1043,6 +1044,7 @@
<available file="${staging.dir}" />
<available file="${dist.dir}" />
<available file="${pipelinetest.dir}" />
<available file="${integration.tests.dir}" />
<available file="${javadoc.dir}" />
<available file="${scaladoc.dir}" />
<available file="${gatkdocs.dir}" />
@ -1078,6 +1080,7 @@
<property name="scala.public.test.sources" value="${public.dir}/scala/test"/>
<property name="scala.private.test.sources" value="${private.dir}/scala/test"/>
<property name="scala.protected.test.sources" value="${protected.dir}/scala/test"/>
<property name="integration.tests.dir" value="integrationtests" />
<property name="pipelinetest.dir" value="pipelinetests" />
<property name="report" value="${build.dir}/report"/>
<property name="iwww.report.dir" value="${user.home}/private_html/report"/>

View File

@ -219,6 +219,10 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
@Hidden
@Argument(fullName = "force_readgroup", shortName = "fRG", required = false, doc = "If provided, the read group of EVERY read will be forced to be the provided String.")
public String FORCE_READGROUP = null;
@Hidden
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", defaultToStdout = false)
public PrintStream RECAL_TABLE_UPDATE_LOG = null;

View File

@ -64,6 +64,7 @@ import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -236,6 +237,15 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
public int downsampleCoverage = 250;
/**
* Generally, this tool is not meant to be run for more than 1 sample at a time. The one valid exception
* brought to our attention by colleagues is the specific case of tumor/normal pairs in cancer analysis.
* To prevent users from unintentionally running the tool in a less than ideal manner, we require them
* to explicitly enable multi-sample analysis with this argument.
*/
@Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "enable multi-samples reduction for cancer analysis", required = false)
public boolean ALLOW_MULTIPLE_SAMPLES = false;
@Hidden
@Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
public boolean nwayout = false;
@ -294,6 +304,9 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
if ( minAltProportionToTriggerVariant < 0.0 || minAltProportionToTriggerVariant > 1.0 )
throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
if ( SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()).size() > 1 && !ALLOW_MULTIPLE_SAMPLES )
throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis");
if ( known.isEmpty() )
knownSnpPositions = null;
else

View File

@ -877,6 +877,10 @@ public class SlidingWindow {
final int start = region.getStart() - windowHeaderStart;
int stop = region.getStop() - windowHeaderStart;
// make sure the bitset is complete given the region (it might not be in multi-sample mode)
if ( region.getStop() > markedSites.getStartLocation() + markedSites.getVariantSiteBitSet().length )
markSites(region.getStop());
CloseVariantRegionResult closeVariantRegionResult = closeVariantRegion(start, stop, knownSnpPositions);
allReads.addAll(closeVariantRegionResult.reads);

View File

@ -0,0 +1,142 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.LinkedList;
import java.util.List;
import java.util.TreeSet;
/**
* Trim down an active region based on a set of variants found across the haplotypes within the region
*
* User: depristo
* Date: 4/27/13
* Time: 2:10 PM
*/
class ActiveRegionTrimmer {
    private final static Logger logger = Logger.getLogger(ActiveRegionTrimmer.class);

    // when true, trimRegion reports every intermediate span it computes at INFO level
    private final boolean logTrimming;
    private final int snpPadding, nonSnpPadding, maxDistanceInExtensionForGenotyping;
    private final GenomeLocParser parser;

    /**
     * Create a new ActiveRegionTrimmer
     *
     * @param logTrimming should we log our trimming events?
     * @param snpPadding how much bp context should we ensure around snps?
     * @param nonSnpPadding how much bp context should we ensure around anything not a snp?
     * @param maxDistanceInExtensionForGenotyping the max extent we are willing to go into the extended region of the
     *                                            original active region in order to properly genotype events in the
     *                                            non-extended active region?
     * @param parser a genome loc parser so we can create genome locs
     * @throws IllegalArgumentException if any padding/distance is negative or parser is null
     */
    ActiveRegionTrimmer(boolean logTrimming, int snpPadding, int nonSnpPadding, int maxDistanceInExtensionForGenotyping, GenomeLocParser parser) {
        if ( snpPadding < 0 ) throw new IllegalArgumentException("snpPadding must be >= 0 but got " + snpPadding);
        if ( nonSnpPadding < 0 ) throw new IllegalArgumentException("nonSnpPadding must be >= 0 but got " + nonSnpPadding);
        if ( maxDistanceInExtensionForGenotyping < 0 ) throw new IllegalArgumentException("maxDistanceInExtensionForGenotyping must be >= 0 but got " + maxDistanceInExtensionForGenotyping);
        if ( parser == null ) throw new IllegalArgumentException("parser cannot be null");

        this.logTrimming = logTrimming;
        this.snpPadding = snpPadding;
        this.nonSnpPadding = nonSnpPadding;
        this.maxDistanceInExtensionForGenotyping = maxDistanceInExtensionForGenotyping;
        this.parser = parser;
    }

    /**
     * Trim down the active region to a region large enough to properly genotype the events found within the active
     * region span, excluding all variants that only occur within its extended span.
     *
     * This function merely creates the region, but it doesn't populate the reads back into the region.
     *
     * @param region our full active region
     * @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position
     * @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null when there is nothing
     *         to trim around: either no variants were provided at all, or none of them overlaps region.getLocation()
     * @throws IllegalArgumentException if region or allVariantsWithinExtendedRegion is null
     */
    public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet<VariantContext> allVariantsWithinExtendedRegion) {
        // fail fast with a descriptive message, mirroring the argument validation done in the constructor
        if ( region == null ) throw new IllegalArgumentException("region cannot be null");
        if ( allVariantsWithinExtendedRegion == null ) throw new IllegalArgumentException("allVariantsWithinExtendedRegion cannot be null");

        if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants at all, so there is nothing to trim around
            return null;

        final GenomeLoc regionLoc = region.getLocation(); // loop invariant, look it up once
        final List<VariantContext> withinActiveRegion = new LinkedList<VariantContext>(); // only kept for logging
        int pad = snpPadding;
        GenomeLoc trimLoc = null;
        for ( final VariantContext vc : allVariantsWithinExtendedRegion ) {
            final GenomeLoc vcLoc = parser.createGenomeLoc(vc);
            if ( regionLoc.overlapsP(vcLoc) ) {
                if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding
                    pad = nonSnpPadding;
                // grow the span so that it covers every overlapping variant seen so far
                trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc);
                withinActiveRegion.add(vc);
            }
        }

        // we don't actually have anything in the region after removing variants that don't overlap the region's full location
        if ( trimLoc == null ) return null;

        // the padded variant span is what we would like to keep, but it is capped by how far we may reach past the region
        final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(regionLoc, maxDistanceInExtensionForGenotyping);
        final GenomeLoc idealSpan = parser.createPaddedGenomeLoc(trimLoc, pad);
        final GenomeLoc finalSpan = maxSpan.intersect(idealSpan);

        final ActiveRegion trimmedRegion = region.trim(finalSpan);
        if ( logTrimming ) {
            logger.info("events : " + withinActiveRegion);
            logger.info("trimLoc : " + trimLoc);
            logger.info("pad : " + pad);
            logger.info("idealSpan : " + idealSpan);
            logger.info("maxSpan : " + maxSpan);
            logger.info("finalSpan : " + finalSpan);
            logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size());
        }
        return trimmedRegion;
    }
}

View File

@ -46,101 +46,53 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
import java.util.*;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* DeBruijn assembler for the HaplotypeCaller
*
* User: ebanks, rpoplin
* Date: Mar 14, 2011
*/
public class DeBruijnAssembler extends LocalAssemblyEngine {
private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class);
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
// TODO -- this number is very low, and limits our ability to explore low-frequency variants. It should
// TODO -- be increased to a large number or eliminated altogether when moving to the bubble caller where
// TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25;
private final static int NUM_PATHS_PER_GRAPH = 25;
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
private static final int GRAPH_KMER_STEP = 6;
private final boolean debug;
private final boolean debugGraphTransformations;
private final int minKmer;
private final boolean allowCyclesInKmerGraphToGeneratePaths;
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
protected DeBruijnAssembler() {
this(false, -1, 11, false);
this(25, -1);
}
public DeBruijnAssembler(final boolean debug,
final int debugGraphTransformations,
final int minKmer,
final boolean allowCyclesInKmerGraphToGeneratePaths) {
super();
this.debug = debug;
this.debugGraphTransformations = debugGraphTransformations > 0;
this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations;
public DeBruijnAssembler(final int minKmer, final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) {
super(NUM_PATHS_PER_GRAPH);
this.minKmer = minKmer;
this.allowCyclesInKmerGraphToGeneratePaths = allowCyclesInKmerGraphToGeneratePaths;
this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
}
/**
 * Entry point of the assembly engine: builds deBruijn-derived sequence graphs from the reads of the active
 * region, optionally writes them out, and turns the best paths through them into haplotypes.
 *
 * @param activeRegion ActiveRegion object holding the reads which are to be used during assembly; must not be null
 * @param refHaplotype reference haplotype object; must not be null
 * @param fullReferenceWithPadding byte array holding the reference sequence with padding; its length must equal refLoc.size()
 * @param refLoc GenomeLoc object corresponding to the reference sequence with padding
 * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode
 * @return the haplotypes produced by findBestPaths; per the contract annotation the list contains refHaplotype
 * @throws IllegalArgumentException if activeRegion or refHaplotype is null, if the reference bases and refLoc
 *                                  differ in size, or if pruneFactor is negative
 */
@Ensures({"result.contains(refHaplotype)"})
public List<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype ) {
    // preconditions are checked in a fixed order so the first violated one is the one reported
    if ( activeRegion == null )
        throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion.");
    if ( refHaplotype == null )
        throw new IllegalArgumentException("Reference haplotype cannot be null.");
    if ( fullReferenceWithPadding.length != refLoc.size() )
        throw new IllegalArgumentException("Reference bases and reference loc must be the same size.");
    if ( pruneFactor < 0 )
        throw new IllegalArgumentException("Pruning factor cannot be negative");

    // step 1: assemble the reads into graphs
    final List<SeqGraph> assembledGraphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );

    // step 2: dump the graphs, but only when a graph writer has been configured
    final boolean graphOutputRequested = graphWriter != null;
    if ( graphOutputRequested )
        printGraphs(assembledGraphs);

    // step 3: convert the best paths through the graphs into haplotypes
    return findBestPaths( assembledGraphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
}
@Requires({"reads != null", "refHaplotype != null"})
protected List<SeqGraph> createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
@Override
protected List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype) {
final List<SeqGraph> graphs = new LinkedList<SeqGraph>();
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
@ -165,10 +117,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
" future subsystem will actually go and error correct the reads");
}
final SeqGraph seqGraph = toSeqGraph(graph);
final SeqGraph seqGraph = cleanupSeqGraph(graph.convertToSequenceGraph());
if ( seqGraph != null ) { // if the graph contains interesting variation from the reference
sanityCheckReferenceGraph(seqGraph, refHaplotype);
graphs.add(seqGraph);
if ( debugGraphTransformations ) // we only want to use one graph size
@ -181,69 +132,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
return graphs;
}
/**
 * Convert a DeBruijnGraph into a sequence graph and clean it up: zip linear chains, prune, simplify, and
 * drop everything that is not connected to the reference.
 *
 * When debugGraphTransformations is set, a snapshot of the graph is written to a .dot file after each stage.
 *
 * @param deBruijnGraph the graph to convert
 * @return the cleaned-up sequence graph, or null if after simplification the graph has no reference source
 *         or no reference sink vertex
 */
private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) {
    final SeqGraph cleaned = deBruijnGraph.convertToSequenceGraph();
    writeSeqGraphDebugSnapshot(cleaned, "sequenceGraph.1.dot");

    // TODO -- a consistent pruning algorithm is still needed. The current one works well, but it cannot tell
    // TODO -- an isolated chain that connects to nothing apart from a chain that is well supported and merely
    // TODO -- has one connection in the middle with weight < pruneFactor. Ultimately pruning should become an
    // TODO -- error correction algorithm that understands the structure of the data and can separate an
    // TODO -- infrequent path with no evidence against it (e.g. a region that is hard to get reads through)
    // TODO -- from an error where lots of weight goes along another, similar path

    // zipping has to come first, otherwise pruneGraph will be too aggressive
    cleaned.zipLinearChains();
    writeSeqGraphDebugSnapshot(cleaned, "sequenceGraph.2.zipped.dot");

    // prune, then remove vertices that are no longer connected to the reference chain.
    // IMPORTANT: this must happen before simplifyGraph, because simplifyGraph adds 0 weight
    // edges to maintain graph connectivity.
    cleaned.pruneGraph(pruneFactor);
    cleaned.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
    writeSeqGraphDebugSnapshot(cleaned, "sequenceGraph.3.pruned.dot");

    cleaned.simplifyGraph();
    writeSeqGraphDebugSnapshot(cleaned, "sequenceGraph.4.merged.dot");

    // A degenerate graph has no identifiable reference source and/or sink. This can happen, for example,
    // when the reference somehow acquires a cycle, or when the entire assembly collapses back into the
    // reference sequence.
    final boolean referenceEndpointMissing =
            cleaned.getReferenceSourceVertex() == null || cleaned.getReferenceSinkVertex() == null;
    if ( referenceEndpointMissing )
        return null;

    cleaned.removePathsNotConnectedToRef();
    cleaned.simplifyGraph();

    final boolean collapsedToSingleVertex = cleaned.vertexSet().size() == 1;
    if ( collapsedToSingleVertex ) {
        // everything assembled perfectly into a single reference haplotype; append an empty seq vertex
        // so that downstream code does not blow up.
        // TODO -- ref properties should really be on the vertices, not the graph itself
        final SeqVertex onlyVertex = cleaned.vertexSet().iterator().next();
        final SeqVertex emptyVertex = new SeqVertex("");
        cleaned.addVertex(emptyVertex);
        cleaned.addEdge(onlyVertex, emptyVertex, new BaseEdge(true, 0));
    }

    writeSeqGraphDebugSnapshot(cleaned, "sequenceGraph.5.final.dot");
    return cleaned;
}

/**
 * Write the graph to the named file, but only when graph-transformation debugging is enabled.
 *
 * @param graph the graph to write
 * @param fileName name of the .dot file to write to
 */
private void writeSeqGraphDebugSnapshot(final SeqGraph graph, final String fileName) {
    if ( debugGraphTransformations )
        graph.printGraph(new File(fileName), pruneFactor);
}
/**
 * Verify that a graph has both a reference source and a reference sink vertex, and that the bytes
 * getReferenceBytes reports for the source-to-sink span equal the bases of the reference haplotype.
 *
 * @param graph the graph to check
 * @param refHaplotype the reference haplotype whose bases the graph's reference path must equal
 * @param <T> the vertex type of the graph
 * @throws IllegalStateException if the source vertex is missing, the sink vertex is missing, or the
 *                               reference bytes of the graph differ from the haplotype bases
 */
protected <T extends BaseVertex> void sanityCheckReferenceGraph(final BaseGraph<T> graph, final Haplotype refHaplotype) {
    final T refSource = graph.getReferenceSourceVertex();
    if( refSource == null ) {
        throw new IllegalStateException("All reference graphs must have a reference source vertex.");
    }
    final T refSink = graph.getReferenceSinkVertex();
    if( refSink == null ) {
        throw new IllegalStateException("All reference graphs must have a reference sink vertex.");
    }

    // walk the reference path once and reuse the result for both the comparison and the error message
    final byte[] graphRefBases = graph.getReferenceBytes(refSource, refSink, true, true);
    if( !Arrays.equals(graphRefBases, refHaplotype.getBases()) ) {
        throw new IllegalStateException("Mismatch between the reference haplotype and the reference assembly graph path." +
                " graph = " + new String(graphRefBases) +
                " haplotype = " + new String(refHaplotype.getBases())
        );
    }
}
@Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"})
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int kmerLength, final Haplotype refHaplotype ) {
final DeBruijnGraph graph = new DeBruijnGraph(kmerLength);
@ -344,290 +232,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
return true;
}
/**
 * Write the given graphs to graphWriter, wrapped in a single "digraph assemblyGraphs" block.
 *
 * When debugGraphTransformations is set, graphs with a kmer size of 50 or more are skipped (and the skip is
 * logged), and writing stops after the first graph that is actually written.
 *
 * @param graphs the graphs to write
 */
protected void printGraphs(final List<SeqGraph> graphs) {
    final int kmerSizeLimitWhenDebugging = 50;

    graphWriter.println("digraph assemblyGraphs {");
    for ( final SeqGraph candidate : graphs ) {
        final boolean skipBecauseTooLarge =
                debugGraphTransformations && candidate.getKmerSize() >= kmerSizeLimitWhenDebugging;
        if ( skipBecauseTooLarge ) {
            logger.info("Skipping writing of graph with kmersize " + candidate.getKmerSize());
        } else {
            candidate.printGraph(graphWriter, false, pruneFactor);
            // in debug mode a single graph is enough
            if ( debugGraphTransformations )
                break;
        }
    }
    graphWriter.println("}");
}
@Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
@Ensures({"result.contains(refHaplotype)"})
private List<Haplotype> findBestPaths( final List<SeqGraph> graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
// TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm
final List<Haplotype> returnHaplotypes = new ArrayList<Haplotype>();
refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
final Cigar c = new Cigar();
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
refHaplotype.setCigar(c);
returnHaplotypes.add( refHaplotype );
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
// for GGA mode, add the desired allele into the haplotype
for( final VariantContext compVC : activeAllelesToGenotype ) {
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
addHaplotypeForGGA( insertedRefHaplotype, refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true );
}
}
for( final SeqGraph graph : graphs ) {
final SeqVertex source = graph.getReferenceSourceVertex();
final SeqVertex sink = graph.getReferenceSinkVertex();
if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);
final KBestPaths<SeqVertex> pathFinder = new KBestPaths<SeqVertex>(allowCyclesInKmerGraphToGeneratePaths);
for ( final Path<SeqVertex> path : pathFinder.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH, source, sink) ) {
// logger.info("Found path " + path);
Haplotype h = new Haplotype( path.getBases() );
if( !returnHaplotypes.contains(h) ) {
final Cigar cigar = path.calculateCigar();
if( cigar.isEmpty() ) {
throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength());
} else if ( pathIsTooDivergentFromReference(cigar) || cigar.getReferenceLength() < 60 ) { // N cigar elements means that a bubble was too divergent from the reference so skip over this path
continue;
} else if( cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // SW failure
throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength());
}
h.setCigar(cigar);
// extend partial haplotypes which are anchored in the reference to include the full active region
h = extendPartialHaplotype(h, activeRegionStart, refWithPadding);
final Cigar leftAlignedCigar = leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(h.getCigar()), refWithPadding, h.getBases(), activeRegionStart, 0);
if( !returnHaplotypes.contains(h) ) {
h.setAlignmentStartHapwrtRef(activeRegionStart);
h.setCigar(leftAlignedCigar);
h.setScore(path.getScore());
returnHaplotypes.add(h);
if ( debug )
logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
// for GGA mode, add the desired allele into the haplotype if it isn't already present
if( !activeAllelesToGenotype.isEmpty() ) {
final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
// This if statement used to additionally have:
// "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
// but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
// a haplotype that already contains a 1bp insertion (so practically it is reference but
// falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
if( vcOnHaplotype == null ) {
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
addHaplotypeForGGA( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false );
}
}
}
}
}
}
}
}
// add genome locs to the haplotypes
for ( final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow);
if ( returnHaplotypes.size() < returnHaplotypes.size() )
logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
if( debug ) {
if( returnHaplotypes.size() > 1 ) {
logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
} else {
logger.info("Found only the reference haplotype in the assembly graph.");
}
for( final Haplotype h : returnHaplotypes ) {
logger.info( h.toString() );
logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() );
}
}
return returnHaplotypes;
}
/**
 * Build a copy of a partial, reference-anchored haplotype that spans the full active region.
 *
 * Walks the haplotype's cigar from left to right. Match (M) and insertion (I) elements are copied
 * unchanged. A deletion (D) that is the first or the last cigar element is filled in with the
 * corresponding reference bases and recorded as a match of the same length; a deletion anywhere
 * else is copied unchanged.
 *
 * @param haplotype         the haplotype to extend; its cigar is read to drive the extension
 * @param activeRegionStart the place where the active region starts in the ref byte array
 * @param refWithPadding    the full reference byte array with padding which encompasses the active region
 * @return a new haplotype carrying the extended bases, the rewritten cigar and the same
 *         is-reference flag as the input
 * @throws IllegalStateException if the cigar contains an operator other than M, I or D
 */
@Requires({"haplotype != null", "activeRegionStart >= 0", "refWithPadding != null", "refWithPadding.length > 0"})
@Ensures({"result != null", "result.getCigar() != null"})
private Haplotype extendPartialHaplotype( final Haplotype haplotype, final int activeRegionStart, final byte[] refWithPadding ) {
    final List<CigarElement> elements = haplotype.getCigar().getCigarElements();
    final int lastPosition = elements.size() - 1;
    final Cigar extendedCigar = new Cigar();
    byte[] extendedBases = haplotype.getBases();

    int refOffset = activeRegionStart; // cursor into refWithPadding
    int hapOffset = 0;                 // cursor into extendedBases
    int position = 0;                  // index of the cigar element being processed

    for ( final CigarElement element : elements ) {
        final int length = element.getLength();
        switch ( element.getOperator() ) {
            case M:
                extendedCigar.add(element);
                refOffset += length;
                hapOffset += length;
                break;
            case I:
                extendedCigar.add(element);
                hapOffset += length;
                break;
            case D: {
                if ( position == 0 || position == lastPosition ) {
                    // a deletion at either end of the cigar: splice the reference bases it covers
                    // into the haplotype at the current offset and record them as matching
                    final byte[] prefix = Arrays.copyOfRange(extendedBases, 0, hapOffset);
                    final byte[] fill = Arrays.copyOfRange(refWithPadding, refOffset, refOffset + length);
                    final byte[] suffix = Arrays.copyOfRange(extendedBases, hapOffset, extendedBases.length);
                    extendedBases = ArrayUtils.addAll(prefix, ArrayUtils.addAll(fill, suffix));
                    hapOffset += length;
                    extendedCigar.add(new CigarElement(length, CigarOperator.M));
                } else {
                    // an interior deletion is kept as is
                    extendedCigar.add(element);
                }
                // either way the deletion consumes reference bases
                refOffset += length;
                break;
            }
            default:
                throw new IllegalStateException("Unsupported cigar operator detected: " + element.getOperator());
        }
        position++;
    }

    final Haplotype extended = new Haplotype(extendedBases, haplotype.isReference());
    extended.setCigar( extendedCigar );
    return extended;
}
/**
 * Report whether a cigar contains a CigarOperator.N element, which is used as the signal that an
 * incomplete or too divergent bubble was found during bubble traversal.
 *
 * @param c the cigar to test
 * @return true if at least one element of the cigar has operator N (the path should be skipped),
 *         false otherwise
 */
@Requires("c != null")
private boolean pathIsTooDivergentFromReference( final Cigar c ) {
    boolean sawSkipOperator = false;
    // stop scanning as soon as the first N element is seen
    for ( int index = 0; index < c.numCigarElements() && !sawSkipOperator; index++ ) {
        sawSkipOperator = c.getCigarElement(index).getOperator().equals(CigarOperator.N);
    }
    return sawSkipOperator;
}
/**
 * Left align the given cigar sequentially. This is needed because AlignmentUtils doesn't accept cigars with more than one indel in them.
 * This is a target of future work to incorporate and generalize into AlignmentUtils for use by others.
 *
 * The cigar is cut into consecutive pieces, each ending at an insertion or deletion element. Every piece is
 * left aligned on its own with AlignmentUtils.leftAlignSingleIndel, and the reference / read start positions
 * are advanced by the lengths of the piece before the next one is processed. Elements after the last
 * indel are appended unchanged, and the concatenation is consolidated before being returned.
 *
 * @param cigar     the cigar to left align
 * @param refSeq    the reference byte array
 * @param readSeq   the read byte array
 * @param refIndex  0-based alignment start position on ref
 * @param readIndex 0-based alignment start position on read
 * @return the left-aligned cigar
 * @throws IllegalStateException if the left-aligned cigar covers a different reference length than the input cigar
 */
// The argument checks are preconditions, so they belong in @Requires (evaluated on entry) rather than in
// @Ensures (evaluated on exit, after refIndex / readIndex have been advanced by the loop below).
@Requires({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"})
@Ensures("result != null")
protected Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
    final Cigar cigarToReturn = new Cigar();
    Cigar cigarToAlign = new Cigar(); // the piece currently being accumulated

    for ( int i = 0; i < cigar.numCigarElements(); i++ ) {
        final CigarElement ce = cigar.getCigarElement(i);
        cigarToAlign.add(ce);

        if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
            // an indel closes the current piece: left align it in isolation and emit the result
            final Cigar leftAligned = AlignmentUtils.leftAlignSingleIndel(cigarToAlign, refSeq, readSeq, refIndex, readIndex, false);
            for ( final CigarElement toAdd : leftAligned.getCigarElements() ) {
                cigarToReturn.add(toAdd);
            }
            // the next piece starts where this one ended on both the reference and the read
            refIndex += cigarToAlign.getReferenceLength();
            readIndex += cigarToAlign.getReadLength();
            cigarToAlign = new Cigar();
        }
    }

    // trailing elements after the last indel need no alignment; copy them through unchanged
    if ( !cigarToAlign.isEmpty() ) {
        for ( final CigarElement toAdd : cigarToAlign.getCigarElements() ) {
            cigarToReturn.add(toAdd);
        }
    }

    final Cigar result = AlignmentUtils.consolidateCigar(cigarToReturn);
    // sanity check: left alignment may move indels but must never change the reference span
    if ( result.getReferenceLength() != cigar.getReferenceLength() )
        throw new IllegalStateException("leftAlignCigarSequentially failed to produce a valid CIGAR.  Reference lengths differ.  Initial cigar " + cigar + " left aligned into " + result);
    return result;
}
/**
 * Take a haplotype which was generated by injecting an allele into a string of bases and run SW against the reference to determine the variants on the haplotype.
 * Unfortunately since this haplotype didn't come from the assembly graph you can't straightforwardly use the bubble traversal algorithm to get this information.
 * This is a target for future work as we rewrite the HaplotypeCaller to be more bubble-caller based.
 *
 * The method works in two passes. First the candidate is aligned against ref, and its bases are trimmed and/or
 * padded with reference bases so that they cover activeRegionStart..activeRegionStop. Then a new haplotype built
 * from those bases is aligned against ref again, and is added to haplotypeList only if that second alignment
 * spans exactly activeRegionStop - activeRegionStart reference bases.
 *
 * Note that the candidate passed in is modified: its alignment start is set from the first alignment, and its
 * cigar is set if that alignment passes the first filter. The object added to the list is a new haplotype,
 * not the candidate itself.
 *
 * @param haplotype the candidate haplotype; may be null, in which case nothing is added
 * @param ref the reference bases to align against
 * @param haplotypeList the current list of haplotypes; the new haplotype is appended to it on success
 * @param activeRegionStart the start of the active region in the reference byte array
 * @param activeRegionStop the stop of the active region in the reference byte array
 * @param FORCE_INCLUSION_FOR_GGA_MODE if true will include in the list even if it already exists
 * @return true if the candidate haplotype was successfully incorporated into the haplotype list
 */
@Requires({"ref != null", "ref.length >= activeRegionStop - activeRegionStart"})
private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final List<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) {
if( haplotype == null ) { return false; }
// first pass: align the raw candidate against the reference and record where it starts
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS );
haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
// reject alignments whose cigar text contains an "S", that span fewer than 60 reference bases, or that start at a negative offset
if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
return false;
}
haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) );
// translate the active region boundaries (reference coordinates) into positions within the haplotype bases
final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate(haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true);
int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true );
if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) {
hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal
}
byte[] newHaplotypeBases;
// extend partial haplotypes to contain the full active region sequence
if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
// neither boundary was located in the haplotype: pad with reference bases on both sides
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()),
haplotype.getBases()),
ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
} else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
// only the start was not located: pad on the left with reference bases, cut the haplotype at hapStop
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) );
} else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
// only the stop was not located: keep the haplotype from hapStart on, pad on the right with reference bases
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
} else {
// both boundaries were located: simply cut the haplotype down to hapStart..hapStop
newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop);
}
// second pass: build a new haplotype from the adjusted bases and align it against the reference again
final Haplotype h = new Haplotype( newHaplotypeBases );
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS );
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
// carry the artificial event over from the candidate to the new haplotype
if ( haplotype.isArtificialHaplotype() ) {
h.setArtificialEvent(haplotype.getArtificialEvent());
}
// this time the alignment must span exactly the active region length
if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart || swConsensus2.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
return false;
}
h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0, true) );
// add the new haplotype unless the list already contains it and inclusion is not being forced
if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) {
haplotypeList.add(h);
return true;
} else {
return false;
}
@Override
public String toString() {
    // renders the assembler's name followed by its minKmer setting in braces
    final StringBuilder description = new StringBuilder("DeBruijnAssembler{");
    description.append("minKmer=").append(minKmer);
    description.append('}');
    return description.toString();
}
}

View File

@ -68,6 +68,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
@ -135,10 +136,14 @@ import java.util.*;
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} )
@PartitionBy(PartitionType.LOCUS)
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
@ActiveRegionTraversalParameters(extension=200, maxRegion=300)
@ActiveRegionTraversalParameters(extension=100, maxRegion=300)
@ReadFilters({HCMappingQualityFilter.class})
@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250)
public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implements AnnotatorCompatible {
public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, Integer> implements AnnotatorCompatible, NanoSchedulable {
// -----------------------------------------------------------------------------------------------
// general haplotype caller arguments
// -----------------------------------------------------------------------------------------------
/**
* A raw, unfiltered, highly sensitive callset in VCF format.
*/
@ -185,64 +190,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false)
public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES;
/**
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
*/
@Advanced
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
@Hidden
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
protected String keepRG = null;
@Advanced
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 0;
@Advanced
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
protected int gcpHMM = 10;
@Advanced
@Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false)
protected int maxNumHaplotypesInPopulation = 25;
@Advanced
@Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false)
protected int minKmer = 11;
/**
* If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling
* when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the
* read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking
* the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads,
* and may make use of them in assembly and calling, where possible.
*/
@Hidden
@Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false)
protected boolean includeUnmappedReads = false;
@Advanced
@Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false)
protected boolean USE_ALLELES_TRIGGER = false;
@Advanced
@Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false)
protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false;
@Hidden
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
protected boolean justDetermineActiveRegions = false;
@Hidden
@Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false)
protected boolean dontGenotype = false;
@Hidden
@Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
protected boolean errorCorrectKmers = false;
/**
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
* dbSNP is not used in any way for the calculations themselves.
@ -282,10 +229,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"}));
@Advanced
@Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false)
protected boolean mergeVariantsViaLD = false;
/**
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
*/
@ -295,13 +238,139 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@ArgumentCollection
private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
// -----------------------------------------------------------------------------------------------
// arguments to control internal behavior of the debruijn assembler
// -----------------------------------------------------------------------------------------------
@Advanced
@Argument(fullName="useDebruijnAssembler", shortName="useDebruijnAssembler", doc="If specified, we will use the old DeBruijn assembler. Depreciated as of 2.6", required = false)
protected boolean useDebruijnAssembler = false;
@Advanced
@Argument(fullName="minKmerForDebruijnAssembler", shortName="minKmerForDebruijnAssembler", doc="Minimum kmer length to use in the debruijn assembly graph", required = false)
protected int minKmerForDebruijnAssembler = 11;
@Advanced
@Argument(fullName="onlyUseKmerSizeForDebruijnAssembler", shortName="onlyUseKmerSizeForDebruijnAssembler", doc="If specified, we will only build kmer graphs with this kmer size in the debruijn", required = false)
protected int onlyUseKmerSizeForDebruijnAssembler = -1;
// -----------------------------------------------------------------------------------------------
// arguments to control internal behavior of the read threading assembler
// -----------------------------------------------------------------------------------------------
@Advanced
@Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false)
protected List<Integer> kmerSizes = Arrays.asList(10, 25);
/**
* Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype
* considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the
* run of the haplotype caller we only take maxPathsPerSample * nSample paths from the graph, in order of their
* weights, no matter how many paths are possible to generate from the graph. Putting this number too low
* will result in dropping true variation because paths that include the real variant are not even considered.
*/
@Advanced
@Argument(fullName="maxPathsPerSample", shortName="maxPathsPerSample", doc="Max number of paths to consider for the read threading assembler per sample.", required = false)
protected int maxPathsPerSample = 10;
/**
* The minimum number of paths to advance forward for genotyping, regardless of the
* number of samples
*/
private final static int MIN_PATHS_PER_GRAPH = 128;
@Hidden
@Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false)
protected boolean dontRecoverDanglingTails = false;
// -----------------------------------------------------------------------------------------------
// general advanced arguments to control haplotype caller behavior
// -----------------------------------------------------------------------------------------------
@Advanced
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 2;
@Advanced
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
protected int gcpHMM = 10;
/**
* If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling
* when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the
* read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking
* the mate's contig and alignment start. If this flag is provided the haplotype caller will see such reads,
* and may make use of them in assembly and calling, where possible.
*/
@Hidden
@Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false)
protected boolean includeUnmappedReads = false;
@Advanced
@Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false)
protected boolean USE_ALLELES_TRIGGER = false;
@Advanced
@Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false)
protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false;
/**
* The phredScaledGlobalReadMismappingRate reflects the average global mismapping rate of all reads, regardless of their
* mapping quality. This term affects the probability that a read originated from the reference haplotype, regardless of
* its edit distance from the reference, in that the read could have originated from the reference haplotype but
* from another location in the genome. Suppose a read has many mismatches from the reference, say like 5, but
* has a very high mapping quality of 60. Without this parameter, the read would contribute 5 * Q30 evidence
* in favor of its 5 mismatch haplotype compared to reference, potentially enough to make a call off that single
* read for all of these events. With this parameter set to Q30, though, the maximum evidence against the reference
* that this (and any) read could contribute against reference is Q30.
*
* Set this term to any negative number to turn off the global mapping rate
*/
@Advanced
@Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false)
protected int phredScaledGlobalReadMismappingRate = 60;
@Advanced
@Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false)
protected int maxNumHaplotypesInPopulation = 25;
@Advanced
@Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false)
protected boolean mergeVariantsViaLD = false;
// -----------------------------------------------------------------------------------------------
// arguments for debugging / developing the haplotype caller
// -----------------------------------------------------------------------------------------------
/**
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
*/
@Hidden
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
@Hidden
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
protected String keepRG = null;
@Hidden
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
protected boolean justDetermineActiveRegions = false;
@Hidden
@Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false)
protected boolean dontGenotype = false;
@Hidden
@Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
protected boolean errorCorrectKmers = false;
@Advanced
@Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false)
protected boolean DEBUG;
@Advanced
@Hidden
@Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false)
protected int debugGraphTransformations = -1;
protected boolean debugGraphTransformations = false;
@Hidden // TODO -- not currently useful
@Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false)
@ -311,10 +380,17 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false)
protected boolean dontTrimActiveRegions = false;
@Hidden
@Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
protected boolean dontUseSoftClippedBases = false;
@Hidden
@Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false)
protected boolean allowCyclesInKmerGraphToGeneratePaths = false;
// -----------------------------------------------------------------------------------------------
// done with Haplotype caller parameters
// -----------------------------------------------------------------------------------------------
// the UG engines
private UnifiedGenotyperEngine UG_engine = null;
@ -344,12 +420,17 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
// the maximum extent into the full active region extension that we're willing to go in genotyping our events
private final static int MAX_GENOTYPING_ACTIVE_REGION_EXTENSION = 25;
private ActiveRegionTrimmer trimmer = null;
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
// bases with quality less than or equal to this value are trimmed off the tails of the reads
private static final byte MIN_TAIL_QUALITY = 20;
// the minimum length of a read we'd consider using for genotyping
private final static int MIN_READ_LENGTH = 10;
private List<String> samplesList = new ArrayList<String>();
private final static double LOG_ONE_HALF = -Math.log10(2.0);
private final static double LOG_ONE_THIRD = -Math.log10(3.0);
@ -373,6 +454,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
// get all of the unique sample names
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
samplesList.addAll( samples );
final int nSamples = samples.size();
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
@ -428,14 +510,36 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
}
// setup the assembler
assemblyEngine = new DeBruijnAssembler(DEBUG, debugGraphTransformations, minKmer, allowCyclesInKmerGraphToGeneratePaths);
// create and setup the assembler
final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
assemblyEngine = useDebruijnAssembler
? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
: new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes);
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
assemblyEngine.setDebug(DEBUG);
assemblyEngine.setDebugGraphTransformations(debugGraphTransformations);
assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths);
assemblyEngine.setRecoverDanglingTails(!dontRecoverDanglingTails);
if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter);
if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1);
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
// setup the likelihood calculation engine
if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1;
// configure the global mismapping rate
final double log10GlobalReadMismappingRate;
if ( phredScaledGlobalReadMismappingRate < 0 ) {
log10GlobalReadMismappingRate = - Double.MAX_VALUE;
} else {
log10GlobalReadMismappingRate = QualityUtils.qualToErrorProbLog10(phredScaledGlobalReadMismappingRate);
logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units");
}
// create our likelihood calculation engine
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate );
final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes();
@ -443,6 +547,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
if ( bamWriter != null )
haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader());
trimmer = new ActiveRegionTrimmer(DEBUG, PADDING_AROUND_SNPS_FOR_CALLING, PADDING_AROUND_OTHERS_FOR_CALLING,
MAX_GENOTYPING_ACTIVE_REGION_EXTENSION, getToolkit().getGenomeLocParser());
}
//---------------------------------------------------------------------------------------------------------------
@ -538,15 +645,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
//
//---------------------------------------------------------------------------------------------------------------
private final static List<VariantContext> NO_CALLS = Collections.emptyList();
@Override
public Integer map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) {
public List<VariantContext> map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) {
if ( justDetermineActiveRegions )
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
return 1;
return NO_CALLS;
if( !originalActiveRegion.isActive() ) { return 0; } // Not active so nothing to do!
if( !originalActiveRegion.isActive() ) { return NO_CALLS; } // Not active so nothing to do!
final List<VariantContext> activeAllelesToGenotype = new ArrayList<VariantContext>();
final List<VariantContext> activeAllelesToGenotype = new ArrayList<>();
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
for( final VariantContext vc : allelesToGenotype ) {
if( originalActiveRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) {
@ -555,23 +663,23 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
}
allelesToGenotype.removeAll( activeAllelesToGenotype );
// No alleles found in this region so nothing to do!
if ( activeAllelesToGenotype.isEmpty() ) { return 0; }
if ( activeAllelesToGenotype.isEmpty() ) { return NO_CALLS; }
} else {
if( originalActiveRegion.size() == 0 ) { return 0; } // No reads here so nothing to do!
if( originalActiveRegion.size() == 0 ) { return NO_CALLS; } // No reads here so nothing to do!
}
// run the local assembler, getting back a collection of information on how we should proceed
final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
// abort early if something is out of the acceptable range
if( assemblyResult.haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do!
if (dontGenotype) return 1; // user requested we not proceed
if( ! assemblyResult.isVariationPresent() ) { return NO_CALLS; } // only the reference haplotype remains so nothing else to do!
if (dontGenotype) return NO_CALLS; // user requested we not proceed
// filter out reads from genotyping which fail mapping quality based criteria
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping );
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
if( assemblyResult.regionForGenotyping.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
if( assemblyResult.regionForGenotyping.size() == 0 ) { return NO_CALLS; } // no reads remain after filtering so nothing else to do!
// evaluate each sample's reads against all haplotypes
//logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
@ -590,12 +698,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
getToolkit().getGenomeLocParser(),
activeAllelesToGenotype );
for( final VariantContext call : calledHaplotypes.getCalls() ) {
// TODO -- uncomment this line once ART-based walkers have a proper RefMetaDataTracker.
// annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call);
vcfWriter.add( call );
}
// TODO -- must disable if we are doing NCT, or set the output type of ! presorted
if ( bamWriter != null ) {
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc,
bestHaplotypes,
@ -605,7 +708,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
return 1; // One active region was processed during this map call
return calledHaplotypes.getCalls();
}
private final static class AssemblyResult {
@ -613,12 +716,18 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
final ActiveRegion regionForGenotyping;
final byte[] fullReferenceWithPadding;
final GenomeLoc paddedReferenceLoc;
final boolean variationPresent;
private AssemblyResult(List<Haplotype> haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc) {
private AssemblyResult(List<Haplotype> haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc, boolean variationPresent) {
this.haplotypes = haplotypes;
this.regionForGenotyping = regionForGenotyping;
this.fullReferenceWithPadding = fullReferenceWithPadding;
this.paddedReferenceLoc = paddedReferenceLoc;
this.variationPresent = variationPresent;
}
/**
 * Should the caller go on to genotype this assembly result?
 *
 * @return true only when the variationPresent flag was set at construction and the
 *         haplotype list holds more than one haplotype
 */
public boolean isVariationPresent() {
    if ( ! variationPresent ) {
        return false;
    }
    return haplotypes.size() > 1;
}
}
@ -644,63 +753,11 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
if ( ! dontTrimActiveRegions ) {
return trimActiveRegion(activeRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc);
} else {
// we don't want to or cannot create a trimmed active region, so go ahead and use the old one
return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc);
// we don't want to trim active regions, so go ahead and use the old one
return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
}
}
/**
* Trim down the active region to just enough to properly genotype the events among the haplotypes
*
* This function merely creates the region, but it doesn't populate the reads back into the region
*
* Returns null in two situations: when the haplotypes carry no variant contexts at all, and when
* none of those variant contexts overlaps region.getLocation().
*
* @param region our full active region
* @param haplotypes the list of haplotypes we've created from assembly
* @param ref the reference bases over the full padded location
* @param refLoc the span of the reference bases
* @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully
*/
private ActiveRegion createTrimmedRegion(final ActiveRegion region, final List<Haplotype> haplotypes, final byte[] ref, final GenomeLoc refLoc) {
EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG);
final TreeSet<VariantContext> allContexts = EventMap.getAllVariantContexts(haplotypes);
final GenomeLocParser parser = getToolkit().getGenomeLocParser();
if ( allContexts.isEmpty() ) // no variants, so there is nothing to trim around: signal this to the caller with null
return null;
// withinActiveRegion is only read by the DEBUG logging at the bottom of this method
final List<VariantContext> withinActiveRegion = new LinkedList<VariantContext>();
// start from the SNP padding; switched to the padding for other events as soon as one overlapping non-SNP is seen
int pad = PADDING_AROUND_SNPS_FOR_CALLING;
// accumulated span of every variant overlapping the region; stays null if nothing overlaps
GenomeLoc trimLoc = null;
for ( final VariantContext vc : allContexts ) {
final GenomeLoc vcLoc = parser.createGenomeLoc(vc);
if ( region.getLocation().overlapsP(vcLoc) ) {
if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding
pad = PADDING_AROUND_OTHERS_FOR_CALLING;
// NOTE(review): endpointSpan presumably yields the smallest span covering both locations -- confirm against GenomeLoc
trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc);
withinActiveRegion.add(vc);
}
}
// we don't actually have anything in the region after removing variants that don't overlap the region's full location
if ( trimLoc == null ) return null;
// maxSpan: the region's location padded by MAX_GENOTYPING_ACTIVE_REGION_EXTENSION
final GenomeLoc maxSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(region.getLocation(), MAX_GENOTYPING_ACTIVE_REGION_EXTENSION);
// idealSpan: the span of the overlapping variants padded by the selected padding
final GenomeLoc idealSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(trimLoc, pad);
// the region is trimmed to the intersection of the two spans above
final GenomeLoc finalSpan = maxSpan.intersect(idealSpan);
final ActiveRegion trimmedRegion = region.trim(finalSpan);
if ( DEBUG ) {
logger.info("events : " + withinActiveRegion);
logger.info("trimLoc : " + trimLoc);
logger.info("pad : " + pad);
logger.info("idealSpan : " + idealSpan);
logger.info("maxSpan : " + maxSpan);
logger.info("finalSpan : " + finalSpan);
logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size());
}
return trimmedRegion;
}
/**
* Trim down the active region to just enough to properly genotype the events among the haplotypes
*
@ -709,17 +766,24 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
* @param fullReferenceWithPadding the reference bases over the full padded location
* @param paddedReferenceLoc the span of the reference bases
* @return an AssemblyResult containing the trimmed active region with all of the reads we should use
* trimmed down as well, and a revised set of haplotypes. If trimming failed this function
* may choose to use the originalActiveRegion without modification
* trimmed down as well, and a revised set of haplotypes. If trimming down the active region results
* in only the reference haplotype over the non-extended active region, returns an AssemblyResult
* over the original active region whose variationPresent flag is false.
*/
private AssemblyResult trimActiveRegion(final ActiveRegion originalActiveRegion,
final List<Haplotype> haplotypes,
final byte[] fullReferenceWithPadding,
final GenomeLoc paddedReferenceLoc) {
final ActiveRegion trimmedActiveRegion = createTrimmedRegion(originalActiveRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc);
if ( DEBUG ) logger.info("Trimming active region " + originalActiveRegion + " with " + haplotypes.size() + " haplotypes");
if ( trimmedActiveRegion == null )
return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc);
EventMap.buildEventMapsForHaplotypes(haplotypes, fullReferenceWithPadding, paddedReferenceLoc, DEBUG);
final TreeSet<VariantContext> allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypes);
final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalActiveRegion, allVariantsWithinFullActiveRegion);
if ( trimmedActiveRegion == null ) {
// there were no variants found within the active region itself, so return the untrimmed result flagged as having no variation
if ( DEBUG ) logger.info("No variation found within the active region, skipping the region :-)");
return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, false);
}
// trim down the haplotypes
final Set<Haplotype> haplotypeSet = new HashSet<Haplotype>(haplotypes.size());
@ -738,8 +802,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() );
if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " + trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
if ( DEBUG ) {
logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
for ( final Haplotype remaining: trimmedHaplotypes ) {
logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar());
}
@ -757,7 +821,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
trimmedActiveRegion.clearReads();
trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads));
return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc);
return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
}
/**
@ -787,8 +851,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
}
@Override
public Integer reduce(Integer cur, Integer sum) {
return cur + sum;
public Integer reduce(List<VariantContext> callsInRegion, Integer numCalledRegions) {
for( final VariantContext call : callsInRegion ) {
// TODO -- uncomment this line once ART-based walkers have a proper RefMetaDataTracker.
// annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call);
vcfWriter.add( call );
}
return (callsInRegion.isEmpty() ? 0 : 1) + numCalledRegions;
}
@Override
@ -804,7 +873,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
final List<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>();
final List<GATKSAMRecord> finalizedReadList = new ArrayList<>();
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( activeRegion.getReads() );
activeRegion.clearReads();
@ -815,21 +884,23 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
}
// Loop through the reads hard clipping the adaptor and low quality tails
final List<GATKSAMRecord> readsToUse = new ArrayList<GATKSAMRecord>(finalizedReadList.size());
final List<GATKSAMRecord> readsToUse = new ArrayList<>(finalizedReadList.size());
for( final GATKSAMRecord myRead : finalizedReadList ) {
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
GATKSAMRecord clippedRead = useLowQualityBasesForAssembly ? postAdapterRead : ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
// TODO -- reference haplotype start must be removed
clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
// uncomment to remove hard clips from consideration at all
//clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
if ( dontUseSoftClippedBases ) {
// hard clip the soft-clipped bases so that they are removed from consideration entirely
clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
} else {
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
// TODO -- reference haplotype start must be removed
clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
}
clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() );
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
@ -843,13 +914,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
}
private List<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>();
final List<GATKSAMRecord> readsToRemove = new ArrayList<>();
// logger.info("Filtering non-passing regions: n incoming " + activeRegion.getReads().size());
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
if( rec.getReadLength() < 10 || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
readsToRemove.add(rec);
// logger.info("\tremoving read " + rec + " len " + rec.getReadLength());
}
}
activeRegion.removeAll( readsToRemove );
// logger.info("Filtered non-passing regions: n remaining " + activeRegion.getReads().size());
return readsToRemove;
}
@ -864,7 +938,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
for( final String sample : samplesList) {
List<GATKSAMRecord> readList = returnMap.get( sample );
if( readList == null ) {
readList = new ArrayList<GATKSAMRecord>();
readList = new ArrayList<>();
returnMap.put(sample, readList);
}
}

View File

@ -46,9 +46,7 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.*;
/**
* generic utility class that counts kmers
@ -97,6 +95,20 @@ public class KMerCounter {
return countsByKMer.values();
}
/**
 * Get kmers that have minCount or greater in this counter
 *
 * @param minCount only return kmers with count >= this value
 * @return a non-null (possibly empty) collection of kmers, in the iteration order of getCountedKmers()
 */
public Collection<Kmer> getKmersWithCountsAtLeast(final int minCount) {
    // the result is only ever appended to and then handed back, so an ArrayList is the
    // cheaper choice over a LinkedList (no per-element node allocation)
    final List<Kmer> result = new ArrayList<Kmer>();
    for ( final CountedKmer countedKmer : getCountedKmers() ) {
        if ( countedKmer.count >= minCount ) {
            result.add(countedKmer.kmer);
        }
    }
    return result;
}
/**
* Remove all current counts, resetting the counter to an empty state
*/

View File

@ -149,6 +149,14 @@ public class Kmer {
return bases;
}
/**
 * Get a string representation of the bases of this kmer
 *
 * The bytes are decoded explicitly as US-ASCII so that the result does not depend on the
 * default charset of the JVM the tool happens to run on.
 *
 * @return a non-null string
 */
public String baseString() {
    return new String(bases(), java.nio.charset.StandardCharsets.US_ASCII);
}
/**
* The length of this kmer
* @return an integer >= 0

View File

@ -69,35 +69,54 @@ public class LikelihoodCalculationEngine {
private static final double LOG_ONE_HALF = -Math.log10(2.0);
private final byte constantGCP;
private final double log10globalReadMismappingRate;
private final boolean DEBUG;
private final PairHMM pairHMM;
private final int minReadLength = 20;
private final PairHMM.HMM_IMPLEMENTATION hmmType;
/**
 * The PairHMM used for the likelihood calculations, one independent instance per thread.
 *
 * The instance is built from hmmType the first time a thread calls pairHMM.get(), so an
 * unsupported hmmType is reported at that point rather than when the engine is constructed.
 */
private final ThreadLocal<PairHMM> pairHMM = new ThreadLocal<PairHMM>() {
    @Override
    protected PairHMM initialValue() {
        switch (hmmType) {
            case EXACT: return new Log10PairHMM(true);
            case ORIGINAL: return new Log10PairHMM(false);
            case LOGLESS_CACHING: return new LoglessPairHMM();
            default:
                // only the three implementations handled above are accepted, so only those are listed
                throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, and LOGLESS_CACHING.");
        }
    }
};
/**
* The expected rate of random sequencing errors for a read originating from its true haplotype.
*
* For example, if this is 0.01, then we'd expect 1 error per 100 bp.
*/
private final double EXPECTED_ERROR_RATE_PER_BASE = 0.02;
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
switch (hmmType) {
case EXACT:
pairHMM = new Log10PairHMM(true);
break;
case ORIGINAL:
pairHMM = new Log10PairHMM(false);
break;
case LOGLESS_CACHING:
pairHMM = new LoglessPairHMM();
break;
default:
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
}
private final static double EXPECTED_ERROR_RATE_PER_BASE = 0.02;
/**
* Create a new LikelihoodCalculationEngine using provided parameters and hmm to do its calculations
*
* @param constantGCP the gap continuation penalty to use with the PairHMM
* @param debug should we emit debugging information during the calculation?
* @param hmmType the type of the HMM to use
* @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of
* -3 means that the chance that a read doesn't actually belong at this
* location in the genome is 1 in 1000. The effect of this parameter is
* to cap the maximum likelihood difference between the reference haplotype
* and the best alternative haplotype by -3 log units. So if the best
* haplotype is at -10 and this parameter has a value of -3 then even if the
* reference haplotype gets a score of -100 from the pairhmm it will be
* assigned a likelihood of -13.
*/
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate ) {
this.hmmType = hmmType;
this.constantGCP = constantGCP;
DEBUG = debug;
this.DEBUG = debug;
this.log10globalReadMismappingRate = log10globalReadMismappingRate;
}
public LikelihoodCalculationEngine() {
this((byte)10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3);
}
/**
@ -124,7 +143,7 @@ public class LikelihoodCalculationEngine {
}
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
pairHMM.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
}
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
@ -132,9 +151,8 @@ public class LikelihoodCalculationEngine {
initializePairHMM(haplotypes, perSampleReadList);
// Add likelihoods for each sample's reads to our stratifiedReadMap
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new LinkedHashMap<>();
for( final Map.Entry<String, List<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
// evaluate the likelihood of the reads given those haplotypes
final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue());
@ -152,17 +170,16 @@ public class LikelihoodCalculationEngine {
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List<Haplotype> haplotypes, final List<GATKSAMRecord> reads) {
// first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time)
final int numHaplotypes = haplotypes.size();
final Map<Haplotype, Allele> alleleVersions = new HashMap<Haplotype, Allele>(numHaplotypes);
final Map<Haplotype, Allele> alleleVersions = new LinkedHashMap<>(numHaplotypes);
Allele refAllele = null;
for ( final Haplotype haplotype : haplotypes ) {
alleleVersions.put(haplotype, Allele.create(haplotype, true));
final Allele allele = Allele.create(haplotype, true);
alleleVersions.put(haplotype, allele);
if ( haplotype.isReference() ) refAllele = allele;
}
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
for( final GATKSAMRecord read : reads ) {
if ( read.getReadLength() < minReadLength )
// don't consider any reads that have a read length < the minimum
continue;
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
@ -177,14 +194,34 @@ public class LikelihoodCalculationEngine {
readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
}
// keep track of the reference likelihood and the best non-ref likelihood
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
// iterate over all haplotypes, calculating the likelihood of the read for each haplotype
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj);
final boolean isFirstHaplotype = jjj == 0;
final double log10l = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else
refLog10l = log10l;
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
}
// ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
// mismapping rate. This protects us from the case where the assembly has produced haplotypes
// that are very divergent from reference, but are supported by only one read. In effect
// we are capping how badly the reference can score for any read by the chance that the read
// itself just doesn't belong here
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
}
}
return perReadAlleleLikelihoodMap;

View File

@ -46,28 +46,388 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.List;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* Abstract base class for all HaplotypeCaller assemblers
*
* User: ebanks
* Date: Mar 14, 2011
*/
public abstract class LocalAssemblyEngine {
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
private final static Logger logger = Logger.getLogger(LocalAssemblyEngine.class);
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30;
protected final int numBestHaplotypesPerGraph;
protected boolean debug = false;
protected boolean allowCyclesInKmerGraphToGeneratePaths = false;
protected boolean debugGraphTransformations = false;
protected boolean recoverDanglingTails = true;
protected PrintStream graphWriter = null;
protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE;
protected int pruneFactor = 2;
protected boolean errorCorrectKmers = false;
protected LocalAssemblyEngine() { }
private PrintStream graphWriter = null;
/**
* Create a new LocalAssemblyEngine with all default parameters, ready for use
* @param numBestHaplotypesPerGraph the number of haplotypes to generate for each assembled graph
*/
protected LocalAssemblyEngine(final int numBestHaplotypesPerGraph) {
if ( numBestHaplotypesPerGraph < 1 ) throw new IllegalArgumentException("numBestHaplotypesPerGraph should be >= 1 but got " + numBestHaplotypesPerGraph);
this.numBestHaplotypesPerGraph = numBestHaplotypesPerGraph;
}
/**
* Main subclass function: given reads and a reference haplotype give us graphs to use for constructing
* non-reference haplotypes.
*
* @param reads the reads we're going to assemble
* @param refHaplotype the reference haplotype
* @return a non-null list of the graphs built from the reads
*/
protected abstract List<SeqGraph> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype);
/**
 * Main entry point into the assembly engine. Asks the concrete assembler to build its graphs out of the
 * provided reference sequence and list of reads, then extracts the best paths from them as haplotypes
 *
 * @param activeRegion              ActiveRegion object holding the reads which are to be used during assembly
 * @param refHaplotype              reference haplotype object
 * @param fullReferenceWithPadding  byte array holding the reference sequence with padding
 * @param refLoc                    GenomeLoc object corresponding to the reference sequence with padding
 * @param activeAllelesToGenotype   the alleles to inject into the haplotypes during GGA mode
 * @return a non-empty list of all the haplotypes that are produced during assembly
 * @throws IllegalArgumentException if activeRegion, refHaplotype, fullReferenceWithPadding or refLoc is null,
 *                                  if the reference bases and refLoc differ in size, or if the pruning factor is negative
 */
public List<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List<VariantContext> activeAllelesToGenotype) {
    if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); }
    if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); }
    // both values are dereferenced by the size check just below, so reject null explicitly instead of failing with an NPE
    if( fullReferenceWithPadding == null ) { throw new IllegalArgumentException("Reference bases cannot be null."); }
    if( refLoc == null ) { throw new IllegalArgumentException("Reference loc cannot be null."); }
    if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
    if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }

    // create the graphs by calling our subclass assemble method
    final List<SeqGraph> graphs = assemble(activeRegion.getReads(), refHaplotype);

    // do some QC on the graphs
    for ( final SeqGraph graph : graphs ) { sanityCheckGraph(graph, refHaplotype); }

    // print the graphs if the appropriate debug option has been turned on
    if ( graphWriter != null ) { printGraphs(graphs); }

    // find the best paths in the graphs and return them as haplotypes
    return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
}
@Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
@Ensures({"result.contains(refHaplotype)"})
protected List<Haplotype> findBestPaths(final List<SeqGraph> graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow) {
    // Overview: seed the result with the reference haplotype, optionally inject the GGA alleles into it,
    // then walk the K best source->sink paths of every graph and keep each path whose cigar against the
    // reference spans exactly the same reference length as the reference haplotype itself.

    // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
    // (a LinkedHashSet keeps insertion order, so the reference haplotype is always first in the returned list)
    final Set<Haplotype> returnHaplotypes = new LinkedHashSet<Haplotype>();
    refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
    final Cigar c = new Cigar();
    c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
    refHaplotype.setCigar(c);
    returnHaplotypes.add( refHaplotype );

    // offsets of the active region within refWithPadding
    final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
    final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();

    // for GGA mode, add the desired allele into the haplotype
    for( final VariantContext compVC : activeAllelesToGenotype ) {
        for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
            final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
            addHaplotypeForGGA( insertedRefHaplotype, refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true );
        }
    }

    for( final SeqGraph graph : graphs ) {
        final SeqVertex source = graph.getReferenceSourceVertex();
        final SeqVertex sink = graph.getReferenceSinkVertex();
        if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);

        final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<SeqVertex,BaseEdge>(allowCyclesInKmerGraphToGeneratePaths);
        for ( final Path<SeqVertex,BaseEdge> path : pathFinder.getKBestPaths(graph, numBestHaplotypesPerGraph, source, sink) ) {
            final Haplotype h = new Haplotype( path.getBases() );
            if( !returnHaplotypes.contains(h) ) {
                final Cigar cigar = path.calculateCigar(refHaplotype.getBases());

                if ( cigar == null ) {
                    // couldn't produce a meaningful alignment of haplotype to reference, fail quietly
                    continue;
                } else if( cigar.isEmpty() ) {
                    throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() +
                            " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength());
                } else if ( pathIsTooDivergentFromReference(cigar) || cigar.getReferenceLength() < MIN_HAPLOTYPE_REFERENCE_LENGTH ) {
                    // N cigar elements means that a bubble was too divergent from the reference so skip over this path
                    continue;
                } else if( cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // SW failure
                    throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length "
                            + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength()
                            + " ref = " + refHaplotype + " path " + new String(path.getBases()));
                }

                h.setCigar(cigar);
                h.setAlignmentStartHapwrtRef(activeRegionStart);
                h.setScore(path.getScore());
                returnHaplotypes.add(h);

                if ( debug )
                    logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());

                // for GGA mode, add the desired allele into the haplotype if it isn't already present
                if( !activeAllelesToGenotype.isEmpty() ) {
                    final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
                    for( final VariantContext compVC : activeAllelesToGenotype ) {
                        final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());

                        // This if statement used to additionally have:
                        //      "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
                        //  but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
                        //  a haplotype that already contains a 1bp insertion (so practically it is reference but
                        //  falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
                        if( vcOnHaplotype == null ) {
                            for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
                                addHaplotypeForGGA( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false );
                            }
                        }
                    }
                }
            }
        }
    }

    // add genome locs to the haplotypes
    for ( final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow);

    if( debug ) {
        if( returnHaplotypes.size() > 1 ) {
            logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
        } else {
            logger.info("Found only the reference haplotype in the assembly graph.");
        }
        for( final Haplotype h : returnHaplotypes ) {
            logger.info( h.toString() );
            logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() + " ref " + h.isReference());
        }
    }

    // hand back a list copy so callers get positional access; order is the set's insertion order
    return new ArrayList<Haplotype>(returnHaplotypes);
}
/**
 * Decide whether a path should be dropped because its cigar carries the "too divergent" marker.
 *
 * CigarOperator.N is used as the signal that an incomplete or too divergent bubble was
 * encountered during bubble traversal, so any N element disqualifies the path.
 *
 * @param c the cigar to inspect, must not be null
 * @return true if at least one element of c has operator N (the path should be skipped), false otherwise
 */
@Requires("c != null")
private boolean pathIsTooDivergentFromReference( final Cigar c ) {
    for( final CigarElement element : c.getCigarElements() ) {
        final CigarOperator operator = element.getOperator();
        if( operator.equals(CigarOperator.N) ) {
            return true;
        }
    }
    // no N element anywhere in the cigar
    return false;
}
/**
 * Take a haplotype which was generated by injecting an allele into a string of bases and run SW against the reference to determine the variants on the haplotype.
 * Unfortunately since this haplotype didn't come from the assembly graph you can't straightforwardly use the bubble traversal algorithm to get this information.
 * This is a target for future work as we rewrite the HaplotypeCaller to be more bubble-caller based.
 *
 * The method works in two passes: the candidate is aligned to ref, trimmed or padded with reference bases so that it
 * covers the interval [activeRegionStart, activeRegionStop), and the resulting bases are then aligned a second time.
 * Only a haplotype whose second alignment spans exactly activeRegionStop - activeRegionStart reference bases is kept.
 *
 * Note that the alignment start of the incoming haplotype object is overwritten (and, if the first alignment is
 * accepted, so is its cigar) even when the method ends up returning false.
 *
 * @param haplotype the candidate haplotype; may be null, in which case nothing is done and false is returned
 * @param ref the reference bases to align against
 * @param haplotypeList the current list of haplotypes
 * @param activeRegionStart the start of the active region in the reference byte array
 * @param activeRegionStop the stop of the active region in the reference byte array
 * @param FORCE_INCLUSION_FOR_GGA_MODE if true will include in the list even if it already exists
 * @return true if the candidate haplotype was successfully incorporated into the haplotype list
 */
@Requires({"ref != null", "ref.length >= activeRegionStop - activeRegionStart"})
private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final Set<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) {
if( haplotype == null ) { return false; }
// first pass: align the raw candidate bases against the reference
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS );
haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
// reject alignments whose cigar string contains an 'S' (soft clip in SAM cigar notation), that cover fewer than
// 60 reference bases, or that start before the beginning of ref
if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
return false;
}
haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) );
// translate the active region boundaries into offsets within the haplotype bases; CLIPPING_GOAL_NOT_REACHED
// is the value compared against below to detect that a boundary could not be located on the haplotype
final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate(haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true);
int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true );
if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) {
hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal
}
byte[] newHaplotypeBases;
// extend partial haplotypes to contain the full active region sequence
if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
// neither boundary found: reference prefix + all haplotype bases + reference suffix
newHaplotypeBases = ArrayUtils.addAll(ArrayUtils.addAll(ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()),
haplotype.getBases()),
ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop));
} else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
// only the start is missing: reference prefix + haplotype bases up to hapStop
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) );
} else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
// only the stop is missing: haplotype bases from hapStart + reference suffix
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
} else {
// both boundaries found: simply cut the haplotype down to the active region
newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop);
}
// second pass: re-align the trimmed/padded bases and build the haplotype that is actually stored
final Haplotype h = new Haplotype( newHaplotypeBases );
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS );
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
// carry the artificial (injected) event over from the candidate to the rebuilt haplotype
if ( haplotype.isArtificialHaplotype() ) {
h.setArtificialEvent(haplotype.getArtificialEvent());
}
// stricter than the first check: the alignment must cover exactly the active region length
if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart || swConsensus2.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
return false;
}
h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0, true) );
// NOTE(review): haplotypeList is a Set, so in the forced case add() may be a no-op when an equal haplotype is
// already present, yet true is still returned -- confirm this is intended against Haplotype.equals
if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) {
haplotypeList.add(h);
return true;
} else {
return false;
}
}
/**
 * Zip, prune and simplify a sequence graph in place so that paths can be enumerated from it.
 *
 * When debugGraphTransformations is set, the graph is written to a numbered "sequenceGraph.N.*.dot"
 * file after each stage.
 *
 * @param seqGraph the graph to clean up; it is modified by this call
 * @return the same seqGraph instance after cleanup, or null if after simplification the reference
 *         source and/or sink vertex can no longer be identified
 */
protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) {
    if ( debugGraphTransformations ) {
        seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
    }

    // TODO -- a consistent pruning algorithm is still needed.  The current one works well, but it
    // TODO -- cannot tell an isolated chain that connects to nothing apart from a chain that has good
    // TODO -- support along its length and merely contains one connection with weight < pruneFactor.
    // TODO -- Ultimately pruning should be an error correction algorithm that knows more about the
    // TODO -- structure of the data and can distinguish an infrequent path with no evidence against it
    // TODO -- (such as occurs when a region is hard to get any reads through) from an error with lots
    // TODO -- of weight going along another similar path

    // zipping has to come first, otherwise pruneGraph would be too aggressive
    seqGraph.zipLinearChains();
    if ( debugGraphTransformations ) {
        seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor);
    }

    // prune, then drop the vertices that are no longer connected to the reference chain.
    // IMPORTANT: this must happen before simplifyGraph, because simplifyGraph adds 0 weight
    // edges to maintain graph connectivity.
    seqGraph.pruneGraph(pruneFactor);
    seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
    if ( debugGraphTransformations ) {
        seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor);
    }

    seqGraph.simplifyGraph();
    if ( debugGraphTransformations ) {
        seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor);
    }

    // A degenerate graph has no identifiable reference source and/or sink.  This can happen when,
    // for example, the reference somehow acquires a cycle, or when the entire assembly collapses
    // back into the reference sequence.
    final boolean referenceEndsFound = seqGraph.getReferenceSourceVertex() != null && seqGraph.getReferenceSinkVertex() != null;
    if ( !referenceEndsFound ) {
        return null;
    }

    seqGraph.removePathsNotConnectedToRef();
    seqGraph.simplifyGraph();

    if ( seqGraph.vertexSet().size() == 1 ) {
        // everything assembled into a single reference haplotype; append an empty seq vertex
        // behind it to stop the code from blowing up.
        // TODO -- ref properties should really be on the vertices, not the graph itself
        final SeqVertex onlyVertex = seqGraph.vertexSet().iterator().next();
        final SeqVertex emptyTail = new SeqVertex("");
        seqGraph.addVertex(emptyTail);
        seqGraph.addEdge(onlyVertex, emptyTail, new BaseEdge(true, 0));
    }

    if ( debugGraphTransformations ) {
        seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor);
    }
    return seqGraph;
}
/**
 * Perform general QC on the graph to make sure something hasn't gone wrong during assembly.
 *
 * Currently this only delegates to sanityCheckReferenceGraph.
 *
 * @param graph the graph to check
 * @param refHaplotype the reference haplotype
 * @param <T> the vertex type of the graph
 * @param <E> the edge type of the graph
 */
private <T extends BaseVertex, E extends BaseEdge> void sanityCheckGraph(final BaseGraph<T,E> graph, final Haplotype refHaplotype) {
sanityCheckReferenceGraph(graph, refHaplotype);
}
/**
 * Verify that the reference sequence is properly represented in the provided graph.
 *
 * The graph must expose both a reference source and a reference sink vertex, and the bases read
 * along the reference path between them (both ends included) must equal the bases of refHaplotype.
 *
 * @param graph the graph to check
 * @param refHaplotype the reference haplotype
 * @param <T> the vertex type of the graph
 * @param <E> the edge type of the graph
 * @throws IllegalStateException if the source or sink vertex is missing, or if the reference path bases differ from the haplotype bases
 */
private <T extends BaseVertex, E extends BaseEdge> void sanityCheckReferenceGraph(final BaseGraph<T,E> graph, final Haplotype refHaplotype) {
    final T refSource = graph.getReferenceSourceVertex();
    if( refSource == null ) {
        throw new IllegalStateException("All reference graphs must have a reference source vertex.");
    }

    final T refSink = graph.getReferenceSinkVertex();
    if( refSink == null ) {
        throw new IllegalStateException("All reference graphs must have a reference sink vertex.");
    }

    final byte[] graphRefBases = graph.getReferenceBytes(refSource, refSink, true, true);
    final boolean basesAgree = Arrays.equals(graphRefBases, refHaplotype.getBases());
    if( !basesAgree ) {
        throw new IllegalStateException("Mismatch between the reference haplotype and the reference assembly graph path. for graph " + graph +
                " graph = " + new String(graphRefBases) +
                " haplotype = " + new String(refHaplotype.getBases())
        );
    }
}
/**
 * Write the generated graphs to the graphWriter, wrapped in a single DOT "digraph" block.
 *
 * When debugGraphTransformations is set, graphs with a kmer size of 50 or more are skipped (with a
 * log message) and writing stops after the first graph that is actually printed; otherwise every
 * graph is printed.
 *
 * @param graphs a non-null list of graphs to print out
 */
private void printGraphs(final List<SeqGraph> graphs) {
    final int kmerSizeCutoff = 50;

    graphWriter.println("digraph assemblyGraphs {");
    for( final SeqGraph current : graphs ) {
        final boolean skipInDebugMode = debugGraphTransformations && current.getKmerSize() >= kmerSizeCutoff;
        if ( skipInDebugMode ) {
            logger.info("Skipping writing of graph with kmersize " + current.getKmerSize());
        } else {
            current.printGraph(graphWriter, false, pruneFactor);
            if ( debugGraphTransformations ) {
                // in debug mode only the first graph below the cutoff is written
                break;
            }
        }
    }
    graphWriter.println("}");
}
// -----------------------------------------------------------------------------------------------
//
// getter / setter routines for generic assembler properties
//
// -----------------------------------------------------------------------------------------------
public int getPruneFactor() {
return pruneFactor;
@ -85,10 +445,6 @@ public abstract class LocalAssemblyEngine {
this.errorCorrectKmers = errorCorrectKmers;
}
public PrintStream getGraphWriter() {
return graphWriter;
}
public void setGraphWriter(PrintStream graphWriter) {
this.graphWriter = graphWriter;
}
@ -101,5 +457,35 @@ public abstract class LocalAssemblyEngine {
this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
}
public abstract List<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List<VariantContext> activeAllelesToGenotype);
/**
 * @return the current value of the debug flag
 */
public boolean isDebug() {
return debug;
}
/**
 * Set the debug flag.
 * @param debug the new value of the debug flag
 */
public void setDebug(boolean debug) {
this.debug = debug;
}
/**
 * @return the current value of the allowCyclesInKmerGraphToGeneratePaths flag
 */
public boolean isAllowCyclesInKmerGraphToGeneratePaths() {
return allowCyclesInKmerGraphToGeneratePaths;
}
/**
 * Set the allowCyclesInKmerGraphToGeneratePaths flag.
 * @param allowCyclesInKmerGraphToGeneratePaths the new value of the flag
 */
public void setAllowCyclesInKmerGraphToGeneratePaths(boolean allowCyclesInKmerGraphToGeneratePaths) {
this.allowCyclesInKmerGraphToGeneratePaths = allowCyclesInKmerGraphToGeneratePaths;
}
/**
 * @return the current value of the debugGraphTransformations flag
 */
public boolean isDebugGraphTransformations() {
return debugGraphTransformations;
}
/**
 * Set the debugGraphTransformations flag.
 * @param debugGraphTransformations the new value of the flag
 */
public void setDebugGraphTransformations(boolean debugGraphTransformations) {
this.debugGraphTransformations = debugGraphTransformations;
}
/**
 * @return the current value of the recoverDanglingTails flag
 */
public boolean isRecoverDanglingTails() {
return recoverDanglingTails;
}
/**
 * Set the recoverDanglingTails flag.
 * NOTE(review): presumably controls whether dangling tails in the assembly graph are recovered -- confirm against the code that reads the flag
 * @param recoverDanglingTails the new value of the flag
 */
public void setRecoverDanglingTails(boolean recoverDanglingTails) {
this.recoverDanglingTails = recoverDanglingTails;
}
}

View File

@ -76,12 +76,10 @@ public class BaseEdge {
}
/**
* Copy constructor
*
* @param toCopy
* Create a new copy of this BaseEdge
*/
public BaseEdge(final BaseEdge toCopy) {
this(toCopy.isRef(), toCopy.getMultiplicity());
public BaseEdge copy() {
return new BaseEdge(isRef(), getMultiplicity());
}
/**
@ -92,6 +90,34 @@ public class BaseEdge {
return multiplicity;
}
/**
 * Build the label shown for this edge when it is written to a DOT file.
 *
 * The label is the decimal representation of this edge's multiplicity.
 *
 * @return a non-null string
 */
public String getDotLabel() {
    final int weight = getMultiplicity();
    return String.valueOf(weight);
}
/**
 * Add incr to the multiplicity of this edge.
 *
 * @param incr the amount to add to the multiplicity, must be >= 0
 * @throws IllegalArgumentException if incr is negative
 */
public void incMultiplicity(final int incr) {
    final boolean negativeIncrement = incr < 0;
    if ( negativeIncrement ) {
        throw new IllegalArgumentException("incr must be >= 0 but got " + incr);
    }
    multiplicity += incr;
}
/**
 * A special accessor that returns the multiplicity that should be used by the pruning algorithm
 *
 * This base implementation simply returns getMultiplicity(); it can be overridden by subclasses
 *
 * @return the multiplicity value that should be used for pruning
 */
public int getPruningMultiplicity() {
return getMultiplicity();
}
/**
* Set the multiplicity of this edge to value
* @param value an integer >= 0
@ -117,23 +143,6 @@ public class BaseEdge {
this.isRef = isRef;
}
/**
* Does this and edge have the same source and target vertices in graph?
*
* @param graph the graph containing both this and edge
* @param edge our comparator edge
* @param <T>
* @return true if we have the same source and target vertices
*/
public <T extends BaseVertex> boolean hasSameSourceAndTarget(final BaseGraph<T> graph, final BaseEdge edge) {
return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge)));
}
// For use when comparing edges across graphs!
public <T extends BaseVertex> boolean seqEquals( final BaseGraph<T> graph, final BaseEdge edge, final BaseGraph<T> graph2 ) {
return (graph.getEdgeSource(this).seqEquals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).seqEquals(graph2.getEdgeTarget(edge)));
}
/**
* Sorts a collection of BaseEdges in decreasing order of weight, so that the most
* heavily weighted is at the start of the list
@ -187,4 +196,12 @@ public class BaseEdge {
if ( edge == null ) throw new IllegalArgumentException("edge cannot be null");
return new BaseEdge(isRef() || edge.isRef(), Math.max(getMultiplicity(), edge.getMultiplicity()));
}
/**
 * Describe this edge by its multiplicity and reference flag.
 *
 * @return a string of the form BaseEdge{multiplicity=N, isRef=B}
 */
@Override
public String toString() {
    final StringBuilder text = new StringBuilder("BaseEdge{");
    text.append("multiplicity=").append(multiplicity);
    text.append(", isRef=").append(isRef);
    text.append('}');
    return text.toString();
}
}

View File

@ -66,34 +66,16 @@ import java.util.*;
* Date: 2/6/13
*/
@Invariant("!this.isAllowingMultipleEdges()")
public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, BaseEdge> {
public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends DefaultDirectedGraph<V, E> {
protected final static Logger logger = Logger.getLogger(BaseGraph.class);
private final int kmerSize;
/**
* Construct an empty BaseGraph
*/
public BaseGraph() {
this(11);
}
/**
* Edge factory that creates non-reference multiplicity 1 edges
* @param <T> the new of our vertices
*/
private static class MyEdgeFactory<T extends BaseVertex> implements EdgeFactory<T, BaseEdge> {
@Override
public BaseEdge createEdge(T sourceVertex, T targetVertex) {
return new BaseEdge(false, 1);
}
}
/**
* Construct a DeBruijnGraph with kmerSize
* @param kmerSize
*/
public BaseGraph(final int kmerSize) {
super(new MyEdgeFactory<T>());
public BaseGraph(final int kmerSize, final EdgeFactory<V,E> edgeFactory) {
super(edgeFactory);
if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize);
this.kmerSize = kmerSize;
@ -111,7 +93,7 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param v the vertex to test
* @return true if this vertex is a reference node (meaning that it appears on the reference path in the graph)
*/
public boolean isReferenceNode( final T v ) {
public boolean isReferenceNode( final V v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
for( final BaseEdge e : edgesOf(v) ) {
if( e.isRef() ) { return true; }
@ -123,7 +105,7 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param v the vertex to test
* @return true if this vertex is a source node (in degree == 0)
*/
public boolean isSource( final T v ) {
public boolean isSource( final V v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
return inDegreeOf(v) == 0;
}
@ -132,7 +114,7 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param v the vertex to test
* @return true if this vertex is a sink node (out degree == 0)
*/
public boolean isSink( final T v ) {
public boolean isSink( final V v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
return outDegreeOf(v) == 0;
}
@ -141,9 +123,9 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* Get the set of source vertices of this graph
* @return a non-null set
*/
public Set<T> getSources() {
final Set<T> set = new LinkedHashSet<T>();
for ( final T v : vertexSet() )
public Set<V> getSources() {
final Set<V> set = new LinkedHashSet<V>();
for ( final V v : vertexSet() )
if ( isSource(v) )
set.add(v);
return set;
@ -153,9 +135,9 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* Get the set of sink vertices of this graph
* @return a non-null set
*/
public Set<T> getSinks() {
final Set<T> set = new LinkedHashSet<T>();
for ( final T v : vertexSet() )
public Set<V> getSinks() {
final Set<V> set = new LinkedHashSet<V>();
for ( final V v : vertexSet() )
if ( isSink(v) )
set.add(v);
return set;
@ -167,7 +149,7 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @return non-null byte array
*/
@Ensures({"result != null"})
public byte[] getAdditionalSequence( final T v ) {
public byte[] getAdditionalSequence( final V v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to pull sequence from a null vertex."); }
return v.getAdditionalSequence(isSource(v));
}
@ -176,9 +158,9 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param e the edge to test
* @return true if this edge is a reference source edge
*/
public boolean isRefSource( final BaseEdge e ) {
public boolean isRefSource( final E e ) {
if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
for( final BaseEdge edgeToTest : incomingEdgesOf(getEdgeSource(e)) ) {
for( final E edgeToTest : incomingEdgesOf(getEdgeSource(e)) ) {
if( edgeToTest.isRef() ) { return false; }
}
return true;
@ -188,9 +170,9 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param v the vertex to test
* @return true if this vertex is a reference source
*/
public boolean isRefSource( final T v ) {
public boolean isRefSource( final V v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
for( final BaseEdge edgeToTest : incomingEdgesOf(v) ) {
for( final E edgeToTest : incomingEdgesOf(v) ) {
if( edgeToTest.isRef() ) { return false; }
}
return true;
@ -200,31 +182,41 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param e the edge to test
* @return true if this edge is a reference sink edge
*/
public boolean isRefSink( final BaseEdge e ) {
public boolean isRefSink( final E e ) {
if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
for( final BaseEdge edgeToTest : outgoingEdgesOf(getEdgeTarget(e)) ) {
for( final E edgeToTest : outgoingEdgesOf(getEdgeTarget(e)) ) {
if( edgeToTest.isRef() ) { return false; }
}
return true;
}
/**
* // TODO -- the logic of this test is just wrong
* @param v the vertex to test
* @return true if this vertex is a reference sink
*/
public boolean isRefSink( final T v ) {
public boolean isRefSink( final V v ) {
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
for( final BaseEdge edgeToTest : outgoingEdgesOf(v) ) {
for( final E edgeToTest : outgoingEdgesOf(v) ) {
if( edgeToTest.isRef() ) { return false; }
}
return true;
}
/**
 * Test whether a vertex satisfies both isRefSink and isReferenceNode.
 *
 * @param v a non-null vertex
 * @return true only if isRefSink(v) and isReferenceNode(v) both hold
 */
public boolean isRefNodeAndRefSink(final V v) {
    // isRefSink is evaluated first; isReferenceNode is consulted only if it holds
    if ( !isRefSink(v) ) {
        return false;
    }
    return isReferenceNode(v);
}
/**
* @return the reference source vertex pulled from the graph, can be null if it doesn't exist in the graph
*/
public T getReferenceSourceVertex( ) {
for( final T v : vertexSet() ) {
public V getReferenceSourceVertex( ) {
for( final V v : vertexSet() ) {
if( isReferenceNode(v) && isRefSource(v) ) {
return v;
}
@ -235,8 +227,8 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
/**
* @return the reference sink vertex pulled from the graph, can be null if it doesn't exist in the graph
*/
public T getReferenceSinkVertex( ) {
for( final T v : vertexSet() ) {
public V getReferenceSinkVertex( ) {
for( final V v : vertexSet() ) {
if( isReferenceNode(v) && isRefSink(v) ) {
return v;
}
@ -249,9 +241,9 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param v the current vertex, can be null
* @return the next reference vertex if it exists
*/
public T getNextReferenceVertex( final T v ) {
public V getNextReferenceVertex( final V v ) {
if( v == null ) { return null; }
for( final BaseEdge edgeToTest : outgoingEdgesOf(v) ) {
for( final E edgeToTest : outgoingEdgesOf(v) ) {
if( edgeToTest.isRef() ) {
return getEdgeTarget(edgeToTest);
}
@ -264,9 +256,9 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param v the current vertex, can be null
* @return the previous reference vertex if it exists
*/
public T getPrevReferenceVertex( final T v ) {
public V getPrevReferenceVertex( final V v ) {
if( v == null ) { return null; }
for( final BaseEdge edgeToTest : incomingEdgesOf(v) ) {
for( final E edgeToTest : incomingEdgesOf(v) ) {
if( isReferenceNode(getEdgeSource(edgeToTest)) ) {
return getEdgeSource(edgeToTest);
}
@ -280,8 +272,8 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param toVertex to this vertex, can be null
* @return true if a reference path exists in the graph between the two vertices
*/
public boolean referencePathExists(final T fromVertex, final T toVertex) {
T v = fromVertex;
public boolean referencePathExists(final V fromVertex, final V toVertex) {
V v = fromVertex;
if( v == null ) {
return false;
}
@ -306,12 +298,12 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param includeStop should the ending vertex be included in the path
* @return byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example)
*/
public byte[] getReferenceBytes( final T fromVertex, final T toVertex, final boolean includeStart, final boolean includeStop ) {
public byte[] getReferenceBytes( final V fromVertex, final V toVertex, final boolean includeStart, final boolean includeStop ) {
if( fromVertex == null ) { throw new IllegalArgumentException("Starting vertex in requested path cannot be null."); }
if( toVertex == null ) { throw new IllegalArgumentException("From vertex in requested path cannot be null."); }
byte[] bytes = null;
T v = fromVertex;
V v = fromVertex;
if( includeStart ) {
bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v));
}
@ -330,8 +322,8 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* Convenience function to add multiple vertices to the graph at once
* @param vertices one or more vertices to add
*/
public void addVertices(final T ... vertices) {
for ( final T v : vertices )
public void addVertices(final V... vertices) {
for ( final V v : vertices )
addVertex(v);
}
@ -339,8 +331,8 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* Convenience function to add multiple vertices to the graph at once
* @param vertices one or more vertices to add
*/
public void addVertices(final Collection<T> vertices) {
for ( final T v : vertices )
public void addVertices(final Collection<V> vertices) {
for ( final V v : vertices )
addVertex(v);
}
@ -349,8 +341,12 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param start the first vertex to connect
* @param remaining all additional vertices to connect
*/
public void addEdges(final T start, final T ... remaining) {
addEdges(new BaseEdge(false, 1), start, remaining);
public void addEdges(final V start, final V... remaining) {
V prev = start;
for ( final V next : remaining ) {
addEdge(prev, next);
prev = next;
}
}
/**
@ -358,10 +354,10 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param start the first vertex to connect
* @param remaining all additional vertices to connect
*/
public void addEdges(final BaseEdge template, final T start, final T ... remaining) {
T prev = start;
for ( final T next : remaining ) {
addEdge(prev, next, new BaseEdge(template));
public void addEdges(final E template, final V start, final V... remaining) {
V prev = start;
for ( final V next : remaining ) {
addEdge(prev, next, (E)(template.copy())); // TODO -- is there a better way to do this?
prev = next;
}
}
@ -371,9 +367,9 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param v a non-null vertex
* @return a set of vertices connected by outgoing edges from v
*/
public Set<T> outgoingVerticesOf(final T v) {
final Set<T> s = new LinkedHashSet<T>();
for ( final BaseEdge e : outgoingEdgesOf(v) ) {
public Set<V> outgoingVerticesOf(final V v) {
final Set<V> s = new LinkedHashSet<V>();
for ( final E e : outgoingEdgesOf(v) ) {
s.add(getEdgeTarget(e));
}
return s;
@ -384,9 +380,9 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param v a non-null vertex
* @return a set of vertices {X} connected X -> v
*/
public Set<T> incomingVerticesOf(final T v) {
final Set<T> s = new LinkedHashSet<T>();
for ( final BaseEdge e : incomingEdgesOf(v) ) {
public Set<V> incomingVerticesOf(final V v) {
final Set<V> s = new LinkedHashSet<V>();
for ( final E e : incomingEdgesOf(v) ) {
s.add(getEdgeSource(e));
}
return s;
@ -413,15 +409,16 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
if ( writeHeader )
graphWriter.println("digraph assemblyGraphs {");
for( final BaseEdge edge : edgeSet() ) {
graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];");
for( final E edge : edgeSet() ) {
graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getDotLabel() + "\"];");
if( edge.isRef() ) {
graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];");
}
}
for( final T v : vertexSet() ) {
graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + "\",shape=box]");
for( final V v : vertexSet() ) {
// graphWriter.println("\t" + v.toString() + " [label=\"" + v + "\",shape=box]");
graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + v.additionalInfo() + "\",shape=box]");
}
if ( writeHeader )
@ -439,10 +436,10 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
}
// Remove non-ref edges connected before and after the reference path
final Set<BaseEdge> edgesToCheck = new HashSet<BaseEdge>();
final Set<E> edgesToCheck = new HashSet<E>();
edgesToCheck.addAll(incomingEdgesOf(getReferenceSourceVertex()));
while( !edgesToCheck.isEmpty() ) {
final BaseEdge e = edgesToCheck.iterator().next();
final E e = edgesToCheck.iterator().next();
if( !e.isRef() ) {
edgesToCheck.addAll( incomingEdgesOf(getEdgeSource(e)) );
removeEdge(e);
@ -452,7 +449,7 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
edgesToCheck.addAll(outgoingEdgesOf(getReferenceSinkVertex()));
while( !edgesToCheck.isEmpty() ) {
final BaseEdge e = edgesToCheck.iterator().next();
final E e = edgesToCheck.iterator().next();
if( !e.isRef() ) {
edgesToCheck.addAll( outgoingEdgesOf(getEdgeTarget(e)) );
removeEdge(e);
@ -469,9 +466,9 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed
*/
public void pruneGraph( final int pruneFactor ) {
final List<BaseEdge> edgesToRemove = new ArrayList<BaseEdge>();
for( final BaseEdge e : edgeSet() ) {
if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
final List<E> edgesToRemove = new ArrayList<>();
for( final E e : edgeSet() ) {
if( e.getPruningMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
edgesToRemove.add(e);
}
}
@ -480,13 +477,25 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
removeSingletonOrphanVertices();
}
/**
 * Remove low-weight linear chains from this graph.
 *
 * All of the work is delegated to a LowWeightChainPruner built with the given factor;
 * that class decides which chains are kept and which are removed.
 *
 * @see LowWeightChainPruner for the exact rules used to select the chains to remove
 *
 * @param pruneFactor the pruning threshold handed to the LowWeightChainPruner
 */
public void pruneLowWeightChains( final int pruneFactor ) {
    new LowWeightChainPruner<V,E>(pruneFactor).pruneLowWeightChains(this);
}
/**
* Remove all vertices in the graph that have in and out degree of 0
*/
protected void removeSingletonOrphanVertices() {
// Run through the graph and clean up singular orphaned nodes
final List<T> verticesToRemove = new LinkedList<T>();
for( final T v : vertexSet() ) {
final List<V> verticesToRemove = new LinkedList<>();
for( final V v : vertexSet() ) {
if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) {
verticesToRemove.add(v);
}
@ -499,11 +508,11 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* regardless of its direction, from the reference source vertex
*/
public void removeVerticesNotConnectedToRefRegardlessOfEdgeDirection() {
final HashSet<T> toRemove = new HashSet<T>(vertexSet());
final HashSet<V> toRemove = new HashSet<>(vertexSet());
final T refV = getReferenceSourceVertex();
final V refV = getReferenceSourceVertex();
if ( refV != null ) {
for ( final T v : new BaseGraphIterator<T>(this, refV, true, true) ) {
for ( final V v : new BaseGraphIterator<>(this, refV, true, true) ) {
toRemove.remove(v);
}
}
@ -524,22 +533,31 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
}
// get the set of vertices we can reach by going forward from the ref source
final Set<T> onPathFromRefSource = new HashSet<T>(vertexSet().size());
for ( final T v : new BaseGraphIterator<T>(this, getReferenceSourceVertex(), false, true) ) {
final Set<V> onPathFromRefSource = new HashSet<>(vertexSet().size());
for ( final V v : new BaseGraphIterator<>(this, getReferenceSourceVertex(), false, true) ) {
onPathFromRefSource.add(v);
}
// get the set of vertices we can reach by going backward from the ref sink
final Set<T> onPathFromRefSink = new HashSet<T>(vertexSet().size());
for ( final T v : new BaseGraphIterator<T>(this, getReferenceSinkVertex(), true, false) ) {
final Set<V> onPathFromRefSink = new HashSet<>(vertexSet().size());
for ( final V v : new BaseGraphIterator<>(this, getReferenceSinkVertex(), true, false) ) {
onPathFromRefSink.add(v);
}
// we want to remove anything that's not in both the sink and source sets
final Set<T> verticesToRemove = new HashSet<T>(vertexSet());
final Set<V> verticesToRemove = new HashSet<>(vertexSet());
onPathFromRefSource.retainAll(onPathFromRefSink);
verticesToRemove.removeAll(onPathFromRefSource);
removeAllVertices(verticesToRemove);
// simple sanity checks that this algorithm is working.
if ( getSinks().size() > 1 ) {
throw new IllegalStateException("Should have eliminated all but the reference sink, but found " + getSinks());
}
if ( getSources().size() > 1 ) {
throw new IllegalStateException("Should have eliminated all but the reference source, but found " + getSources());
}
}
/**
@ -555,11 +573,11 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param <T> the type of the nodes in those graphs
* @return true if g1 and g2 are equals
*/
public static <T extends BaseVertex> boolean graphEquals(final BaseGraph<T> g1, BaseGraph<T> g2) {
public static <T extends BaseVertex, E extends BaseEdge> boolean graphEquals(final BaseGraph<T,E> g1, BaseGraph<T,E> g2) {
final Set<T> vertices1 = g1.vertexSet();
final Set<T> vertices2 = g2.vertexSet();
final Set<BaseEdge> edges1 = g1.edgeSet();
final Set<BaseEdge> edges2 = g2.edgeSet();
final Set<E> edges1 = g1.edgeSet();
final Set<E> edges2 = g2.edgeSet();
if ( vertices1.size() != vertices2.size() || edges1.size() != edges2.size() )
return false;
@ -571,29 +589,35 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
if ( ! found ) return false;
}
for( final BaseEdge e1 : g1.edgeSet() ) {
for( final E e1 : g1.edgeSet() ) {
boolean found = false;
for( BaseEdge e2 : g2.edgeSet() ) {
if( e1.seqEquals(g1, e2, g2) ) { found = true; break; }
for( E e2 : g2.edgeSet() ) {
if( g1.seqEquals(e1, e2, g2) ) { found = true; break; }
}
if( !found ) { return false; }
}
for( final BaseEdge e2 : g2.edgeSet() ) {
for( final E e2 : g2.edgeSet() ) {
boolean found = false;
for( BaseEdge e1 : g1.edgeSet() ) {
if( e2.seqEquals(g2, e1, g1) ) { found = true; break; }
for( E e1 : g1.edgeSet() ) {
if( g2.seqEquals(e2, e1, g1) ) { found = true; break; }
}
if( !found ) { return false; }
}
return true;
}
/**
 * Compare two edges that may belong to different graphs by looking at their endpoints.
 *
 * @param edge1 an edge of this graph
 * @param edge2 an edge of graph2
 * @param graph2 the graph used to look up the endpoints of edge2
 * @return true if the source vertices of the two edges are seqEquals to each other and
 *         their target vertices are seqEquals to each other as well
 */
private boolean seqEquals( final E edge1, final E edge2, final BaseGraph<V,E> graph2 ) {
    final V source1 = this.getEdgeSource(edge1);
    final V source2 = graph2.getEdgeSource(edge2);
    if ( ! source1.seqEquals(source2) ) {
        // sources differ, so there is no need to look at the targets
        return false;
    }

    final V target1 = this.getEdgeTarget(edge1);
    final V target2 = graph2.getEdgeTarget(edge2);
    return target1.seqEquals(target2);
}
/**
* Get the incoming edge of v. Requires that there be only one such edge or throws an error
* @param v our vertex
* @return the single incoming edge to v, or null if none exists
*/
public BaseEdge incomingEdgeOf(final T v) {
public E incomingEdgeOf(final V v) {
return getSingletonEdge(incomingEdgesOf(v));
}
@ -602,7 +626,7 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param v our vertex
* @return the single outgoing edge from v, or null if none exists
*/
public BaseEdge outgoingEdgeOf(final T v) {
public E outgoingEdgeOf(final V v) {
return getSingletonEdge(outgoingEdgesOf(v));
}
@ -613,7 +637,7 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @return a edge
*/
@Requires("edges != null")
private BaseEdge getSingletonEdge(final Collection<BaseEdge> edges) {
private E getSingletonEdge(final Collection<E> edges) {
if ( edges.size() > 1 ) throw new IllegalArgumentException("Cannot get a single incoming edge for a vertex with multiple incoming edges " + edges);
return edges.isEmpty() ? null : edges.iterator().next();
}
@ -625,12 +649,19 @@ public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, Bas
* @param target vertex
* @param e edge to add
*/
public void addOrUpdateEdge(final T source, final T target, final BaseEdge e) {
final BaseEdge prev = getEdge(source, target);
public void addOrUpdateEdge(final V source, final V target, final E e) {
final E prev = getEdge(source, target);
if ( prev != null ) {
prev.add(e);
} else {
addEdge(source, target, e);
}
}
/**
 * @return a short description of this graph that reports its kmer size
 */
@Override
public String toString() {
    final StringBuilder description = new StringBuilder("BaseGraph{");
    description.append("kmerSize=").append(kmerSize);
    description.append('}');
    return description.toString();
}
}

View File

@ -60,10 +60,10 @@ import java.util.LinkedList;
* Date: 3/24/13
* Time: 4:41 PM
*/
public class BaseGraphIterator<T extends BaseVertex> implements Iterator<T>, Iterable<T> {
public class BaseGraphIterator<T extends BaseVertex, E extends BaseEdge> implements Iterator<T>, Iterable<T> {
final HashSet<T> visited = new HashSet<T>();
final LinkedList<T> toVisit = new LinkedList<T>();
final BaseGraph<T> graph;
final BaseGraph<T,E> graph;
final boolean followIncomingEdges, followOutgoingEdges;
/**
@ -78,7 +78,7 @@ public class BaseGraphIterator<T extends BaseVertex> implements Iterator<T>, Ite
* traversal? (goes backward through the graph)
* @param followOutgoingEdges should we follow outgoing edges during out traversal?
*/
public BaseGraphIterator(final BaseGraph<T> graph, final T start,
public BaseGraphIterator(final BaseGraph<T,E> graph, final T start,
final boolean followIncomingEdges, final boolean followOutgoingEdges) {
if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
if ( start == null ) throw new IllegalArgumentException("start cannot be null");

View File

@ -57,6 +57,8 @@ import java.util.Arrays;
* @since 03/2013
*/
public class BaseVertex {
/** placeholder to store additional information for debugging purposes */
String additionalInfo = "";
final byte[] sequence;
private final static int UNASSIGNED_HASHCODE = -1;
int cachedHashCode = UNASSIGNED_HASHCODE;
@ -176,4 +178,18 @@ public class BaseVertex {
public byte[] getAdditionalSequence(final boolean source) {
return getSequence();
}
/**
 * Attach free-form debugging text to this vertex, replacing any text stored previously.
 *
 * @param info the text to store; cannot be null
 * @throws IllegalArgumentException if info is null
 */
public void setAdditionalInfo(final String info) {
    if ( info != null ) {
        additionalInfo = info;
    } else {
        throw new IllegalArgumentException("info cannot be null");
    }
}
/**
 * Get the debugging text currently stored on this vertex.
 *
 * @return the value of the additionalInfo field
 */
public String additionalInfo() {
    return this.additionalInfo;
}
}

View File

@ -126,10 +126,10 @@ public class CommonSuffixSplitter {
edgesToRemove.add(out);
}
graph.addEdge(suffixV, graph.getEdgeTarget(out), new BaseEdge(out));
graph.addEdge(suffixV, graph.getEdgeTarget(out), out.copy());
for ( final BaseEdge in : graph.incomingEdgesOf(mid) ) {
graph.addEdge(graph.getEdgeSource(in), incomingTarget, new BaseEdge(in));
graph.addEdge(graph.getEdgeSource(in), incomingTarget, in.copy());
edgesToRemove.add(in);
}
}

View File

@ -47,6 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import com.google.java.contract.Ensures;
import org.jgrapht.EdgeFactory;
import java.util.Arrays;
import java.util.HashMap;
@ -58,12 +59,22 @@ import java.util.Map;
* User: rpoplin
* Date: 2/6/13
*/
public final class DeBruijnGraph extends BaseGraph<DeBruijnVertex> {
public final class DeBruijnGraph extends BaseGraph<DeBruijnVertex, BaseEdge> {
/**
 * Edge factory handed to the graph: every edge it creates is a non-reference edge
 * with multiplicity 1, independent of the vertices being connected.
 */
private static class MyEdgeFactory implements EdgeFactory<DeBruijnVertex, BaseEdge> {
    /** edges created by this factory are never reference edges */
    private static final boolean IS_REFERENCE_EDGE = false;

    /** multiplicity given to each newly created edge */
    private static final int INITIAL_MULTIPLICITY = 1;

    @Override
    public BaseEdge createEdge(DeBruijnVertex sourceVertex, DeBruijnVertex targetVertex) {
        // the endpoints are not consulted; all new edges start out identical
        return new BaseEdge(IS_REFERENCE_EDGE, INITIAL_MULTIPLICITY);
    }
}
/**
* Create an empty DeBruijnGraph with default kmer size
*/
public DeBruijnGraph() {
super();
this(11);
}
/**
@ -71,7 +82,7 @@ public final class DeBruijnGraph extends BaseGraph<DeBruijnVertex> {
* @param kmerSize kmer size, must be >= 1
*/
public DeBruijnGraph(int kmerSize) {
super(kmerSize);
super(kmerSize, new MyEdgeFactory());
}
/**

View File

@ -54,7 +54,7 @@ import com.google.java.contract.Ensures;
* User: ebanks, mdepristo
* Date: Mar 23, 2011
*/
public final class DeBruijnVertex extends BaseVertex {
public class DeBruijnVertex extends BaseVertex {
private final static byte[][] sufficesAsByteArray = new byte[256][];
static {
for ( int i = 0; i < sufficesAsByteArray.length; i++ )

View File

@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.collections.PrimitivePair;
import java.util.ArrayList;
import java.util.Collection;
@ -60,7 +61,7 @@ import java.util.List;
* Date: 3/25/13
* Time: 9:42 PM
*/
final class GraphUtils {
final public class GraphUtils {
private GraphUtils() {}
/**
@ -135,4 +136,49 @@ final class GraphUtils {
return min;
}
/**
 * Find the ending position in seq of the longest run of bases that matches a suffix
 * of kmer, provided that this longest run is unique.
 *
 * Every position of seq is tried as the end of a match, and the number of bases that agree
 * with the tail of kmer when walking backwards from that position is recorded.
 *
 * For example, if seq = ACGT and kmer is NAC, this function returns 1,2 as we have the following
 * match:
 *
 *   0123
 *  .ACGT
 *  NAC..
 *
 * @param seq a non-null sequence of bytes
 * @param kmer a non-null kmer
 * @return the ending position and length of the longest match, or null if the longest
 *         length is reached at more than one ending position (this includes a non-empty seq
 *         in which no position matches at all). For an empty seq the result is the pair (-1, 0)
 */
public static PrimitivePair.Int findLongestUniqueSuffixMatch(final byte[] seq, final byte[] kmer) {
    int bestEnd = -1;
    int bestLength = 0;
    boolean ambiguous = false;

    int end = 0;
    while ( end < seq.length ) {
        final int currentLength = longestSuffixMatch(seq, kmer, end);
        if ( currentLength == bestLength ) {
            // another position ties with the best seen so far
            ambiguous = true;
        } else if ( currentLength > bestLength ) {
            // strictly better match, which is unique until proven otherwise
            bestEnd = end;
            bestLength = currentLength;
            ambiguous = false;
        }
        end++;
    }

    if ( ambiguous ) {
        return null;
    }
    return new PrimitivePair.Int(bestEnd, bestLength);
}
/**
 * Count how many bases agree when seq is read backwards from seqStart and kmer is read
 * backwards from its last base.
 *
 * @param seq the sequence to match against
 * @param kmer the kmer whose suffix is being matched
 * @param seqStart the position in seq that is lined up with the last base of kmer
 * @return the number of consecutive matching bases, between 0 and kmer.length
 */
private static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) {
    int matched = 0;
    int seqIndex = seqStart;
    int kmerIndex = kmer.length - 1;

    // walk both arrays towards their starts until one runs out or the bases disagree
    while ( kmerIndex >= 0 && seqIndex >= 0 && seq[seqIndex] == kmer[kmerIndex] ) {
        matched++;
        seqIndex--;
        kmerIndex--;
    }
    return matched;
}
}

View File

@ -59,7 +59,7 @@ import java.util.*;
* User: ebanks, rpoplin, mdepristo
* Date: Mar 23, 2011
*/
public class KBestPaths<T extends BaseVertex> {
public class KBestPaths<T extends BaseVertex, E extends BaseEdge> {
private final boolean allowCycles;
/**
@ -93,7 +93,7 @@ public class KBestPaths<T extends BaseVertex> {
/**
* @see #getKBestPaths(BaseGraph, int) retrieving the best 1000 paths
*/
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph ) {
public List<Path<T,E>> getKBestPaths( final BaseGraph<T, E> graph ) {
return getKBestPaths(graph, 1000);
}
@ -101,28 +101,28 @@ public class KBestPaths<T extends BaseVertex> {
* @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) retrieving the first 1000 paths
* starting from all source vertices and ending with all sink vertices
*/
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph, final int k ) {
public List<Path<T,E>> getKBestPaths( final BaseGraph<T,E> graph, final int k ) {
return getKBestPaths(graph, k, graph.getSources(), graph.getSinks());
}
/**
* @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000
*/
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph, final Set<T> sources, final Set<T> sinks ) {
public List<Path<T,E>> getKBestPaths( final BaseGraph<T,E> graph, final Set<T> sources, final Set<T> sinks ) {
return getKBestPaths(graph, 1000, sources, sinks);
}
/**
* @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000
*/
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph, final T source, final T sink ) {
public List<Path<T,E>> getKBestPaths( final BaseGraph<T,E> graph, final T source, final T sink ) {
return getKBestPaths(graph, 1000, source, sink);
}
/**
* @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with singleton source and sink sets
*/
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph, final int k, final T source, final T sink ) {
public List<Path<T,E>> getKBestPaths( final BaseGraph<T,E> graph, final int k, final T source, final T sink ) {
return getKBestPaths(graph, k, Collections.singleton(source), Collections.singleton(sink));
}
@ -136,20 +136,20 @@ public class KBestPaths<T extends BaseVertex> {
* @return a list with at most k top-scoring paths from the graph
*/
@Ensures({"result != null", "result.size() <= k"})
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph, final int k, final Set<T> sources, final Set<T> sinks ) {
public List<Path<T,E>> getKBestPaths( final BaseGraph<T,E> graph, final int k, final Set<T> sources, final Set<T> sinks ) {
if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); }
// a min max queue that will collect the best k paths
final MinMaxPriorityQueue<Path<T>> bestPaths = MinMaxPriorityQueue.orderedBy(new PathComparatorTotalScore()).maximumSize(k).create();
final MinMaxPriorityQueue<Path<T,E>> bestPaths = MinMaxPriorityQueue.orderedBy(new PathComparatorTotalScore()).maximumSize(k).create();
// run a DFS for best paths
for ( final T source : sources ) {
final Path<T> startingPath = new Path<T>(source, graph);
final Path<T,E> startingPath = new Path<T,E>(source, graph);
findBestPaths(startingPath, sinks, bestPaths, new MyInt());
}
// the MinMaxPriorityQueue iterator returns items in an arbitrary order, so we need to sort the final result
final List<Path<T>> toReturn = new ArrayList<Path<T>>(bestPaths);
final List<Path<T,E>> toReturn = new ArrayList<Path<T,E>>(bestPaths);
Collections.sort(toReturn, new PathComparatorTotalScore());
return toReturn;
}
@ -161,21 +161,21 @@ public class KBestPaths<T extends BaseVertex> {
* @param bestPaths a path to collect completed paths.
* @param n used to limit the search by tracking the number of vertices visited across all paths
*/
private void findBestPaths( final Path<T> path, final Set<T> sinks, final Collection<Path<T>> bestPaths, final MyInt n ) {
private void findBestPaths( final Path<T,E> path, final Set<T> sinks, final Collection<Path<T,E>> bestPaths, final MyInt n ) {
if ( sinks.contains(path.getLastVertex())) {
bestPaths.add(path);
} else if( n.val > 10000 ) {
// do nothing, just return, as we've done too much work already
} else {
// recursively run DFS
final ArrayList<BaseEdge> edgeArrayList = new ArrayList<BaseEdge>(path.getOutgoingEdgesOfLastVertex());
final ArrayList<E> edgeArrayList = new ArrayList<E>(path.getOutgoingEdgesOfLastVertex());
Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator());
for ( final BaseEdge edge : edgeArrayList ) {
for ( final E edge : edgeArrayList ) {
final T target = path.getGraph().getEdgeTarget(edge);
// make sure the edge is not already in the path
final boolean alreadyVisited = allowCycles ? path.containsEdge(edge) : path.containsVertex(target);
if ( ! alreadyVisited ) {
final Path<T> newPath = new Path<T>(path, edge);
final Path<T,E> newPath = new Path<T,E>(path, edge);
n.val++;
findBestPaths(newPath, sinks, bestPaths, n);
}

View File

@ -0,0 +1,170 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import java.util.*;
/**
 * Prunes low-weight linear chains from a graph.
 *
 * A linear chain is kept when at least one of its edges is a reference edge or has a pruning
 * multiplicity >= pruneFactor. Put differently, a chain is only removed when every one of its
 * edges is a non-reference edge with pruning multiplicity < pruneFactor.
 *
 * Unlike BaseGraph.pruneGraph, which judges each edge on its own, a single sufficiently heavy
 * (or reference) edge protects the whole chain it belongs to.
 *
 * For example, with pruneFactor 2:
 *   A -[1]> B -[1]> C -[1]> D would be removed
 *   A -[1]> B -[2]> C -[1]> D would not be, because the linear chain includes an edge with weight >= 2
 *
 * User: depristo
 * Date: 5/2/13
 * Time: 10:38 AM
 */
public class LowWeightChainPruner<V extends BaseVertex, E extends BaseEdge> {
    private final int pruneFactor;

    /**
     * Create a pruner using the given threshold
     *
     * @param pruneFactor chains made only of non-reference edges with pruning multiplicity below
     *                    this value are removed; must be >= 0, and 0 disables pruning entirely
     * @throws IllegalArgumentException if pruneFactor is negative
     */
    public LowWeightChainPruner(final int pruneFactor) {
        if ( pruneFactor < 0 ) throw new IllegalArgumentException("pruneFactor must be >= 0 but got " + pruneFactor);
        this.pruneFactor = pruneFactor;
    }

    /**
     * Prune graph in place: remove every edge that is not part of a chain that must be kept,
     * then remove the vertices left without any edge.
     *
     * Does nothing when this pruner was created with a pruneFactor of 0.
     *
     * NOTE(review): edges that do not belong to any chain returned by getLinearChains are removed
     * regardless of their weight or reference status. Given how chain starts are chosen, this
     * looks possible for a component that is a simple cycle without any branching vertex --
     * confirm whether such components can reach this method.
     *
     * @param graph the graph to prune; cannot be null
     * @throws IllegalArgumentException if graph is null
     */
    public void pruneLowWeightChains(final BaseGraph<V,E> graph) {
        if ( graph == null ) throw new IllegalArgumentException("Graph cannot be null");

        if ( pruneFactor == 0 ) {
            // pruneFactor is never negative (checked in the constructor), so 0 is the only "off" value
            return;
        }

        final Set<E> edgesToKeep = new LinkedHashSet<>();
        for ( final Path<V,E> linearChain : getLinearChains(graph) ) {
            if ( mustKeepChain(linearChain) ) {
                // we must keep edges in any path that contains a reference edge or an edge with weight >= pruneFactor
                edgesToKeep.addAll(linearChain.getEdges());
            }
        }

        // we want to remove all edges not in the keep set
        final Set<E> edgesToRemove = new HashSet<>(graph.edgeSet());
        edgesToRemove.removeAll(edgesToKeep);
        graph.removeAllEdges(edgesToRemove);

        graph.removeSingletonOrphanVertices();
    }

    /**
     * Decide whether a chain is protected from pruning
     *
     * @param chain the linear chain to examine
     * @return true if at least one edge of chain has a pruning multiplicity >= pruneFactor or
     *         is a reference edge, false otherwise (including for a chain without edges)
     */
    private boolean mustKeepChain(final Path<V,E> chain) {
        for ( final E edge : chain.getEdges() ) {
            if ( edge.getPruningMultiplicity() >= pruneFactor || edge.isRef() )
                return true;
        }
        return false;
    }

    /**
     * Get all of the linear chains in graph
     *
     * A chain starts at a vertex that has out-degree > 1, in-degree > 1, or that is a source
     * (in-degree 0) with at least one outgoing edge. One chain is created for each outgoing edge
     * of each start vertex, and each chain is then extended with extendLinearChain.
     *
     * @param graph the graph
     * @return a non-null collection of paths in graph
     */
    protected final Collection<Path<V,E>> getLinearChains(final BaseGraph<V,E> graph) {
        // a set, so that a vertex satisfying several of the start conditions is only used once
        final Set<V> chainStarts = new LinkedHashSet<>();
        for ( final V v : graph.vertexSet() ) {
            final int outDegree = graph.outDegreeOf(v);
            final int inDegree = graph.inDegreeOf(v);
            if ( outDegree > 1 || inDegree > 1 || (inDegree == 0 && outDegree > 0)) // don't add isolated vertices
                chainStarts.add(v);
        }

        final List<Path<V, E>> linearChains = new LinkedList<>();
        for ( final V chainStart : chainStarts ) {
            for ( final E outEdge : graph.outgoingEdgesOf(chainStart) ) {
                // these chains are composed of the starts + their next vertices
                final Path<V,E> startOnly = new Path<>(chainStart, graph);
                final Path<V,E> firstStep = new Path<>(startOnly, outEdge);
                linearChains.add(extendLinearChain(firstStep));
            }
        }

        return linearChains;
    }

    /**
     * Extend path for as long as its last vertex has exactly one outgoing edge and an in-degree
     * of at most 1, stopping early if the next vertex is already part of the path.
     *
     * Implemented as a loop so that the stack depth does not grow with the length of the chain.
     *
     * @param path the path to extend
     * @return a fully extended linear path; this is path itself when it cannot be extended
     */
    protected final Path<V,E> extendLinearChain(final Path<V, E> path) {
        Path<V,E> chain = path;
        while ( true ) {
            final BaseGraph<V,E> graph = chain.getGraph();
            final V last = chain.getLastVertex();
            final Set<E> outEdges = graph.outgoingEdgesOf(last);

            if ( outEdges.size() != 1 || graph.inDegreeOf(last) > 1 ) {
                // the last vertex ends the graph, branches, or is a merge point, so the chain stops here
                return chain;
            }

            final E onlyOutEdge = outEdges.iterator().next();
            if ( chain.containsVertex(graph.getEdgeTarget(onlyOutEdge)) ) {
                // we are done if following the edge would close a cycle
                return chain;
            }

            // last has out-degree == 1, so we keep extending the chain
            chain = new Path<>(chain, onlyOutEdge);
        }
    }
}

View File

@ -0,0 +1,123 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
/**
 * Graph edge that, in addition to what BaseEdge tracks, remembers per-sample information
 *
 * Besides the total multiplicity maintained by BaseEdge, this edge keeps a running count of
 * the multiplicity added for the sample currently being processed and the largest such
 * count seen for any single sample. The latter is what pruning is based on. Typical use:
 *
 * MultiSampleEdge e = new MultiSampleEdge(ref, 1)
 * e.incMultiplicity(1)                  // total 2, current sample 2, max per sample 1
 * e.getPruningMultiplicity()            // = 1
 * e.flushSingleSampleMultiplicity()     // total 2, current sample 0, max per sample 2
 * e.getPruningMultiplicity()            // = 2
 * e.incMultiplicity(3)                  // total 5, current sample 3, max per sample 2
 * e.getPruningMultiplicity()            // = 2
 * e.flushSingleSampleMultiplicity()     // total 5, current sample 0, max per sample 3
 * e.getPruningMultiplicity()            // = 3
 */
public class MultiSampleEdge extends BaseEdge {
    // largest single-sample multiplicity recorded so far; only changes in the constructor and on flush
    private int maxSingleSampleMultiplicity;
    // multiplicity accumulated since construction or since the most recent flush
    private int currentSingleSampleMultiplicity;

    /**
     * Create a new MultiSampleEdge with the given multiplicity
     *
     * Both per-sample counters start out equal to multiplicity.
     *
     * @param isRef true if this edge is part of a path through the reference
     * @param multiplicity the number of observations of this edge in this sample
     */
    public MultiSampleEdge(final boolean isRef, final int multiplicity) {
        super(isRef, multiplicity);
        this.maxSingleSampleMultiplicity = multiplicity;
        this.currentSingleSampleMultiplicity = multiplicity;
    }

    /**
     * Create a new edge with the same isRef flag and total multiplicity as this one
     *
     * Note that the per-sample counters are not carried over: in the copy both are
     * initialized from the total multiplicity.
     *
     * @return a new MultiSampleEdge
     */
    @Override
    public MultiSampleEdge copy() {
        // TODO -- should I copy values for other features?
        final boolean ref = isRef();
        final int total = getMultiplicity();
        return new MultiSampleEdge(ref, total);
    }

    /**
     * Close out the current sample: raise the max single sample multiplicity to the current
     * single sample multiplicity if that is larger, then reset the current count to 0.
     */
    public void flushSingleSampleMultiplicity() {
        maxSingleSampleMultiplicity = Math.max(maxSingleSampleMultiplicity, currentSingleSampleMultiplicity);
        currentSingleSampleMultiplicity = 0;
    }

    /**
     * Add incr to the total multiplicity and to the current single sample multiplicity
     *
     * @param incr the amount to add
     */
    @Override
    public void incMultiplicity(final int incr) {
        super.incMultiplicity(incr);
        currentSingleSampleMultiplicity = currentSingleSampleMultiplicity + incr;
    }

    /**
     * The multiplicity used for pruning, which for this edge type is the max single sample multiplicity
     *
     * @return the value of getMaxSingleSampleMultiplicity()
     */
    @Override
    public int getPruningMultiplicity() {
        return getMaxSingleSampleMultiplicity();
    }

    /**
     * The label of the superclass followed by "/" and the max single sample multiplicity
     *
     * @return the label string
     */
    @Override
    public String getDotLabel() {
        final String baseLabel = super.getDotLabel();
        return baseLabel + "/" + getMaxSingleSampleMultiplicity();
    }

    /**
     * Get the maximum multiplicity for this edge seen in any single sample
     *
     * @return the max single sample multiplicity recorded so far
     */
    public int getMaxSingleSampleMultiplicity() {
        return maxSingleSampleMultiplicity;
    }

    /**
     * Get the multiplicity accumulated since the last flush; only provided for testing purposes
     *
     * @return the current single sample multiplicity
     */
    protected int getCurrentSingleSampleMultiplicity() {
        return currentSingleSampleMultiplicity;
    }
}

View File

@ -52,8 +52,8 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.smithwaterman.Parameters;
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.smithwaterman.*;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import java.util.*;
@ -68,40 +68,39 @@ import java.util.*;
* Time: 2:34 PM
*
*/
public class Path<T extends BaseVertex> {
private final static int MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW = 20;
public class Path<T extends BaseVertex, E extends BaseEdge> {
private final static String SW_PAD = "NNNNNNNNNN";
private final static Logger logger = Logger.getLogger(Path.class);
// the last vertex seen in the path
private final T lastVertex;
// the list of edges comprising the path
private Set<BaseEdge> edgesAsSet = null;
private final LinkedList<BaseEdge> edgesInOrder;
private Set<E> edgesAsSet = null;
private final LinkedList<E> edgesInOrder;
// the scores for the path
private final int totalScore;
// the graph from which this path originated
private final BaseGraph<T> graph;
private final BaseGraph<T, E> graph;
// used in the bubble state machine to apply Smith-Waterman to the bubble sequence
// these values were chosen via optimization against the NA12878 knowledge base
public static final Parameters NEW_SW_PARAMETERS = new Parameters(20.0, -15.0, -26.0, -1.1);
private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes();
/**
* Create a new Path containing no edges and starting at initialVertex
* @param initialVertex the starting vertex of the path
* @param graph the graph this path will follow through
*/
public Path(final T initialVertex, final BaseGraph<T> graph) {
public Path(final T initialVertex, final BaseGraph<T, E> graph) {
if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null");
if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
if ( ! graph.containsVertex(initialVertex) ) throw new IllegalArgumentException("Vertex " + initialVertex + " must be part of graph " + graph);
lastVertex = initialVertex;
edgesInOrder = new LinkedList<BaseEdge>();
edgesInOrder = new LinkedList<E>();
totalScore = 0;
this.graph = graph;
}
@ -109,10 +108,10 @@ public class Path<T extends BaseVertex> {
/**
* Convenience constructor for testing that creates a path through vertices in graph
*/
protected static <T extends BaseVertex> Path<T> makePath(final List<T> vertices, final BaseGraph<T> graph) {
Path<T> path = new Path<T>(vertices.get(0), graph);
protected static <T extends BaseVertex, E extends BaseEdge> Path<T,E> makePath(final List<T> vertices, final BaseGraph<T, E> graph) {
Path<T,E> path = new Path<T,E>(vertices.get(0), graph);
for ( int i = 1; i < vertices.size(); i++ )
path = new Path<T>(path, graph.getEdge(path.lastVertex, vertices.get(i)));
path = new Path<T,E>(path, graph.getEdge(path.lastVertex, vertices.get(i)));
return path;
}
@ -122,7 +121,7 @@ public class Path<T extends BaseVertex> {
* @param p the path to extend
* @param edge the edge to extend path by
*/
public Path(final Path<T> p, final BaseEdge edge) {
public Path(final Path<T,E> p, final E edge) {
if ( p == null ) throw new IllegalArgumentException("Path cannot be null");
if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null");
if ( ! p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't");
@ -130,7 +129,7 @@ public class Path<T extends BaseVertex> {
graph = p.graph;
lastVertex = p.graph.getEdgeTarget(edge);
edgesInOrder = new LinkedList<BaseEdge>(p.getEdges());
edgesInOrder = new LinkedList<E>(p.getEdges());
edgesInOrder.add(edge);
totalScore = p.totalScore + edge.getMultiplicity();
}
@ -139,7 +138,7 @@ public class Path<T extends BaseVertex> {
* Get the collection of edges leaving the last vertex of this path
* @return a non-null collection
*/
public Collection<BaseEdge> getOutgoingEdgesOfLastVertex() {
public Collection<E> getOutgoingEdgesOfLastVertex() {
return getGraph().outgoingEdgesOf(getLastVertex());
}
@ -148,12 +147,12 @@ public class Path<T extends BaseVertex> {
* @param edge the given edge to test
* @return true if the edge is found in this path
*/
public boolean containsEdge( final BaseEdge edge ) {
public boolean containsEdge( final E edge ) {
if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); }
if ( edgesInOrder.isEmpty() ) return false;
// initialize contains cache if necessary
if ( edgesAsSet == null ) edgesAsSet = new HashSet<BaseEdge>(edgesInOrder);
if ( edgesAsSet == null ) edgesAsSet = new HashSet<E>(edgesInOrder);
return edgesAsSet.contains(edge);
}
@ -175,7 +174,7 @@ public class Path<T extends BaseVertex> {
* @param path the other path we might be the same as
* @return true if this and path are the same
*/
protected boolean pathsAreTheSame(Path<T> path) {
protected boolean pathsAreTheSame(Path<T,E> path) {
return totalScore == path.totalScore && edgesInOrder.equals(path.edgesInOrder);
}
@ -199,7 +198,7 @@ public class Path<T extends BaseVertex> {
* @return a non-null graph
*/
@Ensures("result != null")
public BaseGraph<T> getGraph() {
public BaseGraph<T, E> getGraph() {
return graph;
}
@ -208,7 +207,7 @@ public class Path<T extends BaseVertex> {
* @return a non-null list of edges
*/
@Ensures("result != null")
public List<BaseEdge> getEdges() { return edgesInOrder; }
public List<E> getEdges() { return edgesInOrder; }
/**
* Get the list of vertices in this path in order defined by the edges of the path
@ -221,7 +220,7 @@ public class Path<T extends BaseVertex> {
else {
final LinkedList<T> vertices = new LinkedList<T>();
boolean first = true;
for ( final BaseEdge e : getEdges() ) {
for ( final E e : getEdges() ) {
if ( first ) {
vertices.add(graph.getEdgeSource(e));
first = false;
@ -246,6 +245,14 @@ public class Path<T extends BaseVertex> {
@Ensures("result != null")
public T getLastVertex() { return lastVertex; }
/**
 * Get the first vertex in this path
 *
 * For a path that contains no edges the first vertex is the same as the last vertex.
 * Calling this method never modifies the path.
 *
 * @return a non-null vertex
 */
public T getFirstVertex() {
    // a path without edges consists of exactly one vertex, which is both first and last
    if ( edgesInOrder.isEmpty() )
        return lastVertex;

    // read the head of the edge list without removing it: pollFirst() would drop the
    // first edge from this path each time the first vertex is requested
    return getGraph().getEdgeSource(edgesInOrder.getFirst());
}
/**
* The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes
* @return non-null sequence of bases corresponding to this path
@ -255,174 +262,114 @@ public class Path<T extends BaseVertex> {
if( getEdges().isEmpty() ) { return graph.getAdditionalSequence(lastVertex); }
byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.getFirst()));
for( final BaseEdge e : edgesInOrder ) {
for( final E e : edgesInOrder ) {
bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e)));
}
return bases;
}
/**
* Calculate the cigar string for this path using a bubble traversal of the assembly graph and running a Smith-Waterman alignment on each bubble
* @return non-null Cigar string with reference length equal to the refHaplotype's reference length
* Calculate the cigar elements for this path against the reference sequence
*
* @param refSeq the reference sequence that all of the bases in this path should align to
* @return a Cigar mapping this path to refSeq, or null if no reasonable alignment could be found
*/
@Ensures("result != null")
public Cigar calculateCigar() {
final Cigar cigar = new Cigar();
// special case for paths that start on reference but not at the reference source node
if( edgesInOrder.getFirst().isRef() && !graph.isRefSource(edgesInOrder.getFirst()) ) {
for( final CigarElement ce : calculateCigarForCompleteBubble(null, null, graph.getEdgeSource(edgesInOrder.getFirst())).getCigarElements() ) {
cigar.add(ce);
}
public Cigar calculateCigar(final byte[] refSeq) {
if ( getBases().length == 0 ) {
// horrible edge case from the unit tests, where this path has no bases
return new Cigar(Arrays.asList(new CigarElement(refSeq.length, CigarOperator.D)));
}
// reset the bubble state machine
final BubbleStateMachine<T> bsm = new BubbleStateMachine<T>(cigar);
final byte[] bases = getBases();
final Cigar nonStandard;
for( final BaseEdge e : getEdges() ) {
if ( e.hasSameSourceAndTarget(graph, edgesInOrder.getFirst()) ) {
advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null );
}
advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e );
final String paddedRef = SW_PAD + new String(refSeq) + SW_PAD;
final String paddedPath = SW_PAD + new String(bases) + SW_PAD;
final SmithWaterman alignment = new SWPairwiseAlignment( paddedRef.getBytes(), paddedPath.getBytes(), NEW_SW_PARAMETERS );
if ( isSWFailure(alignment) )
return null;
// cut off the padding bases
final int baseStart = SW_PAD.length();
final int baseEnd = paddedPath.length() - SW_PAD.length() - 1; // -1 because it's inclusive
nonStandard = AlignmentUtils.trimCigarByBases(alignment.getCigar(), baseStart, baseEnd);
if ( nonStandard.getReferenceLength() != refSeq.length ) {
nonStandard.add(new CigarElement(refSeq.length - nonStandard.getReferenceLength(), CigarOperator.D));
}
// special case for paths that don't end on reference
if( bsm.inBubble ) {
for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) {
bsm.cigar.add(ce);
}
} else if( edgesInOrder.getLast().isRef() && !graph.isRefSink(edgesInOrder.getLast()) ) { // special case for paths that end of the reference but haven't completed the entire reference circuit
for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, graph.getEdgeTarget(edgesInOrder.getLast()), null).getCigarElements() ) {
bsm.cigar.add(ce);
}
}
return AlignmentUtils.consolidateCigar(bsm.cigar);
// finally, return the cigar with all indels left aligned
return leftAlignCigarSequentially(nonStandard, refSeq, getBases(), 0, 0);
}
/**
* Advance the bubble state machine by incorporating the next node in the path.
* @param bsm the current bubble state machine
* @param node the node to be incorporated
* @param e the edge which generated this node in the path
* Check whether the SW alignment failed in some terrible way; returns true if it did (no exception is thrown)
*/
@Requires({"bsm != null", "graph != null", "node != null"})
private void advanceBubbleStateMachine( final BubbleStateMachine<T> bsm, final T node, final BaseEdge e ) {
if( graph.isReferenceNode( node ) ) {
if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else
if( e !=null && !e.isRef() ) {
if( graph.referencePathExists( graph.getEdgeSource(e), node) ) {
for( final CigarElement ce : calculateCigarForCompleteBubble(null, graph.getEdgeSource(e), node).getCigarElements() ) {
bsm.cigar.add(ce);
}
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
} else if ( graph.getEdgeSource(e).equals(graph.getEdgeTarget(e)) ) { // alt edge at ref node points to itself
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.I) );
} else {
bsm.inBubble = true;
bsm.bubbleBytes = null;
bsm.lastSeenReferenceNode = graph.getEdgeSource(e);
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
}
} else {
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
}
} else if( bsm.lastSeenReferenceNode != null && !graph.referencePathExists( bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
} else { // close the bubble and use a local SW to determine the Cigar string
for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) {
bsm.cigar.add(ce);
}
bsm.inBubble = false;
bsm.bubbleBytes = null;
bsm.lastSeenReferenceNode = null;
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
}
} else { // non-ref vertex
if( bsm.inBubble ) { // just keep accumulating until we get back to the reference path
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
} else { // open up a bubble
bsm.inBubble = true;
bsm.bubbleBytes = null;
bsm.lastSeenReferenceNode = (e != null ? graph.getEdgeSource(e) : null );
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
}
private boolean isSWFailure(final SmithWaterman alignment) {
    // thanks to the padding the alignment is expected to begin at the very first base;
    // a start offset above 0 is reported as a failure (not thrown as an exception)
    final boolean startsAtFirstBase = alignment.getAlignmentStart2wrt1() <= 0;
    if ( ! startsAtFirstBase )
        return true;

    // S operators would be very bad downstream (soft clips at the end of the alignment
    // are really insertions), so a cigar containing any of them is reported as a failure too
    for ( final CigarElement element : alignment.getCigar().getCigarElements() ) {
        final boolean softClip = element.getOperator() == CigarOperator.S;
        if ( softClip )
            return true;
    }

    return false;
}
/**
* Now that we have a completed bubble run a Smith-Waterman alignment to determine the cigar string for this bubble
* @param bubbleBytes the bytes that comprise the alternate allele path in this bubble
* @param fromVertex the vertex that marks the beginning of the reference path in this bubble (null indicates ref source vertex)
* @param toVertex the vertex that marks the end of the reference path in this bubble (null indicates ref sink vertex)
* @return the cigar string generated by running a SW alignment between the reference and alternate paths in this bubble
* Left align the given cigar sequentially. This is needed because AlignmentUtils doesn't accept cigars with more than one indel in them.
* This is a target of future work to incorporate and generalize into AlignmentUtils for use by others.
* @param cigar the cigar to left align
* @param refSeq the reference byte array
* @param readSeq the read byte array
* @param refIndex 0-based alignment start position on ref
* @param readIndex 0-based alignment start position on read
* @return the left-aligned cigar
*/
@Requires({"graph != null"})
@Ensures({"result != null"})
private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final T fromVertex, final T toVertex ) {
final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null);
final Cigar returnCigar = new Cigar();
// add padding to anchor ref/alt bases in the SW matrix
byte[] padding = STARTING_SW_ANCHOR_BYTES;
boolean goodAlignment = false;
SWPairwiseAlignment swConsensus = null;
while( !goodAlignment && padding.length < 1000 ) {
padding = ArrayUtils.addAll(padding, padding); // double the size of the padding each time
final byte[] reference = ArrayUtils.addAll( ArrayUtils.addAll(padding, refBytes), padding );
final byte[] alternate = ArrayUtils.addAll( ArrayUtils.addAll(padding, bubbleBytes), padding );
swConsensus = new SWPairwiseAlignment( reference, alternate, NEW_SW_PARAMETERS );
if( swConsensus.getAlignmentStart2wrt1() == 0 && !swConsensus.getCigar().toString().contains("S") && swConsensus.getCigar().getReferenceLength() == reference.length ) {
goodAlignment = true;
@Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"})
protected static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
final Cigar cigarToReturn = new Cigar();
Cigar cigarToAlign = new Cigar();
for (int i = 0; i < cigar.numCigarElements(); i++) {
final CigarElement ce = cigar.getCigarElement(i);
if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) {
cigarToAlign.add(ce);
final Cigar leftAligned = AlignmentUtils.leftAlignSingleIndel(cigarToAlign, refSeq, readSeq, refIndex, readIndex, false);
for ( final CigarElement toAdd : leftAligned.getCigarElements() ) { cigarToReturn.add(toAdd); }
refIndex += cigarToAlign.getReferenceLength();
readIndex += cigarToAlign.getReadLength();
cigarToAlign = new Cigar();
} else {
cigarToAlign.add(ce);
}
}
if( !goodAlignment ) {
returnCigar.add(new CigarElement(1, CigarOperator.N));
return returnCigar;
}
final Cigar swCigar = swConsensus.getCigar();
if( swCigar.numCigarElements() > MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW ) { // this bubble is too divergent from the reference
returnCigar.add(new CigarElement(1, CigarOperator.N));
} else {
for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) {
// now we need to remove the padding from the cigar string
int length = swCigar.getCigarElement(iii).getLength();
if( iii == 0 ) { length -= padding.length; }
if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; }
if( length > 0 ) {
returnCigar.add(new CigarElement(length, swCigar.getCigarElement(iii).getOperator()));
}
}
if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) {
throw new IllegalStateException("SmithWaterman cigar failure: " + (refBytes == null ? "-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar());
if( !cigarToAlign.isEmpty() ) {
for( final CigarElement toAdd : cigarToAlign.getCigarElements() ) {
cigarToReturn.add(toAdd);
}
}
return returnCigar;
final Cigar result = AlignmentUtils.consolidateCigar(cigarToReturn);
if( result.getReferenceLength() != cigar.getReferenceLength() )
throw new IllegalStateException("leftAlignCigarSequentially failed to produce a valid CIGAR. Reference lengths differ. Initial cigar " + cigar + " left aligned into " + result);
return result;
}
// class to keep track of the bubble state machine
private static class BubbleStateMachine<T extends BaseVertex> {
public boolean inBubble = false;
public byte[] bubbleBytes = null;
public T lastSeenReferenceNode = null;
public Cigar cigar = null;
public BubbleStateMachine( final Cigar initialCigar ) {
inBubble = false;
bubbleBytes = null;
lastSeenReferenceNode = null;
cigar = initialCigar;
}
}
/**
* Tests that this and other have the same score and vertices in the same order with the same seq
* @param other the other path to consider. Cannot be null
* @return true if this and path are equal, false otherwise
*/
public boolean equalScoreAndSequence(final Path<T> other) {
public boolean equalScoreAndSequence(final Path<T,E> other) {
if ( other == null ) throw new IllegalArgumentException("other cannot be null");
return getScore() == other.getScore() && equalSequence(other);
}
@ -432,7 +379,7 @@ public class Path<T extends BaseVertex> {
* @param other the other path to consider. Cannot be null
* @return true if this and path are equal, false otherwise
*/
public boolean equalSequence(final Path<T> other) {
public boolean equalSequence(final Path<T,E> other) {
final List<T> mine = getVertices();
final List<T> yours = other.getVertices();
if ( mine.size() == yours.size() ) { // hehehe

View File

@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.jgrapht.EdgeFactory;
import java.io.File;
import java.util.HashSet;
@ -61,7 +62,17 @@ import java.util.Set;
* @author: depristo
* @since 03/2013
*/
public final class SeqGraph extends BaseGraph<SeqVertex> {
public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
/**
 * Factory handed to the graph superclass constructor; every edge it creates is a
 * non-reference BaseEdge with multiplicity 1, whatever the two vertices are
 */
private static class MyEdgeFactory implements EdgeFactory<SeqVertex, BaseEdge> {
    @Override
    public BaseEdge createEdge(final SeqVertex sourceVertex, final SeqVertex targetVertex) {
        // the vertices play no role in how the edge is built
        final boolean isRef = false;
        final int multiplicity = 1;
        return new BaseEdge(isRef, multiplicity);
    }
}
private final static boolean PRINT_SIMPLIFY_GRAPHS = false;
/**
@ -82,7 +93,7 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
* Construct an empty SeqGraph
*/
public SeqGraph() {
super();
this(11);
}
/**
@ -94,7 +105,7 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
* @param kmer kmer
*/
public SeqGraph(final int kmer) {
super(kmer);
super(kmer, new MyEdgeFactory());
}
/**
@ -154,7 +165,6 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
didSomeWork |= new MergeCommonSuffices().transformUntilComplete();
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0);
didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete();
didSomeWork |= zipLinearChains();
return didSomeWork;
}
@ -289,8 +299,8 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
final BaseEdge inc = new BaseEdge(false, sharedWeightAmongEdges); // template to make .add function call easy
// update the incoming and outgoing edges to point to the new vertex
for( final BaseEdge edge : outEdges ) { addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge).add(inc)); }
for( final BaseEdge edge : inEdges ) { addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge).add(inc)); }
for( final BaseEdge edge : outEdges ) { addEdge(addedVertex, getEdgeTarget(edge), edge.copy().add(inc)); }
for( final BaseEdge edge : inEdges ) { addEdge(getEdgeSource(edge), addedVertex, edge.copy().add(inc)); }
removeAllVertices(linearChain);
return true;
@ -505,40 +515,4 @@ public final class SeqGraph extends BaseGraph<SeqVertex> {
}
}
}
/**
 * Merge headless configurations:
 *
 * Performs the transformation:
 *
 * { x + S_i + y -> Z }
 *
 * goes to:
 *
 * { x -> S_i -> y -> Z }
 *
 * for all nodes that match this configuration.
 *
 * Unlike the diamond transform, no shared top node is required: the candidates
 * are the source vertices that feed into a common bottom vertex.
 */
protected class MergeHeadlessIncomingSources extends VertexBasedTransformer {
    /**
     * Try to split the sequence shared by the source vertices feeding into bottom
     *
     * @param bottom the vertex whose incoming vertices are candidates for merging
     * @return false if bottom has fewer than two incoming vertices, if any incoming
     *         vertex is not a source or has more than one outgoing edge, or if the
     *         shared prefix is too short; true if dontModifyGraphEvenIfPossible() is
     *         set and the configuration matches; otherwise the result of the split
     */
    @Override
    boolean tryToTransform(final SeqVertex bottom) {
        final Set<SeqVertex> sources = incomingVerticesOf(bottom);

        // a single (or no) incoming vertex leaves nothing to merge
        if ( sources.size() < 2 )
            return false;

        // every incoming vertex must be a source that leads only to bottom
        for ( final SeqVertex candidate : sources ) {
            final boolean mergeable = isSource(candidate) && outDegreeOf(candidate) <= 1;
            if ( ! mergeable )
                return false;
        }

        // the configuration matches; stop here if we are only asked whether a transform is possible
        if ( dontModifyGraphEvenIfPossible() )
            return true;

        final SharedVertexSequenceSplitter sharedSplitter = new SharedVertexSequenceSplitter(SeqGraph.this, sources);
        final boolean enoughSharedSequence =
                sharedSplitter.meetsMinMergableSequenceForPrefix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES);
        return enoughSharedSequence && sharedSplitter.splitAndUpdate(null, bottom);
    }
}
}

View File

@ -49,6 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.Utils;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicInteger;
/**
* A graph vertex containing a sequence of bases and a unique ID that
@ -71,8 +72,9 @@ import java.util.Arrays;
* @since 03/2013
*/
public final class SeqVertex extends BaseVertex {
private static int idCounter = 0;
public final int id;
// Note that using an AtomicInteger is critical to allow multi-threaded HaplotypeCaller
private static final AtomicInteger idCounter = new AtomicInteger(0);
private int id = idCounter.getAndIncrement();
/**
* Create a new SeqVertex with sequence and the next available id
@ -80,7 +82,6 @@ public final class SeqVertex extends BaseVertex {
*/
public SeqVertex(final byte[] sequence) {
super(sequence);
this.id = idCounter++;
}
/**
@ -89,7 +90,6 @@ public final class SeqVertex extends BaseVertex {
*/
public SeqVertex(final String sequence) {
super(sequence);
this.id = idCounter++;
}
/**

View File

@ -88,13 +88,13 @@ public class SharedSequenceMerger {
for ( final SeqVertex prev : prevs ) {
for ( final BaseEdge prevIn : graph.incomingEdgesOf(prev) ) {
graph.addEdge(graph.getEdgeSource(prevIn), newV, new BaseEdge(prevIn));
graph.addEdge(graph.getEdgeSource(prevIn), newV, prevIn.copy());
edgesToRemove.add(prevIn);
}
}
for ( final BaseEdge e : graph.outgoingEdgesOf(v) ) {
graph.addEdge(newV, graph.getEdgeTarget(e), new BaseEdge(e));
graph.addEdge(newV, graph.getEdgeTarget(e), e.copy());
}
graph.removeAllVertices(prevs);

View File

@ -209,7 +209,7 @@ public class SharedVertexSequenceSplitter {
splitGraph.addEdge(remaining, suffixV, fromMid);
} else {
// prefix + suffix completely explain this node
splitGraph.addOrUpdateEdge(prefixV, suffixV, new BaseEdge(toMid).add(fromMid));
splitGraph.addOrUpdateEdge(prefixV, suffixV, toMid.copy().add(fromMid));
}
}
}
@ -323,7 +323,7 @@ public class SharedVertexSequenceSplitter {
} else {
// schedule edge for removal, and return a freshly allocated one for our graph to use
edgesToRemove.add(e);
return new BaseEdge(e);
return e.copy();
}
}
}

View File

@ -0,0 +1,121 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnVertex;
import org.broadinstitute.sting.utils.Utils;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * A DeBruijnVertex that supports multiple copies of the same kmer
 *
 * This is implemented through the same mechanism as SeqVertex, where each
 * created MultiDeBruijnVertex has a unique id assigned upon creation. Two
 * MultiDeBruijnVertex are equal iff they have the same ID
 *
 * User: depristo
 * Date: 4/17/13
 * Time: 3:20 PM
 */
final class MultiDeBruijnVertex extends DeBruijnVertex {
    /** Compile-time switch: when false, addRead() records nothing and additionalInfo() is always empty */
    private final static boolean KEEP_TRACK_OF_READS = false;

    // Note that using an AtomicInteger is critical to allow multi-threaded HaplotypeCaller
    private static final AtomicInteger idCounter = new AtomicInteger(0);

    /**
     * Identity of this vertex, taken from the shared counter at construction time.
     *
     * Final because equals() and hashCode() are both defined purely in terms of this
     * value: it must never change while the vertex is held in a hash-based collection.
     */
    private final int id = idCounter.getAndIncrement();

    /** Names recorded through addRead(); only populated when KEEP_TRACK_OF_READS is true */
    private final List<String> reads = new LinkedList<String>();

    /**
     * Create a new MultiDeBruijnVertex with kmer sequence
     * @param sequence the kmer sequence
     */
    MultiDeBruijnVertex(byte[] sequence) {
        super(sequence);
    }

    /**
     * Two vertices are equal only when they are of this exact class and carry the same id;
     * the kmer sequence itself is deliberately not compared.
     *
     * @param o the object to compare against, may be null
     * @return true if o is a MultiDeBruijnVertex with the same id as this one
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        final MultiDeBruijnVertex that = (MultiDeBruijnVertex) o;
        return id == that.id;
    }

    /**
     * @return the id of this vertex, consistent with equals()
     */
    @Override
    public int hashCode() {
        return id;
    }

    @Override
    public String toString() {
        return "MultiDeBruijnVertex_id_" + id + "_seq_" + getSequenceString();
    }

    /**
     * Add name information to this vertex for debugging
     *
     * This information will be captured as a list of strings, and displayed in DOT if this
     * graph is written out to disk
     *
     * This functionality is only enabled when KEEP_TRACK_OF_READS is true
     *
     * @param name a non-null string
     * @throws IllegalArgumentException if name is null (checked even when tracking is disabled)
     */
    protected void addRead(final String name) {
        if ( name == null ) throw new IllegalArgumentException("name cannot be null");
        if ( KEEP_TRACK_OF_READS ) reads.add(name);
    }

    /**
     * Extra debugging text for this vertex
     *
     * @return the empty string when read tracking is disabled or when one of the recorded
     *         names is "ref"; otherwise "__" followed by the recorded names joined with ","
     */
    @Override
    public String additionalInfo() {
        if ( ! KEEP_TRACK_OF_READS || reads.contains("ref") )
            return "";
        return "__" + Utils.join(",", reads);
    }
}

View File

@ -0,0 +1,162 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
/**
 * Local assembly engine that builds one read threading graph per configured kmer size
 * and converts each into a SeqGraph.
 */
public class ReadThreadingAssembler extends LocalAssemblyEngine {
    private final static Logger logger = Logger.getLogger(ReadThreadingAssembler.class);

    private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128;

    /** The kmer sizes to try when building the graph; assemble() builds one graph per entry, in list order. */
    private final List<Integer> kmerSizes;
    private final int maxAllowedPathsForReadThreadingAssembler;

    /** When true, graphs with more than maxAllowedPathsForReadThreadingAssembler paths are dropped. Never set in this class. */
    private boolean requireReasonableNumberOfPaths = false;
    protected boolean removePathsNotConnectedToRef = true;
    private boolean justReturnRawGraph = false;

    /** for testing only */
    public ReadThreadingAssembler() {
        this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25));
    }

    /**
     * Create a new assembler
     *
     * @param maxAllowedPathsForReadThreadingAssembler the maximum number of paths, passed on to the
     *                                                 superclass and used by the reasonable-paths check
     * @param kmerSizes the kmer sizes to build graphs for, cannot be null
     * @throws IllegalArgumentException if kmerSizes is null
     */
    public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
        super(maxAllowedPathsForReadThreadingAssembler);
        // fail here rather than with a NullPointerException in the middle of assemble()
        if ( kmerSizes == null ) throw new IllegalArgumentException("kmerSizes cannot be null");
        this.kmerSizes = kmerSizes;
        this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler;
    }

    /** for testing purposes */
    protected void setJustReturnRawGraph(boolean justReturnRawGraph) {
        this.justReturnRawGraph = justReturnRawGraph;
    }

    /**
     * Assemble the reads into sequence graphs, one attempt per configured kmer size
     *
     * @param reads the reads to thread through each graph
     * @param refHaplotype the reference haplotype, whose bases are added to each graph first
     * @return the cleaned-up graphs that were kept; if justReturnRawGraph is set, a singleton
     *         list holding the uncleaned graph for the first kmer size instead
     */
    @Override
    public List<SeqGraph> assemble( final List<GATKSAMRecord> reads, final Haplotype refHaplotype) {
        final List<SeqGraph> graphs = new LinkedList<>();

        // NOTE(review): the debug .dot file names below do not include the kmer size, so with
        // several kmer sizes every iteration writes to the same paths -- confirm this is intended
        for ( final int kmerSize : kmerSizes ) {
            final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);

            // add the reference sequence to the graph
            rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);

            // Next pull kmers out of every read and throw them on the graph
            for( final GATKSAMRecord read : reads ) {
                rtgraph.addRead(read);
            }

            // actually build the read threading graph
            rtgraph.buildGraphIfNecessary();
            if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.0.raw_readthreading_graph.dot"), pruneFactor);

            // go through and prune all of the chains where all edges have <= pruneFactor.  This must occur
            // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
            // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
            rtgraph.pruneLowWeightChains(pruneFactor);

            // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
            // we can recover them by merging some N bases from the chain back into the reference uniquely, for
            // N < kmerSize
            if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();

            // remove all heading and trailing paths
            if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();

            if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"), pruneFactor);

            final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();

            // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
            if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph);

            if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
            if ( debugGraphTransformations ) initialSeqGraph.printGraph(new File("sequenceGraph.0.2.initial_seqgraph.dot"), pruneFactor);
            initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't think this is possible by construction

            // cleanup may give back null, in which case this kmer size contributes no graph
            final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
            if ( seqGraph != null ) {
                if ( ! requireReasonableNumberOfPaths || reasonableNumberOfPaths(seqGraph) ) {
                    graphs.add(seqGraph);
                }
            }
        }

        return graphs;
    }

    /**
     * Did we find a reasonable number of paths in this graph?
     *
     * Enumerates up to 100000 best paths through the graph and logs how many were found.
     *
     * @param graph the graph whose paths should be counted
     * @return true if the number of paths found is at most maxAllowedPathsForReadThreadingAssembler
     */
    private boolean reasonableNumberOfPaths(final SeqGraph graph) {
        final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<SeqVertex,BaseEdge>(false);
        final List<Path<SeqVertex,BaseEdge>> allPaths = pathFinder.getKBestPaths(graph, 100000);
        logger.info("Found " + allPaths.size() + " paths through " + graph + " with maximum " + maxAllowedPathsForReadThreadingAssembler);
        return allPaths.size() <= maxAllowedPathsForReadThreadingAssembler;
    }

    @Override
    public String toString() {
        return "ReadThreadingAssembler{" +
                "kmerSizes=" + kmerSizes +
                '}';
    }
}

View File

@ -0,0 +1,640 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.collections.PrimitivePair;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.jgrapht.EdgeFactory;
import java.io.File;
import java.util.*;
public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSampleEdge> {
/**
 * Edge factory given to the base graph: every edge it produces is a non-reference edge
 * with a multiplicity of 1.
 */
private static class MyEdgeFactory implements EdgeFactory<MultiDeBruijnVertex, MultiSampleEdge> {
    /**
     * Create the default edge; the two endpoint vertices are not consulted.
     */
    @Override
    public MultiSampleEdge createEdge(final MultiDeBruijnVertex sourceVertex, final MultiDeBruijnVertex targetVertex) {
        final boolean isRef = false;
        final int multiplicity = 1;
        return new MultiSampleEdge(isRef, multiplicity);
    }
}
/** log4j logger for this class */
private final static Logger logger = Logger.getLogger(ReadThreadingGraph.class);
/** sample name used for sequences that are added without an explicit sample */
private final static String ANONYMOUS_SAMPLE = "XXX_UNNAMED_XXX";
/** debugging switch: when true, buildGraphIfNecessary() prints the graph to a .dot file after threading each sequence */
private final static boolean WRITE_GRAPH = false;
/** debugging switch: when true, buildGraphIfNecessary() logs the kmer size being used */
private final static boolean DEBUG_NON_UNIQUE_CALC = false;
/** for debugging info printing: running number used in the names of the .dot files written when WRITE_GRAPH is true */
private static int counter = 0;
/** we require at least this many bases to be uniquely matching to merge a dangling tail */
private final static int MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL = 5;
/**
 * Sequences added for read threading before we've actually built the graph, keyed by sample name
 * (insertion ordered). Emptied again once the graph has been built.
 */
private final Map<String, List<SequenceForKmers>> pending = new LinkedHashMap<String, List<SequenceForKmers>>();
/**
 * A set of non-unique kmers that cannot be used as merge points in the graph.
 * Null until buildGraphIfNecessary() has computed it.
 */
private Set<Kmer> nonUniqueKmers;
/**
 * A map from kmers -> their corresponding vertex in the graph
 */
private Map<Kmer, MultiDeBruijnVertex> uniqueKmers = new LinkedHashMap<Kmer, MultiDeBruijnVertex>();
/**
 * The kmer size used to build this graph; the constructor rejects values < 1
 */
final int kmerSize;
/** when true, each vertex records the names of the sequences threaded through it (see threadSequence) */
final boolean debugGraphTransformations;
/** bases with a quality below this value split a read into separate sequences in addRead() */
final byte minBaseQualityToUseInAssembly;
/** when true, threadSequence() also increases the counts of edges leading into the starting vertex (see increaseCountsInMatchedKmers) */
protected boolean increaseCountsBackwards = true;
/** when true, the backwards count increase also continues through vertices whose in-degree is not 1 */
protected boolean increaseCountsThroughBranches = false; // this may increase the branches without bounds
// --------------------------------------------------------------------------------
// state variables, initialized in resetToInitialState()
// --------------------------------------------------------------------------------
/** the first kmer of the reference sequence; set when the reference is threaded, null before that */
private Kmer refSource;
/** true once buildGraphIfNecessary() has threaded the pending sequences; no sequences may be added afterwards */
private boolean alreadyBuilt;
/** the bases of the reference sequence; set when the reference is threaded, null before that */
byte[] refSeq;
/** refKmers[i] is the vertex recorded for the reference kmer that ends at offset i of refSeq, or null if none was recorded */
MultiDeBruijnVertex[] refKmers;
/**
 * Create a graph with a kmer size of 25, graph-transformation debugging turned off and a
 * minimum base quality of 6
 */
public ReadThreadingGraph() {
this(25, false, (byte)6);
}
/**
 * Create a graph with the given kmer size, graph-transformation debugging turned off and a
 * minimum base quality of 6
 * @param kmerSize must be >= 1
 */
public ReadThreadingGraph(final int kmerSize) {
this(kmerSize, false, (byte)6);
}
/**
 * Create a new ReadThreadingGraph using kmerSize for matching
 * @param kmerSize must be >= 1
 * @param debugGraphTransformations if true, vertices record the names of the sequences threaded through them
 * @param minBaseQualityToUseInAssembly bases with a lower quality are not used when adding reads
 * @throws IllegalArgumentException if kmerSize < 1
 */
protected ReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly) {
super(kmerSize, new MyEdgeFactory());
// the super() call must come first, so kmerSize can only be validated after the base graph has seen it
if ( kmerSize < 1 ) throw new IllegalArgumentException("bad minkKmerSize " + kmerSize);
this.kmerSize = kmerSize;
this.debugGraphTransformations = debugGraphTransformations;
this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
resetToInitialState();
}
/**
 * Reset the bookkeeping of this assembler (pending sequences, kmer tables, reference information
 * and the built flag) so that another assembly can be created from a different set of reads
 */
private void resetToInitialState() {
    // nothing has been threaded yet
    alreadyBuilt = false;

    // forget everything recorded about the reference
    refSource = null;
    refSeq = null;
    refKmers = null;

    // drop queued sequences and the kmer tables
    pending.clear();
    uniqueKmers.clear();
    nonUniqueKmers = null;
}
/**
 * Add all of the bases in sequence to the graph, under the name "anonymous" and without per-base counts
 * @param sequence a non-null sequence
 * @param isRef is this the reference sequence?
 */
protected void addSequence(final byte[] sequence, final boolean isRef) {
    final String defaultName = "anonymous";
    final int[] noCounts = null; // every base counts as a single observation
    addSequence(defaultName, sequence, noCounts, isRef);
}
/**
 * Add all bases in sequence to this graph, filed under the anonymous sample
 *
 * @see #addSequence(String, String, byte[], int, int, int[], boolean) for full information
 */
public void addSequence(final String seqName, final byte[] sequence, final int[] counts, final boolean isRef) {
    // use the whole sequence: [0, sequence.length)
    final int firstBase = 0;
    final int endExclusive = sequence.length;
    addSequence(seqName, ANONYMOUS_SAMPLE, sequence, firstBase, endExclusive, counts, isRef);
}
/**
 * Queue bases in sequence for threading into this graph. The sequence is only stored here; it is
 * threaded into the graph when the graph is built.
 *
 * @param seqName a useful seqName for this read, for debugging purposes
 * @param sampleName the name of the sample this sequence is filed under
 * @param sequence non-null sequence of bases
 * @param start the first base offset in sequence that we should use for constructing the graph using this sequence, inclusive
 * @param stop the last base offset in sequence that we should use for constructing the graph using this sequence, exclusive
 * @param counts a vector of counts for each bases, indicating how many times that base was observed in the sequence.
 *               This allows us to support reduced reads in the ReadThreadingAssembler. Can be null, meaning that
 *               each base is only observed once. If not null, must have length == sequence.length.
 * @param isRef is this the reference sequence.
 * @throws IllegalStateException if the graph has already been built
 */
public void addSequence(final String seqName, final String sampleName, final byte[] sequence, final int start, final int stop, final int[] counts, final boolean isRef) {
    if ( alreadyBuilt ) {
        throw new IllegalStateException("Graph already built");
    }

    // make sure this sample has a list to collect its sequences in
    if ( pending.get(sampleName) == null ) {
        pending.put(sampleName, new LinkedList<SequenceForKmers>());
    }

    // the arguments themselves are validated by the SequenceForKmers constructor
    final List<SequenceForKmers> forThisSample = pending.get(sampleName);
    forThisSample.add(new SequenceForKmers(seqName, sequence, start, stop, counts, isRef));
}
/**
 * Look up the count to use for the kmer that begins at kmerStart in seqForKmers
 *
 * @param seqForKmers a non-null sequence for kmers object
 * @param kmerStart the position where the kmer starts in sequence
 * @return the count stored for the last base of the kmer, i.e. the base at kmerStart + kmerSize - 1
 */
private int getCountGivenKmerStart(final SequenceForKmers seqForKmers, final int kmerStart) {
    final int lastBaseOfKmer = kmerStart + kmerSize - 1;
    return seqForKmers.getCount(lastBaseOfKmer);
}
/**
 * Thread sequence seqForKmers through the current graph, updating the graph as appropriate.
 *
 * Does nothing if no starting vertex can be found for the sequence. When the sequence is the
 * reference, the reference bookkeeping (refSource, refSeq, refKmers) is filled in along the way.
 *
 * @param seqForKmers a non-null sequence
 * @throws IllegalStateException if a second reference sequence is threaded
 */
private void threadSequence(final SequenceForKmers seqForKmers) {
    final Pair<MultiDeBruijnVertex,Integer> anchor = findStart(seqForKmers);
    if ( anchor == null ) {
        // nowhere to start threading from
        return;
    }

    final MultiDeBruijnVertex firstVertex = anchor.getFirst();
    final int firstKmerStart = anchor.getSecond();
    final boolean threadingRef = seqForKmers.isRef;

    // increase the counts of all edges incoming into the starting vertex supported by going back in sequence
    if ( increaseCountsBackwards ) {
        increaseCountsInMatchedKmers(seqForKmers, firstVertex, firstVertex.getSequence(), kmerSize - 2);
    }

    if ( debugGraphTransformations ) {
        firstVertex.addRead(seqForKmers.name);
    }

    // keep track of information about the reference kmers for merging dangling tails
    if ( threadingRef ) {
        if ( refSource != null ) {
            throw new IllegalStateException("Found two refSources! prev " + refSource + " new is " + firstVertex);
        }
        refSource = new Kmer(seqForKmers.sequence, seqForKmers.start, kmerSize);
        refSeq = seqForKmers.sequence;
        refKmers = new MultiDeBruijnVertex[refSeq.length];
        for ( int refPos = 0; refPos < kmerSize; refPos++ ) {
            refKmers[refPos] = null;
        }
    }

    // walk the remaining kmer start positions, extending the graph by one base at each of them
    final int lastKmerStart = seqForKmers.stop - kmerSize;
    MultiDeBruijnVertex current = firstVertex;
    int kmerStart = firstKmerStart + 1;
    while ( kmerStart <= lastKmerStart ) {
        final int observations = getCountGivenKmerStart(seqForKmers, kmerStart);
        current = extendChainByOne(current, seqForKmers.sequence, kmerStart, observations, threadingRef);

        if ( debugGraphTransformations ) {
            current.addRead(seqForKmers.name);
        }

        // remember the vertex of the reference kmer ending at this offset, for merging dangling tails
        if ( threadingRef ) {
            refKmers[kmerStart + kmerSize - 1] = current;
        }

        kmerStart++;
    }
}
/**
 * Attempt to attach vertex with out-degree == 0 to the graph by finding a unique matching kmer to the reference.
 *
 * Vertices whose kmer is in the set of non-unique kmers are never attached.
 *
 * @param vertex the vertex to recover
 * @return 1 if an edge from vertex to a merge point was added, 0 otherwise
 * @throws IllegalStateException if vertex has outgoing edges
 */
protected int recoverDanglingChain(final MultiDeBruijnVertex vertex) {
    if ( outDegreeOf(vertex) != 0 ) {
        throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0");
    }

    final byte[] tailBases = vertex.getSequence();

    // don't attempt to fix non-unique kmers!
    if ( nonUniqueKmers.contains(new Kmer(tailBases)) ) {
        return 0;
    }

    final MultiDeBruijnVertex target = danglingTailMergePoint(tailBases);
    if ( target == null ) {
        return 0;
    }

    addEdge(vertex, target, new MultiSampleEdge(false, 1));
    return 1;
}
/**
 * Find a unique merge point for kmer in the reference sequence.
 *
 * The longest unique suffix match of kmer against refSeq must be at least
 * MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL long, and the reference position right after the match
 * must lie inside refKmers; the vertex stored there is the merge point. The merge is rejected when the
 * matched suffix is among the non-unique kmers of the same length.
 *
 * NOTE(review): the non-unique kmers are recomputed from the pending sequences, but
 * buildGraphIfNecessary() clears pending before dangling tails can be recovered -- confirm that the
 * uniqueness test below still sees any sequences at this point.
 *
 * @param kmer the full kmer of the dangling tail
 * @return a vertex appropriate to merge kmer into, or null if none could be found
 */
private MultiDeBruijnVertex danglingTailMergePoint(final byte[] kmer) {
    final PrimitivePair.Int match = GraphUtils.findLongestUniqueSuffixMatch(refSeq, kmer);

    final boolean usable = match != null
            && match.second >= MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL
            && match.first + 1 < refKmers.length;
    if ( ! usable ) {
        return null;
    }

    final int matchLength = match.second;
    final MultiDeBruijnVertex candidate = refKmers[match.first + 1];

    // the matched suffix must also be unique among the kmers of the same length
    final Set<Kmer> nonUniquesAtLength = determineKmerSizeAndNonUniques(matchLength, matchLength).nonUniques;
    final Kmer matchedSuffix = new Kmer(kmer, kmer.length - matchLength, matchLength);
    return nonUniquesAtLength.contains(matchedSuffix) ? null : candidate;
}
/**
 * Build the read threaded assembly graph from the sequences added so far, unless that has already
 * been done. Afterwards the pending sequences are discarded and no further sequences can be added.
 */
public void buildGraphIfNecessary() {
    if ( alreadyBuilt ) {
        return;
    }

    // capture the kmers that are not unique at our kmer size; they cannot serve as merge points
    nonUniqueKmers = determineKmerSizeAndNonUniques(kmerSize, kmerSize).nonUniques;

    if ( DEBUG_NON_UNIQUE_CALC ) {
        logger.info("using " + kmerSize + " kmer size for this assembly with the following non-uniques");
    }

    // thread the pending sequences into the graph, one sample at a time
    for ( final List<SequenceForKmers> oneSample : pending.values() ) {
        for ( final SequenceForKmers seq : oneSample ) {
            threadSequence(seq);
            if ( WRITE_GRAPH ) {
                printGraph(new File("threading." + counter++ + "." + seq.name.replace(" ", "_") + ".dot"), 0);
            }
        }

        // this sample is done: flush the single sample edge values from the graph
        for ( final MultiSampleEdge edge : edgeSet() ) {
            edge.flushSingleSampleMultiplicity();
        }
    }

    pending.clear();
    alreadyBuilt = true;
}
/**
 * Try to re-attach every dangling tail of the graph, i.e. every vertex without outgoing edges that
 * is not the reference sink.
 *
 * @throws IllegalStateException if the graph has not been built yet
 */
public void recoverDanglingTails() {
    if ( ! alreadyBuilt ) {
        throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
    }

    // the two tallies only feed the debug message below
    int tailsSeen = 0;
    int tailsRecovered = 0;
    for ( final MultiDeBruijnVertex candidate : vertexSet() ) {
        final boolean dangling = outDegreeOf(candidate) == 0 && ! isRefNodeAndRefSink(candidate);
        if ( ! dangling ) {
            continue;
        }
        tailsSeen++;
        tailsRecovered += recoverDanglingChain(candidate);
    }
    //logger.info("Recovered " + tailsRecovered + " of " + tailsSeen + " dangling tails");
}
/** pairs a kmer size with the set of kmers that were found to be non-unique at that size */
private static class NonUniqueResult {
    final Set<Kmer> nonUniques;
    final int kmerSize;

    private NonUniqueResult(final Set<Kmer> nonUniques, final int kmerSize) {
        this.kmerSize = kmerSize;
        this.nonUniques = nonUniques;
    }
}
/**
 * Compute the smallest kmer size >= minKmerSize and <= maxKmerSize that has no non-unique kmers
 * among all sequences currently pending for this graph. If every size up to maxKmerSize has
 * non-unique kmers, the result describes maxKmerSize.
 *
 * A sequence without non-unique kmers at one size is not examined again at larger sizes.
 *
 * @param minKmerSize the minimum kmer size to consider when constructing the graph
 * @param maxKmerSize the maximum kmer size to consider
 * @return a non-null NonUniqueResult
 */
protected NonUniqueResult determineKmerSizeAndNonUniques(final int minKmerSize, final int maxKmerSize) {
    final Collection<SequenceForKmers> stillNonUnique = getAllPendingSequences();
    final Set<Kmer> nonUniques = new HashSet<Kmer>();

    int candidateSize = minKmerSize;
    while ( candidateSize <= maxKmerSize ) {
        // start from scratch for this size
        nonUniques.clear();

        // only sequences that had non-unique kmers at the previous size are still in the collection
        for ( final Iterator<SequenceForKmers> seqs = stillNonUnique.iterator(); seqs.hasNext(); ) {
            final Collection<Kmer> fromThisSeq = determineNonUniqueKmers(seqs.next(), candidateSize);
            if ( fromThisSeq.isEmpty() ) {
                // clean at this size, so no need to look at it again
                seqs.remove();
            } else {
                nonUniques.addAll(fromThisSeq);
            }
        }

        if ( nonUniques.isEmpty() ) {
            // this size produces no non-unique kmers, so it is the one to use
            break;
        }
        candidateSize++;
    }

    // when the loop runs to completion candidateSize ends up at max + 1, hence the min()
    return new NonUniqueResult(nonUniques, Math.min(candidateSize, maxKmerSize));
}
/**
 * Collect the pending sequences of all samples into a single, newly allocated collection.
 * Removing elements from the result does not affect the pending sequences themselves.
 * @return non-null Collection
 */
private Collection<SequenceForKmers> getAllPendingSequences() {
    final LinkedList<SequenceForKmers> everything = new LinkedList<SequenceForKmers>();
    final Iterator<List<SequenceForKmers>> perSample = pending.values().iterator();
    while ( perSample.hasNext() ) {
        everything.addAll(perSample.next());
    }
    return everything;
}
/**
 * Get the collection of kmers of size kmerSize that occur at least twice within seqForKmers
 *
 * NOTE(review): kmers are taken from offset 0 of the underlying sequence rather than from
 * seqForKmers.start -- confirm whether bases before start are meant to take part.
 *
 * @param seqForKmers a sequence to get kmers from
 * @param kmerSize the size of the kmers
 * @return a non-null collection of non-unique kmers in sequence
 */
private Collection<Kmer> determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) {
    final KMerCounter occurrences = new KMerCounter(kmerSize);

    // tally every kmer whose last base still lies before stop
    final int lastKmerStart = seqForKmers.stop - kmerSize;
    int kmerStart = 0;
    while ( kmerStart <= lastKmerStart ) {
        occurrences.addKmer(new Kmer(seqForKmers.sequence, kmerStart, kmerSize), 1);
        kmerStart++;
    }

    return occurrences.getKmersWithCountsAtLeast(2);
}
/**
 * Convert this kmer graph to a simple sequence graph, building this graph first if necessary.
 *
 * Every kmer vertex is mapped to one SeqVertex holding the vertex's additional sequence, and every
 * edge is copied between the corresponding SeqVertex objects with its reference flag and multiplicity.
 *
 * @return a newly allocated SequenceGraph
 */
// TODO -- should override base class method
public SeqGraph convertToSequenceGraph() {
    buildGraphIfNecessary();

    final SeqGraph result = new SeqGraph(kmerSize);
    final Map<MultiDeBruijnVertex, SeqVertex> kmerToSeqVertex = new HashMap<MultiDeBruijnVertex, SeqVertex>();

    // first pass: one seq vertex per kmer vertex
    for ( final MultiDeBruijnVertex kmerVertex : vertexSet() ) {
        final boolean source = isSource(kmerVertex);
        final SeqVertex seqVertex = new SeqVertex(kmerVertex.getAdditionalSequence(source));
        seqVertex.setAdditionalInfo(kmerVertex.additionalInfo());
        result.addVertex(seqVertex);
        kmerToSeqVertex.put(kmerVertex, seqVertex);
    }

    // second pass: recreate each edge between the mapped vertices
    for ( final MultiSampleEdge edge : edgeSet() ) {
        final SeqVertex from = kmerToSeqVertex.get(getEdgeSource(edge));
        final SeqVertex to = kmerToSeqVertex.get(getEdgeTarget(edge));
        result.addEdge(from, to, new BaseEdge(edge.isRef(), edge.getMultiplicity()));
    }

    return result;
}
/**
 * Walk backwards from vertex along incoming edges, increasing the multiplicity of each edge whose
 * source vertex has a suffix equal to originalKmer[offset], and recursing from that source with
 * offset - 1. The walk stops when offset reaches -1.
 *
 * Unless increaseCountsThroughBranches is set, edges are only followed while the current vertex
 * has exactly one incoming edge.
 *
 * NOTE(review): the increment is seqForKmers.getCount(offset), where offset indexes originalKmer --
 * confirm that this is the intended position in the sequence.
 *
 * @param seqForKmers the sequence providing the counts
 * @param vertex the vertex whose incoming edges are examined
 * @param originalKmer the bases the source suffixes are compared against
 * @param offset the index into originalKmer to compare at this step
 */
private void increaseCountsInMatchedKmers(final SequenceForKmers seqForKmers,
                                          final MultiDeBruijnVertex vertex,
                                          final byte[] originalKmer,
                                          final int offset) {
    if ( offset == -1 ) {
        return;
    }

    for ( final MultiSampleEdge incoming : incomingEdgesOf(vertex) ) {
        final MultiDeBruijnVertex predecessor = getEdgeSource(incoming);

        if ( predecessor.getSuffix() != originalKmer[offset] ) {
            // this predecessor doesn't spell the expected base
            continue;
        }
        if ( ! increaseCountsThroughBranches && inDegreeOf(vertex) != 1 ) {
            // don't walk back through a branch point
            continue;
        }

        incoming.incMultiplicity(seqForKmers.getCount(offset));
        increaseCountsInMatchedKmers(seqForKmers, predecessor, originalKmer, offset - 1);
    }
}
/**
 * Find vertex and its position in seqForKmers where we should start assembling seqForKmers.
 *
 * The reference always starts at position 0; any other sequence starts at the first position
 * reported by findUniqueStartPosition.
 *
 * @param seqForKmers the sequence we want to thread into the graph
 * @return a pair of the starting vertex and its position in seqForKmer, or null if no start position exists
 */
private Pair<MultiDeBruijnVertex, Integer> findStart(final SequenceForKmers seqForKmers) {
    final int startPos;
    if ( seqForKmers.isRef ) {
        startPos = 0;
    } else {
        startPos = findUniqueStartPosition(seqForKmers.sequence, seqForKmers.start, seqForKmers.stop);
    }

    if ( startPos == -1 ) {
        return null;
    }
    return getOrCreateKmerVertex(seqForKmers.sequence, startPos, true);
}
/**
 * Find the first position in sequence whose kmer is a key of the uniqueKmers map
 *
 * NOTE(review): positions are scanned while i < stop - kmerSize, so the kmer that starts exactly
 * at stop - kmerSize is never tried -- confirm that the strict bound is intended.
 *
 * @param sequence the sequence of bases
 * @param start the first position to try
 * @param stop the end of the usable bases in sequence; positions at or beyond stop - kmerSize are not tried
 * @return the index into sequence that begins a unique kmer of size kmerSize, or -1 if none could be found
 */
private int findUniqueStartPosition(final byte[] sequence, final int start, final int stop) {
    final int limit = stop - kmerSize;
    int pos = start;
    while ( pos < limit ) {
        if ( uniqueKmers.containsKey(new Kmer(sequence, pos, kmerSize)) ) {
            return pos;
        }
        pos++;
    }
    return -1;
}
/**
 * Get the vertex for the kmer in sequence starting at start, creating a new vertex when the kmer
 * has no unique vertex yet
 * @param sequence the sequence
 * @param start the position of the kmer start
 * @param allowRefSource if true, we will allow matches to the kmer that represents the reference starting kmer
 * @return a non-null pair of the vertex and start
 */
private Pair<MultiDeBruijnVertex, Integer> getOrCreateKmerVertex(final byte[] sequence, final int start, final boolean allowRefSource) {
    final Kmer kmer = new Kmer(sequence, start, kmerSize);
    final MultiDeBruijnVertex existing = getUniqueKmerVertex(kmer, allowRefSource);
    final MultiDeBruijnVertex result = existing != null ? existing : createVertex(kmer);
    return new Pair<MultiDeBruijnVertex, Integer>(result, start);
}
/**
 * Get the unique vertex for kmer, or null if not possible.
 *
 * @param kmer the kmer to look up in the map of unique kmers
 * @param allowRefSource if true, we will allow kmer to match the reference source vertex
 * @return a vertex for kmer, or null if it has no unique vertex or is the disallowed reference source
 */
private MultiDeBruijnVertex getUniqueKmerVertex(final Kmer kmer, final boolean allowRefSource) {
    final boolean isForbiddenRefSource = ! allowRefSource && kmer.equals(refSource);
    return isForbiddenRefSource ? null : uniqueKmers.get(kmer);
}
/**
 * Create a new vertex for kmer and add it to the graph. The vertex is also registered in the
 * uniqueKmers map, unless kmer is a non-unique kmer or already has an entry there.
 *
 * @param kmer the kmer we want to create a vertex for
 * @return the non-null created vertex
 * @throws IllegalStateException if adding the vertex did not grow the graph by exactly one vertex
 */
private MultiDeBruijnVertex createVertex(final Kmer kmer) {
    final MultiDeBruijnVertex created = new MultiDeBruijnVertex(kmer.bases());

    // adding a duplicate would leave the vertex count unchanged (would be a bug)
    final int expectedSize = vertexSet().size() + 1;
    addVertex(created);
    if ( vertexSet().size() != expectedSize ) {
        throw new IllegalStateException("Adding vertex " + created + " to graph didn't increase the graph size");
    }

    // remember the vertex as the unique home of this kmer, if it is in fact unique
    final boolean unique = ! nonUniqueKmers.contains(kmer);
    if ( unique && ! uniqueKmers.containsKey(kmer) ) { // TODO -- not sure this last test is necessary
        uniqueKmers.put(kmer, created);
    }

    return created;
}
/**
 * Workhorse routine of the assembler. Given a sequence whose last vertex is anchored in the graph, extend
 * the graph one bp according to the bases in sequence.
 *
 * If prevVertex already has an outgoing edge to a vertex ending in the next base, that edge's
 * multiplicity is increased by count. Otherwise prevVertex is connected either to the unique vertex
 * of the kmer (merging back into the graph) or to a newly created vertex.
 *
 * @param prevVertex a non-null vertex where sequence was last anchored in the graph
 * @param sequence the sequence we're threading through the graph
 * @param kmerStart the start of the current kmer in graph we'd like to add
 * @param count the number of observations of this kmer in graph (can be > 1 for reduced reads)
 * @param isRef is this the reference sequence?
 * @return a non-null vertex connecting prevVertex to in the graph based on sequence
 * @throws IllegalStateException if the reference sequence would merge into an existing unique vertex
 */
private MultiDeBruijnVertex extendChainByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final int count, final boolean isRef) {
    final int lastBasePos = kmerStart + kmerSize - 1;

    // follow an existing edge if one already spells out our next base
    for ( final MultiSampleEdge candidate : outgoingEdgesOf(prevVertex) ) {
        final MultiDeBruijnVertex successor = getEdgeTarget(candidate);
        if ( successor.getSuffix() != sequence[lastBasePos] ) {
            continue;
        }
        candidate.incMultiplicity(count);
        return successor;
    }

    // no outgoing edge had our suffix base, so look for an opportunity to merge back in
    final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize);
    final MultiDeBruijnVertex mergeTarget = getUniqueKmerVertex(kmer, false);
    if ( isRef && mergeTarget != null ) {
        throw new IllegalStateException("Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + mergeTarget);
    }

    // either use our unique merge vertex, or create a new one in the chain
    final MultiDeBruijnVertex next;
    if ( mergeTarget != null ) {
        next = mergeTarget;
    } else {
        next = createVertex(kmer);
    }
    addEdge(prevVertex, next, new MultiSampleEdge(isRef, count));
    return next;
}
/**
 * Add the high quality stretches of read to this graph.
 *
 * The read is cut at every base whose quality is below minBaseQualityToUseInAssembly. Each maximal
 * run of bases at or above that quality that is at least kmerSize bases long is added as a separate,
 * non-reference sequence under the sample of the read's read group.
 *
 * @param read a non-null read
 */
protected void addRead(final GATKSAMRecord read) {
    final byte[] sequence = read.getReadBases();
    final byte[] qualities = read.getBaseQualities();
    final int[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced

    int lastGood = -1; // the start of the current run of good bases, or -1 if we are not inside a run
    for ( int end = 0; end <= sequence.length; end++ ) {
        if ( end == sequence.length || qualities[end] < minBaseQualityToUseInAssembly ) {
            // the run of good bases, if there is one, covers [start, end): end is either a bad base or
            // one past the last base, and is therefore exclusive
            final int start = lastGood;
            final int len = end - start;

            if ( start != -1 && len >= kmerSize ) {
                // the run is long enough to contain at least one full kmer, so add it to the graph
                final String name = read.getReadName() + "_" + start + "_" + end;
                addSequence(name, read.getReadGroup().getSample(), sequence, start, end, reducedReadCounts, false);
            }

            lastGood = -1; // reset: we are no longer inside a run of good bases
        } else if ( lastGood == -1 ) {
            lastGood = end; // we're at a good base that begins a new run
        }
    }
}
/**
 * Get the set of non-unique kmers in this graph. For debugging purposes
 * @return the set held by this graph itself (not a copy); null as long as the graph has not been built
 */
protected Set<Kmer> getNonUniqueKmers() {
return nonUniqueKmers;
}
/**
 * @return a short description containing the kmer size; note that the text is labelled
 *         ReadThreadingAssembler rather than with the name of this class
 */
@Override
public String toString() {
    final StringBuilder text = new StringBuilder("ReadThreadingAssembler{");
    text.append("kmerSize=").append(kmerSize);
    text.append('}');
    return text.toString();
}
}

View File

@ -0,0 +1,93 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
/**
 * Keeps track of the information needed to add a sequence to the read threading assembly graph
 *
 * User: depristo
 * Date: 4/18/13
 * Time: 8:59 AM
 */
final class SequenceForKmers {
    /** name of the sequence, for debugging purposes */
    final String name;
    /** the bases; the array is stored as given, not copied */
    final byte[] sequence;
    /** the range of sequence to use: start is inclusive, stop is at least start */
    final int start, stop;
    /** per-base observation counts, or null if every base counts once */
    final private int[] counts;
    /** true if this is the reference sequence */
    final boolean isRef;

    /**
     * Create a new sequence for creating kmers
     *
     * @param name a name for the sequence
     * @param sequence the bases, must not be null
     * @param start the first offset of sequence to use, must be >= 0
     * @param stop the end of the range of sequence to use, must be >= start
     * @param counts per-base counts; either null or of the same length as sequence
     * @param ref is this the reference sequence?
     * @throws IllegalArgumentException if any of the above requirements is violated
     */
    SequenceForKmers(final String name, byte[] sequence, int start, int stop, int[] counts, boolean ref) {
        if ( start < 0 ) throw new IllegalArgumentException("Invalid start " + start);
        if ( stop < start ) throw new IllegalArgumentException("Invalid stop " + stop);
        if ( sequence == null ) throw new IllegalArgumentException("Sequence is null ");
        if ( counts != null && counts.length != sequence.length ) throw new IllegalArgumentException("Sequence and counts don't have the same length " + sequence.length + " vs " + counts.length);

        this.name = name;
        this.sequence = sequence;
        this.start = start;
        this.stop = stop;
        this.isRef = ref;
        this.counts = counts;
    }

    /**
     * Get the number of observations recorded for offset i of this sequence
     *
     * Can be > 1 because sequence may be a reduced read and therefore count as N observations
     *
     * @param i an offset into sequence, must be >= 0 and < sequence.length
     * @return 1 if this sequence has no counts, otherwise the count stored for offset i
     * @throws ArrayIndexOutOfBoundsException if i is not a valid offset into sequence
     */
    public int getCount(final int i) {
        // sequence.length itself is not a valid offset, so it has to be rejected as well
        if ( i < 0 || i >= sequence.length ) throw new ArrayIndexOutOfBoundsException("i must be >= 0 and < " + sequence.length + " but got " + i);
        return counts == null ? 1 : counts[i];
    }
}

View File

@ -212,6 +212,15 @@ public class ConstrainedMateFixingManager {
/** @return the current size of waitingReads */
public int getNReadsInQueue() {
    final int queued = waitingReads.size();
    return queued;
}
/**
 * Test-only accessor for the reads currently held in the queue.
 *
 * @return a newly allocated list containing the elements of waitingReads
 */
protected List<SAMRecord> getReadsInQueueForTesting() {
    // copy into a fresh list so the caller never gets a handle on waitingReads itself
    final List<SAMRecord> queuedReads = new ArrayList<SAMRecord>(waitingReads);
    return queuedReads;
}
public boolean canMoveReads(GenomeLoc earliestPosition) {
if ( DEBUG ) logger.info("Refusing to realign? " + earliestPosition + " vs. " + lastLocFlushed);
@ -233,7 +242,7 @@ public class ConstrainedMateFixingManager {
addRead(newRead, modifiedReads.contains(newRead), false);
}
private void addRead(SAMRecord newRead, boolean readWasModified, boolean canFlush) {
protected void addRead(SAMRecord newRead, boolean readWasModified, boolean canFlush) {
if ( DEBUG ) logger.info("New read pos " + newRead.getAlignmentStart() + " OP = " + newRead.getAttribute("OP") + " " + readWasModified);
//final long curTime = timer.currentTime();
@ -265,7 +274,7 @@ public class ConstrainedMateFixingManager {
// fix mates, as needed
// Since setMateInfo can move reads, we potentially need to remove the mate, and requeue
// it to ensure proper sorting
if ( newRead.getReadPairedFlag() ) {
if ( newRead.getReadPairedFlag() && !newRead.getNotPrimaryAlignmentFlag() ) {
SAMRecordHashObject mate = forMateMatching.get(newRead.getReadName());
if ( mate != null ) {
// 1. Frustratingly, Picard's setMateInfo() method unaligns (by setting the reference contig

View File

@ -93,10 +93,13 @@ public class ReadGroupCovariate implements RequiredCovariate {
private final HashMap<String, Integer> readGroupLookupTable = new HashMap<String, Integer>();
private final HashMap<Integer, String> readGroupReverseLookupTable = new HashMap<Integer, String>();
private int nextId = 0;
private String forceReadGroup;
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {}
public void initialize(final RecalibrationArgumentCollection RAC) {
forceReadGroup = RAC.FORCE_READGROUP;
}
@Override
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
@ -170,6 +173,9 @@ public class ReadGroupCovariate implements RequiredCovariate {
* @return platform unit or readgroup id
*/
private String readGroupValueFromRG(final GATKSAMReadGroupRecord rg) {
    // a forced read group, when set, wins over anything stored in the record
    if ( forceReadGroup != null ) {
        return forceReadGroup;
    }

    // otherwise prefer the platform unit and fall back to the read group id
    final String unit = rg.getPlatformUnit();
    if ( unit != null ) {
        return unit;
    }
    return rg.getId();
}

View File

@ -53,6 +53,7 @@ import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class ReduceReadsIntegrationTest extends WalkerTest {
@ -221,13 +222,13 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testCoReduction() {
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("5f4d2c1d9c010dfd6865aeba7d0336fe")), COREDUCTION_QUALS_TEST_MD5);
}
@Test(enabled = true)
public void testCoReductionWithKnowns() {
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s ";
String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s ";
executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("ca48dd972bf57595c691972c0f887cb4")), COREDUCTION_QUALS_TEST_MD5);
}
@ -281,5 +282,24 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
" -o %s --downsample_coverage 250 -dcov 50 ";
executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("7e7b358443827ca239db3b98f299aec6")), "2af063d1bd3c322b03405dbb3ecf59a9");
}
/**
 * Confirm that ReduceReads runs on this bam without failing when multi-sample mode is enabled
 * (the command line passes --cancer_mode). The provided example is tricky and used to make
 * the code throw an exception.
 */
@Test(enabled = true)
public void testMultiSampleDoesNotFailWithFlag() {
    final String commandLine = "-T ReduceReads --cancer_mode -npt -R " + b37KGReference + " -I " + privateTestDir + "rr_multisample.bam -o /dev/null";
    // no output files to md5-check: the run only has to complete
    final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 0, Collections.<String>emptyList());
    executeTestWithoutAdditionalRRTests("testMultiSampleDoesNotFailWithFlag", spec);
}
/**
 * Confirm that this bam fails when multi-sample mode is not enabled: the same command line as
 * testMultiSampleDoesNotFailWithFlag but without --cancer_mode, with
 * UserException.BadInput passed to the spec as the expected exception.
 */
@Test(enabled = true)
public void testMultiSampleFailsWithoutFlag() {
    String cmd = "-T ReduceReads -npt -R " + b37KGReference + " -I " + privateTestDir + "rr_multisample.bam -o /dev/null";
    // label the run with this test's own name (it was previously copy-pasted from the sibling test)
    executeTestWithoutAdditionalRRTests("testMultiSampleFailsWithoutFlag", new WalkerTestSpec(cmd, 0, UserException.BadInput.class));
}
}

View File

@ -80,59 +80,6 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation.");
}
/**
 * Checks leftAlignCigarSequentially on a read carrying two indels, for every combination of
 * indel size (1-4) and indel type (insertion or deletion) for each of the two indels.
 *
 * For each combination the test builds a "given" cigar whose indels sit part-way into the
 * repeated blocks, and an "expected" cigar whose indels sit directly after the preceding
 * refString block, then compares the consolidated cigar strings.
 */
@Test(enabled = !DEBUG)
public void testLeftAlignCigarSequentially() {
// flanking sequence that appears only in the reference (the read starts after preRefString)
String preRefString = "GATCGATCGATC";
String postRefString = "TTT";
// block of sequence shared by reference and read; used three times in each
String refString = "ATCGAGGAGAGCGCCCCG";
// units that are repeated (via Utils.dupString) to form the two repeat regions holding the indels
String indelString1 = "X";
String indelString2 = "YZ";
// repeat counts used for the two repeat regions in the reference
int refIndel1 = 10;
int refIndel2 = 12;
for ( final int indelSize1 : Arrays.asList(1, 2, 3, 4) ) {
// op > 0 is turned into an insertion below, op < 0 into a deletion
for ( final int indelOp1 : Arrays.asList(1, -1) ) {
for ( final int indelSize2 : Arrays.asList(1, 2, 3, 4) ) {
for ( final int indelOp2 : Arrays.asList(1, -1) ) {
// expected result: each indel comes immediately after the refString block that precedes it
Cigar expectedCigar = new Cigar();
expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M));
expectedCigar.add(new CigarElement(indelSize1, (indelOp1 > 0 ? CigarOperator.I : CigarOperator.D)));
expectedCigar.add(new CigarElement((indelOp1 < 0 ? refIndel1 - indelSize1 : refIndel1), CigarOperator.M));
expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M));
// the second indel's lengths are doubled -- presumably because indelString2 is two bases long
expectedCigar.add(new CigarElement(indelSize2 * 2, (indelOp2 > 0 ? CigarOperator.I : CigarOperator.D)));
expectedCigar.add(new CigarElement((indelOp2 < 0 ? (refIndel2 - indelSize2) * 2 : refIndel2 * 2), CigarOperator.M));
expectedCigar.add(new CigarElement(refString.length(), CigarOperator.M));
// input cigar: the match run before each indel extends refIndel/2 repeat units past the refString block
Cigar givenCigar = new Cigar();
givenCigar.add(new CigarElement(refString.length() + refIndel1/2, CigarOperator.M));
givenCigar.add(new CigarElement(indelSize1, (indelOp1 > 0 ? CigarOperator.I : CigarOperator.D)));
givenCigar.add(new CigarElement((indelOp1 < 0 ? (refIndel1/2 - indelSize1) : refIndel1/2) + refString.length() + refIndel2/2 * 2, CigarOperator.M));
givenCigar.add(new CigarElement(indelSize2 * 2, (indelOp2 > 0 ? CigarOperator.I : CigarOperator.D)));
givenCigar.add(new CigarElement((indelOp2 < 0 ? (refIndel2/2 - indelSize2) * 2 : refIndel2/2 * 2) + refString.length(), CigarOperator.M));
// reference: flank + block + repeat1 + block + repeat2 + block + flank
String theRef = preRefString + refString + Utils.dupString(indelString1, refIndel1) + refString + Utils.dupString(indelString2, refIndel2) + refString + postRefString;
// read: same layout without the flanks, with each repeat count grown or shrunk by the indel size
String theRead = refString + Utils.dupString(indelString1, refIndel1 + indelOp1 * indelSize1) + refString + Utils.dupString(indelString2, refIndel2 + indelOp2 * indelSize2) + refString;
// preRefString.length() is passed as the fourth argument and 0 as the fifth -- presumably the reference and read start offsets; confirm against leftAlignCigarSequentially
Cigar calculatedCigar = new DeBruijnAssembler().leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0);
// both sides are consolidated first, so only the merged element structure is compared
Assert.assertEquals(AlignmentUtils.consolidateCigar(calculatedCigar).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar strings do not match!");
}
}
}
}
}
/**
 * Runs leftAlignCigarSequentially on the cigar 18M4I12M4D2M, which holds both an insertion and a
 * deletion, and checks that the result spans the same number of reference bases as the input cigar.
 */
@Test(enabled = true)
public void testLeftAlignCigarSequentiallyAdjacentID() {
    final String refBases = "GTCTCTCTCTCTCTCTCTATATATATATATATATTT";
    final String hapBases = "GTCTCTCTCTCTCTCTCTCTCTATATATATATATTT";
    final Cigar inputCigar = TextCigarCodec.getSingleton().decode("18M4I12M4D2M");

    final DeBruijnAssembler assembler = new DeBruijnAssembler();
    final Cigar aligned = assembler.leftAlignCigarSequentially(inputCigar, refBases.getBytes(), hapBases.getBytes(), 0, 0);
    logger.warn("Result is " + aligned);

    // only the reference span is checked here, not the exact element layout
    Assert.assertEquals(inputCigar.getReferenceLength(), aligned.getReferenceLength(), "Reference lengths are different");
}
private static class MockBuilder extends DeBruijnGraphBuilder {
public final List<Kmer> addedPairs = new LinkedList<Kmer>();

View File

@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "0bf5ae740bf9bd14c8d60d7849c45eb3");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fc11b553fbf16beac0da04a69f419365");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"7d2cc5c4ece386beedf6b07dfbe5bf26");
"90cbcc7e959eb591fb7c5e12d65e0e40");
}
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"a17856f709b546eaed486841d78248d2");
"50894abb9d156bf480881cb5cb2a8a7d");
}
}

View File

@ -80,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "2e10ab97afd4492c2a153b85871a2c2d");
HCTest(CEUTRIO_BAM, "", "37e462379de17bc6c8aeeed6e9735dd3");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "affed81386dfe60e0b0d4e7e0525918f");
HCTest(NA12878_BAM, "", "983a0d122714d4aa0ff7af20cc686703");
}
@Test(enabled = false) // can't annotate the rsID's yet
@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"e2d32d0dce2c5502a8e877f6bbb65a10");
"dbbc884a975587d8e7255ce47b58f438");
}
@Test
@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "125e91ebe43108b2b514c58a9b6d3a4f");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "ce602282e80cca6d4272f940e20e90c3");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@ -149,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerNearbySmallIntervals() {
HCTestNearbySmallIntervals(NA12878_BAM, "", "2d295ce36066d9d8d9ee9c67e6e2cbd1");
HCTestNearbySmallIntervals(NA12878_BAM, "", "09335c01d2e90714af7f4c91156da0b1");
}
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -159,14 +159,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b34ddc93a7b9919e05da499508f44dd9"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("153d2251de7d22f423cd282b1505fbc0"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("98a78b9f58ab197b827ef2ce3ab043d3"));
executeTest("HCTestStructuralIndels: ", spec);
}
@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("0c29e4049908ec47a3159dce33d477c3"));
Arrays.asList("6e6ef6e0326bee6d20d9fd37349fdb8c"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@ -196,7 +196,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("3306889b8d0735ce575bee281c1b8846"));
Arrays.asList("5e535983b2f7e5fb6c84fecffa092324"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
}

View File

@ -0,0 +1,79 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Integration test that runs HaplotypeCaller with several -nct values and expects the same
 * output md5 for each of them.
 */
public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
    /**
     * Builds one test case per -nct value (1, 2 and 4), each paired with the same expected md5.
     *
     * @return rows of {nct, expected md5}
     */
    @DataProvider(name = "NCTDataProvider")
    public Object[][] makeNCTDataProvider() {
        final int[] threadCounts = {1, 2, 4};
        final Object[][] cases = new Object[threadCounts.length][];
        for ( int row = 0; row < threadCounts.length; row++ ) {
            cases[row] = new Object[]{threadCounts[row], "c277fd65365d59b734260dd8423313bb"};
        }
        return cases;
    }

    /**
     * Runs HaplotypeCaller over 20:10,000,000-10,100,000 with the given -nct value and checks
     * the single output file against the expected md5.
     *
     * @param nct value appended to the command line after -nct
     * @param md5 expected md5 of the output
     */
    @Test(dataProvider = "NCTDataProvider")
    public void testHCNCT(final int nct, final String md5) {
        final String commandLine = "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I "
                + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s "
                + " -L 20:10,000,000-10,100,000 -G none -A -contamination 0.0 -nct " + nct;
        final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList(md5));
        executeTest("HC test parallel HC with NCT with nct " + nct, spec);
    }
}

View File

@ -1,48 +1,48 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
@ -50,6 +50,9 @@ import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.util.HashSet;
import java.util.Set;
public class KMerCounterCaseFixUnitTest extends BaseTest {
@Test
public void testMyData() {
@ -76,6 +79,18 @@ public class KMerCounterCaseFixUnitTest extends BaseTest {
testCounting(counter, "NNC", 0);
Assert.assertNotNull(counter.toString());
assertCounts(counter, 5);
assertCounts(counter, 4, "ATG");
assertCounts(counter, 3, "ATG", "ACC");
assertCounts(counter, 2, "ATG", "ACC", "AAA");
assertCounts(counter, 1, "ATG", "ACC", "AAA", "CTG", "NNA", "CCC");
}
/**
 * Checks that the k-mers returned by {@code counter.getKmersWithCountsAtLeast(minCount)}
 * are exactly the ones spelled out in {@code expecteds}.
 *
 * Both sides are compared as sets, so neither ordering nor duplicates matter.
 *
 * @param counter   the counter under test
 * @param minCount  the threshold handed to {@code getKmersWithCountsAtLeast}
 * @param expecteds base strings of the k-mers that must be returned; may be empty
 */
private void assertCounts(final KMerCounter counter, final int minCount, final String ... expecteds) {
    final Set<Kmer> wanted = new HashSet<Kmer>();
    for ( final String bases : expecteds ) {
        wanted.add(new Kmer(bases));
    }

    final Set<Kmer> observed = new HashSet<Kmer>(counter.getKmersWithCountsAtLeast(minCount));
    Assert.assertEquals(observed, wanted);
}
private void testCounting(final KMerCounter counter, final String in, final int expectedCount) {

View File

@ -0,0 +1,280 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.collections.PrimitivePair;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
/**
 * Runs every {@link Assembler} over windows of the reference named by {@code b37KGReference},
 * feeding it artificial reads and checking the haplotypes that {@code runLocalAssembly} returns.
 */
public class LocalAssemblyEngineUnitTest extends BaseTest {
    private GenomeLocParser genomeLocParser;
    private IndexedFastaSequenceFile seq;
    private SAMFileHeader header;

    /** The assembler implementations that each data provider iterates over. */
    private enum Assembler {DEBRUIJN_ASSEMBLER, READ_THREADING_ASSEMBLER}

    /**
     * Opens the reference and derives the loc parser and the artificial SAM header from it.
     *
     * @throws FileNotFoundException declared for the reference file constructor
     */
    @BeforeClass
    public void setup() throws FileNotFoundException {
        seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
        genomeLocParser = new GenomeLocParser(seq);
        header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
    }

    /**
     * Instantiates a fresh engine for the requested assembler type.
     *
     * @param type which assembler to build
     * @return a newly constructed engine
     * @throws IllegalStateException if {@code type} is not one of the handled constants
     */
    private LocalAssemblyEngine createAssembler(final Assembler type) {
        final LocalAssemblyEngine engine;
        switch ( type ) {
            case DEBRUIJN_ASSEMBLER:
                engine = new DeBruijnAssembler();
                break;
            case READ_THREADING_ASSEMBLER:
                engine = new ReadThreadingAssembler();
                break;
            default:
                throw new IllegalStateException("Unexpected " + type);
        }
        return engine;
    }

    /**
     * Fetches the reference bases spanned by {@code loc} (contig, start and stop are passed
     * straight to {@code seq.getSubsequenceAt}).
     */
    private byte[] basesAt(final GenomeLoc loc) {
        return seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
    }

    /**
     * Builds {@code count} artificial reads that all start at {@code loc.getStart()}, each with
     * its own copy of {@code bases}, base qualities of 30 throughout and an all-M cigar.
     *
     * @param bases the sequence every read carries
     * @param loc   supplies the contig, contig index and alignment start
     * @param count how many reads to create
     * @return the reads, in creation order
     */
    private List<GATKSAMRecord> makeIdenticalReads(final byte[] bases, final GenomeLoc loc, final int count) {
        final List<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
        for ( int made = 0; made < count; made++ ) {
            final byte[] readBases = bases.clone();
            final byte[] readQuals = Utils.dupBytes((byte) 30, bases.length);
            final String readCigar = bases.length + "M";
            reads.add(ArtificialSAMUtils.createArtificialRead(header, loc.getContig(), loc.getContigIndex(), loc.getStart(), readBases, readQuals, readCigar));
        }
        return reads;
    }

    /**
     * Supplies {assembler, window, read count} rows: for each assembler, one window of
     * {@code windowSize} every {@code stepSize} bases across the scanned range on contig 20.
     */
    @DataProvider(name = "AssembleIntervalsData")
    public Object[][] makeAssembleIntervalsData() {
        final String contig = "20";
        final int firstStart = 10000000;
        final int stopBefore = 10100000;
        final int windowSize = 100;
        final int stepSize = 200;
        final int nReadsToUse = 5;

        final List<Object[]> rows = new ArrayList<Object[]>();
        for ( final Assembler assembler : Assembler.values() ) {
            int windowStart = firstStart;
            while ( windowStart < stopBefore ) {
                final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, windowStart, windowStart + windowSize);
                rows.add(new Object[]{assembler, refLoc, nReadsToUse});
                windowStart += stepSize;
            }
        }
        return rows.toArray(new Object[rows.size()][]);
    }

    /**
     * Supplies {assembler, window, read count, variant offset} rows. The variant offset is an
     * index into the window's bases and sweeps the 20 positions around the window midpoint.
     */
    @DataProvider(name = "AssembleIntervalsWithVariantData")
    public Object[][] makeAssembleIntervalsWithVariantData() {
        final String contig = "20";
        final int firstStart = 10000000;
        final int stopBefore = 10001000;
        final int windowSize = 100;
        final int stepSize = 200;
        final int variantStepSize = 1;
        final int nReadsToUse = 5;
        final int firstVariantOffset = windowSize / 2 - 10;
        final int variantOffsetLimit = windowSize / 2 + 10;

        final List<Object[]> rows = new ArrayList<Object[]>();
        for ( final Assembler assembler : Assembler.values() ) {
            int windowStart = firstStart;
            while ( windowStart < stopBefore ) {
                final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, windowStart, windowStart + windowSize);
                int variantOffset = firstVariantOffset;
                while ( variantOffset < variantOffsetLimit ) {
                    rows.add(new Object[]{assembler, refLoc, nReadsToUse, variantOffset});
                    variantOffset += variantStepSize;
                }
                windowStart += stepSize;
            }
        }
        return rows.toArray(new Object[rows.size()][]);
    }

    /** Reads identical to the reference must assemble to exactly one haplotype: the reference. */
    @Test(dataProvider = "AssembleIntervalsData")
    public void testAssembleRef(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse) {
        final byte[] refBases = basesAt(loc);
        final List<GATKSAMRecord> reads = makeIdenticalReads(refBases, loc, nReadsToUse);

        // TODO -- generalize to all assemblers
        final List<Haplotype> expected = Collections.singletonList(new Haplotype(refBases, true));
        final List<Haplotype> haplotypes = assemble(assembler, refBases, loc, reads);
        Assert.assertEquals(haplotypes, expected);
    }

    /**
     * Substitutes a single base at offset {@code variantSite} (C when the reference base is A,
     * otherwise A) and expects the ref and alt haplotypes back.
     */
    @Test(dataProvider = "AssembleIntervalsWithVariantData")
    public void testAssembleRefAndSNP(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
        final byte[] refBases = basesAt(loc);
        final Allele refAllele = Allele.create(refBases[variantSite], true);
        final byte altByte = refAllele.getBases()[0] == 'A' ? (byte) 'C' : (byte) 'A';
        final Allele altAllele = Allele.create(altByte, false);

        // start/stop here are offsets into refBases, which is how testAssemblyWithVariant consumes them
        final VariantContextBuilder builder = new VariantContextBuilder("x", loc.getContig(), variantSite, variantSite, Arrays.asList(refAllele, altAllele));
        testAssemblyWithVariant(assembler, refBases, loc, nReadsToUse, builder.make());
    }

    /**
     * For deletion lengths 1 through 9, replaces the anchor base plus the following
     * {@code deletionLength} bases with the anchor base alone.
     */
    @Test(dataProvider = "AssembleIntervalsWithVariantData")
    public void testAssembleRefAndDeletion(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
        final byte[] refBases = basesAt(loc);
        final String refString = new String(refBases);
        for ( int deletionLength = 1; deletionLength < 10; deletionLength++ ) {
            final int variantEnd = variantSite + deletionLength;
            final Allele refAllele = Allele.create(refString.substring(variantSite, variantEnd + 1), true);
            final Allele altAllele = Allele.create(refAllele.getBases()[0], false);
            final VariantContextBuilder builder = new VariantContextBuilder("x", loc.getContig(), variantSite, variantEnd, Arrays.asList(refAllele, altAllele));
            testAssemblyWithVariant(assembler, refBases, loc, nReadsToUse, builder.make());
        }
    }

    /**
     * For lengths 1 through 9, builds a site whose single-base allele is flagged non-reference
     * and whose multi-base allele is flagged reference.
     *
     * NOTE(review): testAssemblyWithVariant splices in alternate allele 0, which is presumably
     * the single base here, so the reads appear to carry the same event as
     * testAssembleRefAndDeletion rather than an inserted sequence -- confirm this is intended.
     */
    @Test(dataProvider = "AssembleIntervalsWithVariantData")
    public void testAssembleRefAndInsertion(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
        final byte[] refBases = basesAt(loc);
        final String refString = new String(refBases);
        for ( int insertionLength = 1; insertionLength < 10; insertionLength++ ) {
            final int variantEnd = variantSite + insertionLength;
            final Allele singleBase = Allele.create(refBases[variantSite], false);
            final Allele multiBase = Allele.create(refString.substring(variantSite, variantEnd + 1), true);
            final VariantContextBuilder builder = new VariantContextBuilder("x", loc.getContig(), variantSite, variantEnd, Arrays.asList(singleBase, multiBase));
            testAssemblyWithVariant(assembler, refBases, loc, nReadsToUse, builder.make());
        }
    }

    /**
     * Builds the alt sequence by replacing refBases[site.getStart() .. site.getEnd()] with the
     * site's first alternate allele, creates {@code nReadsToUse} reads carrying it, and asserts
     * the assembler returns exactly [ref haplotype, alt haplotype].
     *
     * @param site its start and end are used as 0-based offsets into {@code refBases}
     */
    private void testAssemblyWithVariant(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final int nReadsToUse, final VariantContext site) {
        final String refString = new String(refBases);
        final String preRef = refString.substring(0, site.getStart());
        final String postRef = refString.substring(site.getEnd() + 1, refBases.length);
        final String altString = preRef + site.getAlternateAllele(0).getBaseString() + postRef;
        final byte[] altBases = altString.getBytes();

//        logger.warn("ref " + new String(refBases));
//        logger.warn("alt " + new String(altBases));

        final List<GATKSAMRecord> reads = makeIdenticalReads(altBases, loc, nReadsToUse);

        final List<Haplotype> expected = Arrays.asList(new Haplotype(refBases, true), new Haplotype(altBases, false));
        final List<Haplotype> haplotypes = assemble(assembler, refBases, loc, reads);
        Assert.assertEquals(haplotypes, expected);
    }

    /**
     * Wraps {@code reads} in an active region over {@code loc} and runs the selected engine
     * against it with no additional variant contexts.
     *
     * @return whatever {@code runLocalAssembly} produces
     */
    private List<Haplotype> assemble(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final List<GATKSAMRecord> reads) {
        final Haplotype refHaplotype = new Haplotype(refBases, true);

        final ActiveRegion region = new ActiveRegion(loc, null, true, genomeLocParser, 0);
        region.addAll(reads);

        final LocalAssemblyEngine engine = createAssembler(assembler);
//        logger.warn("Assembling " + region + " with " + engine);
        return engine.runLocalAssembly(region, refHaplotype, refBases, loc, Collections.<VariantContext>emptyList());
    }

    /**
     * Supplies {name, assembler, window, ref, alt} rows where alt is ref with the base at one
     * offset overwritten by 'N'. Offsets too close to either edge are skipped, using a
     * per-assembler margin.
     */
    @DataProvider(name = "SimpleAssemblyTestData")
    public Object[][] makeSimpleAssemblyTestData() {
        final String contig = "20";
        final int start = 10000000;
        final int windowSize = 200;
        final int end = start + windowSize;

        final Map<Assembler, Integer> edgeMargins = new EnumMap<>(Assembler.class);
        edgeMargins.put(Assembler.DEBRUIJN_ASSEMBLER, 26);
        edgeMargins.put(Assembler.READ_THREADING_ASSEMBLER, 25); // TODO -- decrease to zero when the edge calling problem is fixed

        final String ref = new String(seq.getSubsequenceAt(contig, start, end).getBases());
        final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, start, end);

        final List<Object[]> rows = new ArrayList<Object[]>();
        for ( final Assembler assembler : Assembler.values() ) {
            final int margin = edgeMargins.get(assembler);
            for ( int snpPos = 0; snpPos < windowSize; snpPos++ ) {
                // skip positions at or inside the left margin, or closer than the margin to the right edge
                if ( snpPos <= margin || (windowSize - snpPos) < margin ) {
                    continue;
                }
                final byte[] altBases = ref.getBytes();
                altBases[snpPos] = 'N';
                rows.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, new String(altBases)});
            }
        }
        return rows.toArray(new Object[rows.size()][]);
    }

    /**
     * Assembles 20 reads carrying {@code alt} and expects exactly two haplotypes back: the
     * reference first, then the alt.
     *
     * @param name label for the case; not used by the test body
     */
    @Test(dataProvider = "SimpleAssemblyTestData")
    public void testSimpleAssembly(final String name, final Assembler assembler, final GenomeLoc loc, final String ref, final String alt) {
        final byte[] refBases = ref.getBytes();
        final byte[] altBases = alt.getBytes();

        final List<GATKSAMRecord> reads = makeIdenticalReads(altBases, loc, 20);

        final Haplotype expectedRef = new Haplotype(refBases, true);
        final Haplotype expectedAlt = new Haplotype(altBases, false);
        final List<Haplotype> haplotypes = assemble(assembler, refBases, loc, reads);

        Assert.assertTrue(haplotypes.size() > 0, "Failed to find ref haplotype");
        Assert.assertEquals(haplotypes.get(0), expectedRef);

        Assert.assertEquals(haplotypes.size(), 2, "Failed to find single alt haplotype");
        Assert.assertEquals(haplotypes.get(1), expectedAlt);
    }
}

View File

@ -83,7 +83,10 @@ public class BaseEdgeUnitTest extends BaseTest {
e.setMultiplicity(mult + 1);
Assert.assertEquals(e.getMultiplicity(), mult + 1);
final BaseEdge copy = new BaseEdge(e);
e.incMultiplicity(2);
Assert.assertEquals(e.getMultiplicity(), mult + 3);
final BaseEdge copy = e.copy();
Assert.assertEquals(copy.isRef(), e.isRef());
Assert.assertEquals(copy.getMultiplicity(), e.getMultiplicity());
}

View File

@ -49,8 +49,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import scala.actors.threadpool.Arrays;
import java.io.File;
import java.util.*;

View File

@ -137,12 +137,12 @@ public class CommonSuffixMergerUnitTest extends BaseTest {
public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) {
try {
final Set<String> haplotypes = new HashSet<String>();
final List<Path<SeqVertex>> originalPaths = new KBestPaths<SeqVertex>().getKBestPaths(original);
for ( final Path<SeqVertex> path : originalPaths )
final List<Path<SeqVertex,BaseEdge>> originalPaths = new KBestPaths<SeqVertex,BaseEdge>().getKBestPaths(original);
for ( final Path<SeqVertex,BaseEdge> path : originalPaths )
haplotypes.add(new String(path.getBases()));
final List<Path<SeqVertex>> splitPaths = new KBestPaths<SeqVertex>().getKBestPaths(actual);
for ( final Path<SeqVertex> path : splitPaths ) {
final List<Path<SeqVertex,BaseEdge>> splitPaths = new KBestPaths<SeqVertex,BaseEdge>().getKBestPaths(actual);
for ( final Path<SeqVertex,BaseEdge> path : splitPaths ) {
final String h = new String(path.getBases());
Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
}

View File

@ -154,16 +154,16 @@ public class CommonSuffixSplitterUnitTest extends BaseTest {
original.addEdge(v3, v4, new BaseEdge(false, 34));
original.addEdge(v4, v2, new BaseEdge(false, 42));
original.printGraph(new File("testSplitInfiniteCycleFailure.dot"), 0);
// original.printGraph(new File("testSplitInfiniteCycleFailure.dot"), 0);
final SeqGraph graph = (SeqGraph)original.clone();
final boolean success = new CommonSuffixSplitter().split(graph, v2);
Assert.assertTrue(success);
for ( final SeqVertex v : graph.vertexSet() ) {
graph.printGraph(new File("testSplitInfiniteCycleFailure.first_split.dot"), 0);
// graph.printGraph(new File("testSplitInfiniteCycleFailure.first_split.dot"), 0);
final boolean success2 = new CommonSuffixSplitter().split((SeqGraph)graph.clone(), v);
if ( success2 ) graph.printGraph(new File("testSplitInfiniteCycleFailure.fail.dot"), 0);
// if ( success2 ) graph.printGraph(new File("testSplitInfiniteCycleFailure.fail.dot"), 0);
Assert.assertFalse(success2, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v);
}
}

View File

@ -0,0 +1,120 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.collections.PrimitivePair;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Unit tests for {@code GraphUtils.findLongestUniqueSuffixMatch}.
 */
public class GraphUtilsUnitTest extends BaseTest {
    /**
     * Builds the cases consumed by {@link #testfindLongestUniqueMatch}.
     *
     * Each row is {@code {sequence, kmer, expectedFirst, expectedSecond}}.  A row whose
     * {@code expectedFirst} is -1 expects the lookup to return {@code null}.  In the generated
     * rows {@code expectedFirst} is the index in the sequence of the last base of the match and
     * {@code expectedSecond} is the number of matched bases.
     *
     * @return the test rows, in a fixed order
     */
    @DataProvider(name = "findLongestUniqueMatchData")
    public Object[][] makefindLongestUniqueMatchData() {
        final List<Object[]> cases = new ArrayList<Object[]>();

        // Every substring of a reference with no repeated base: the substring itself matches,
        // junk in front of it is tolerated, junk after it breaks the suffix match entirely.
        final String uniqueRef = "ACGT";
        for ( int from = 0; from < uniqueRef.length(); from++ ) {
            for ( int to = from + 1; to <= uniqueRef.length(); to++ ) {
                final String piece = uniqueRef.substring(from, to);
                final int lastMatchedIndex = to - 1;
                final int matchedLength = to - from;
                for ( final String leadingJunk : Arrays.asList("", "N", "NN") ) {
                    cases.add(new Object[]{uniqueRef, leadingJunk + piece, lastMatchedIndex, matchedLength});
                }
                for ( final String trailingJunk : Arrays.asList("N", "NN") ) {
                    cases.add(new Object[]{uniqueRef, piece + trailingJunk, -1, 0});
                }
            }
        }

        // Each base occurs twice in a row: a single base is ambiguous, the doubled base is unique.
        final String doubledRef = "AACCGGTT";
        final List<String> bases = Arrays.asList("A", "C", "G", "T");
        for ( final String base : bases ) {
            cases.add(new Object[]{doubledRef, base, -1, 0});
        }
        for ( int i = 0; i < bases.size(); i++ ) {
            final String doubled = bases.get(i) + bases.get(i);
            cases.add(new Object[]{doubledRef, doubled, 2 * i + 1, 2});
        }

        // Tandem repeat of ACGT: kmers of up to two full units have no match, longer ones
        // match ending at the last base of the reference.
        final String repeatRef = "ACGTACGTACGT";
        for ( final String ambiguous : Arrays.asList("ACGT", "TACGT", "GTACGT", "CGTACGT", "ACGTACGT") ) {
            cases.add(new Object[]{repeatRef, ambiguous, -1, 0});
        }
        cases.add(new Object[]{repeatRef, "TACGTACGT", 11, 9});
        cases.add(new Object[]{repeatRef, "NTACGTACGT", 11, 9});
        cases.add(new Object[]{repeatRef, "GTACGTACGT", 11, 10});
        cases.add(new Object[]{repeatRef, "NGTACGTACGT", 11, 10});
        cases.add(new Object[]{repeatRef, "CGTACGTACGT", 11, 11});

        return cases.toArray(new Object[cases.size()][]);
    }

    /**
     * Runs {@code GraphUtils.findLongestUniqueSuffixMatch} on one data-provider row and compares
     * the returned pair with the expectation.
     *
     * @param seq    the sequence to search in
     * @param kmer   the kmer whose suffix is looked up in {@code seq}
     * @param start  expected value of the result's {@code first} field, or -1 when the result
     *               must be {@code null}
     * @param length expected value of the result's {@code second} field (ignored when
     *               {@code start} is -1)
     */
    @Test(dataProvider = "findLongestUniqueMatchData")
    public void testfindLongestUniqueMatch(final String seq, final String kmer, final int start, final int length) {
        final PrimitivePair.Int match = GraphUtils.findLongestUniqueSuffixMatch(seq.getBytes(), kmer.getBytes());
        if ( start != -1 ) {
            Assert.assertNotNull(match);
            Assert.assertEquals(match.first, start);
            Assert.assertEquals(match.second, length);
        } else {
            Assert.assertNull(match);
        }
    }
}

View File

@ -114,7 +114,7 @@ public class KBestPathsUnitTest extends BaseTest {
if ( addCycle ) graph.addEdge(middleBottom, middleBottom);
// enumerate all possible paths
final List<Path<SeqVertex>> paths = new KBestPaths<SeqVertex>(allowCycles).getKBestPaths(graph, starts, ends);
final List<Path<SeqVertex,BaseEdge>> paths = new KBestPaths<SeqVertex,BaseEdge>(allowCycles).getKBestPaths(graph, starts, ends);
final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * (addCycle && allowCycles ? 2 : 1) * nEndNodes;
Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths");
@ -127,7 +127,7 @@ public class KBestPathsUnitTest extends BaseTest {
// get the best path, and make sure it's the same as our optimal path overall
final Path best = paths.get(0);
final List<Path<SeqVertex>> justOne = new KBestPaths<SeqVertex>(allowCycles).getKBestPaths(graph, 1, starts, ends);
final List<Path<SeqVertex,BaseEdge>> justOne = new KBestPaths<SeqVertex,BaseEdge>(allowCycles).getKBestPaths(graph, 1, starts, ends);
Assert.assertEquals(justOne.size(), 1);
Assert.assertTrue(justOne.get(0).pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0));
}
@ -147,7 +147,7 @@ public class KBestPathsUnitTest extends BaseTest {
graph.addEdges(v4, v2);
// enumerate all possible paths
final List<Path<SeqVertex>> paths = new KBestPaths<SeqVertex>(false).getKBestPaths(graph, v1, v5);
final List<Path<SeqVertex,BaseEdge>> paths = new KBestPaths<SeqVertex,BaseEdge>(false).getKBestPaths(graph, v1, v5);
Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths");
}
@ -163,7 +163,7 @@ public class KBestPathsUnitTest extends BaseTest {
graph.addEdges(v1, v2, v3, v3);
// enumerate all possible paths
final List<Path<SeqVertex>> paths = new KBestPaths<SeqVertex>(false).getKBestPaths(graph, v1, v3);
final List<Path<SeqVertex,BaseEdge>> paths = new KBestPaths<SeqVertex,BaseEdge>(false).getKBestPaths(graph, v1, v3);
Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths");
}
@ -201,9 +201,9 @@ public class KBestPathsUnitTest extends BaseTest {
graph.addEdge(v2Alt, v3, new BaseEdge(false, 5));
// Construct the test path
Path<SeqVertex> path = new Path<SeqVertex>(v, graph);
path = new Path<SeqVertex>(path, graph.getEdge(v, v2Alt));
path = new Path<SeqVertex>(path, graph.getEdge(v2Alt, v3));
Path<SeqVertex,BaseEdge> path = new Path<SeqVertex,BaseEdge>(v, graph);
path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v, v2Alt));
path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v2Alt, v3));
// Construct the actual cigar string implied by the test path
Cigar expectedCigar = new Cigar();
@ -219,7 +219,8 @@ public class KBestPathsUnitTest extends BaseTest {
}
expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M));
Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch");
final String ref = preRef + v2Ref.getSequenceString() + postRef;
Assert.assertEquals(path.calculateCigar(ref.getBytes()).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch");
}
@DataProvider(name = "GetBasesData")
@ -251,9 +252,9 @@ public class KBestPathsUnitTest extends BaseTest {
}
// enumerate all possible paths
final List<Path<SeqVertex>> paths = new KBestPaths<SeqVertex>().getKBestPaths(graph);
final List<Path<SeqVertex,BaseEdge>> paths = new KBestPaths<SeqVertex,BaseEdge>().getKBestPaths(graph);
Assert.assertEquals(paths.size(), 1);
final Path<SeqVertex> path = paths.get(0);
final Path<SeqVertex,BaseEdge> path = paths.get(0);
Assert.assertEquals(new String(path.getBases()), Utils.join("", frags), "Path doesn't have the expected sequence");
}
@ -296,6 +297,8 @@ public class KBestPathsUnitTest extends BaseTest {
SeqVertex v7 = new SeqVertex(postRef);
SeqVertex postV = new SeqVertex(postAltOption);
final String ref = preRef + v2Ref.getSequenceString() + midRef1 + v4Ref.getSequenceString() + midRef2 + v6Ref.getSequenceString() + postRef;
graph.addVertex(preV);
graph.addVertex(v);
graph.addVertex(v2Ref);
@ -324,18 +327,18 @@ public class KBestPathsUnitTest extends BaseTest {
graph.addEdge(v7, postV, new BaseEdge(false, 1));
// Construct the test path
Path<SeqVertex> path = new Path<SeqVertex>( (offRefBeginning ? preV : v), graph);
Path<SeqVertex,BaseEdge> path = new Path<SeqVertex,BaseEdge>( (offRefBeginning ? preV : v), graph);
if( offRefBeginning ) {
path = new Path<SeqVertex>(path, graph.getEdge(preV, v));
path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(preV, v));
}
path = new Path<SeqVertex>(path, graph.getEdge(v, v2Alt));
path = new Path<SeqVertex>(path, graph.getEdge(v2Alt, v3));
path = new Path<SeqVertex>(path, graph.getEdge(v3, v4Ref));
path = new Path<SeqVertex>(path, graph.getEdge(v4Ref, v5));
path = new Path<SeqVertex>(path, graph.getEdge(v5, v6Alt));
path = new Path<SeqVertex>(path, graph.getEdge(v6Alt, v7));
path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v, v2Alt));
path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v2Alt, v3));
path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v3, v4Ref));
path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v4Ref, v5));
path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v5, v6Alt));
path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v6Alt, v7));
if( offRefEnding ) {
path = new Path<SeqVertex>(path, graph.getEdge(v7,postV));
path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v7,postV));
}
// Construct the actual cigar string implied by the test path
@ -373,7 +376,9 @@ public class KBestPathsUnitTest extends BaseTest {
expectedCigar.add(new CigarElement(postAltOption.length(), CigarOperator.I));
}
Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch");
Assert.assertEquals(path.calculateCigar(ref.getBytes()).toString(),
AlignmentUtils.consolidateCigar(expectedCigar).toString(),
"Cigar string mismatch: ref = " + ref + " alt " + new String(path.getBases()));
}
@Test(enabled = !DEBUG)
@ -389,43 +394,46 @@ public class KBestPathsUnitTest extends BaseTest {
graph.addEdges(new BaseEdge(true, 1), top, ref, bot);
graph.addEdges(new BaseEdge(false, 1), top, alt, bot);
final KBestPaths<SeqVertex> pathFinder = new KBestPaths<SeqVertex>();
final List<Path<SeqVertex>> paths = pathFinder.getKBestPaths(graph, top, bot);
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<SeqVertex,BaseEdge>();
final List<Path<SeqVertex,BaseEdge>> paths = pathFinder.getKBestPaths(graph, top, bot);
Assert.assertEquals(paths.size(), 2);
final Path<SeqVertex> refPath = paths.get(0);
final Path<SeqVertex> altPath = paths.get(1);
final Path<SeqVertex,BaseEdge> refPath = paths.get(0);
final Path<SeqVertex,BaseEdge> altPath = paths.get(1);
Assert.assertEquals(refPath.calculateCigar().toString(), "10M");
Assert.assertEquals(altPath.calculateCigar().toString(), "1M3I5M3D1M");
final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString();
Assert.assertEquals(refPath.calculateCigar(refString.getBytes()).toString(), "10M");
Assert.assertEquals(altPath.calculateCigar(refString.getBytes()).toString(), "1M3I5M3D1M");
}
/**
 * Checks the cigars computed for the two paths of a small bubble graph: a 3-base top vertex
 * and a 3-base bottom vertex joined by a reference vertex and by an alternate vertex.
 * The first path returned by the path finder is treated as the reference path and is expected
 * to yield "51M"; the second is treated as the alternate path and is expected to yield "3M6I48M".
 *
 * NOTE(review): this span shows two forms of several statements (top/bot, pathFinder, paths,
 * refPath/altPath, and the calculateCigar calls with and without a reference argument).
 * Only one of each pair can be live code -- confirm against the checked-in file.
 */
@Test(enabled = !DEBUG)
public void testHardSWPath() {
// Construct the assembly graph
SeqGraph graph = new SeqGraph();
final SeqVertex top = new SeqVertex( "NNN");
final SeqVertex bot = new SeqVertex( "NNN");
final SeqVertex top = new SeqVertex( "NNN" );
final SeqVertex bot = new SeqVertex( "NNN" );
final SeqVertex alt = new SeqVertex( "ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" );
final SeqVertex ref = new SeqVertex( "TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" );
graph.addVertices(top, bot, alt, ref);
// top -> ref -> bot uses an edge built with a true first argument, top -> alt -> bot a false one
// (presumably the is-reference flag -- confirm against BaseEdge)
graph.addEdges(new BaseEdge(true, 1), top, ref, bot);
graph.addEdges(new BaseEdge(false, 1), top, alt, bot);
final KBestPaths<SeqVertex> pathFinder = new KBestPaths<SeqVertex>();
final List<Path<SeqVertex>> paths = pathFinder.getKBestPaths(graph, top, bot);
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<SeqVertex,BaseEdge>();
final List<Path<SeqVertex,BaseEdge>> paths = pathFinder.getKBestPaths(graph, top, bot);
// exactly the two top-to-bottom routes must be found
Assert.assertEquals(paths.size(), 2);
final Path<SeqVertex> refPath = paths.get(0);
final Path<SeqVertex> altPath = paths.get(1);
final Path<SeqVertex,BaseEdge> refPath = paths.get(0);
final Path<SeqVertex,BaseEdge> altPath = paths.get(1);
logger.warn("RefPath : " + refPath + " cigar " + refPath.calculateCigar());
logger.warn("AltPath : " + altPath + " cigar " + altPath.calculateCigar());
// the reference sequence is the concatenation of the top, reference and bottom vertices
final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString();
Assert.assertEquals(refPath.calculateCigar().toString(), "51M");
Assert.assertEquals(altPath.calculateCigar().toString(), "3M6I48M");
logger.warn("RefPath : " + refPath + " cigar " + refPath.calculateCigar(refString.getBytes()));
logger.warn("AltPath : " + altPath + " cigar " + altPath.calculateCigar(refString.getBytes()));
Assert.assertEquals(refPath.calculateCigar(refString.getBytes()).toString(), "51M");
Assert.assertEquals(altPath.calculateCigar(refString.getBytes()).toString(), "3M6I48M");
}
// -----------------------------------------------------------------
@ -466,30 +474,87 @@ public class KBestPathsUnitTest extends BaseTest {
// Construct the assembly graph
SeqGraph graph = new SeqGraph();
SeqVertex top = new SeqVertex("");
final int padSize = 0;
SeqVertex top = new SeqVertex(Utils.dupString("N", padSize));
SeqVertex ref = new SeqVertex(prefix + refMid + end);
SeqVertex alt = new SeqVertex(prefix + altMid + end);
SeqVertex bot = new SeqVertex("");
SeqVertex bot = new SeqVertex(Utils.dupString("N", padSize));
graph.addVertices(top, ref, alt, bot);
graph.addEdges(new BaseEdge(true, 1), top, ref, bot);
graph.addEdges(new BaseEdge(false, 1), top, alt, bot);
// Construct the test path
Path<SeqVertex> path = Path.makePath(Arrays.asList(top, alt, bot), graph);
Path<SeqVertex,BaseEdge> path = Path.makePath(Arrays.asList(top, alt, bot), graph);
Cigar expected = new Cigar();
expected.add(new CigarElement(padSize, CigarOperator.M));
if ( ! prefix.equals("") ) expected.add(new CigarElement(prefix.length(), CigarOperator.M));
for ( final CigarElement elt : TextCigarCodec.getSingleton().decode(midCigar).getCigarElements() ) expected.add(elt);
if ( ! end.equals("") ) expected.add(new CigarElement(end.length(), CigarOperator.M));
expected.add(new CigarElement(padSize, CigarOperator.M));
expected = AlignmentUtils.consolidateCigar(expected);
final Cigar pathCigar = path.calculateCigar();
final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString();
final Cigar pathCigar = path.calculateCigar(refString.getBytes());
logger.warn("diffs: " + ref + " vs. " + alt + " cigar " + midCigar);
logger.warn("Path " + path + " with cigar " + pathCigar);
logger.warn("Expected cigar " + expected);
Assert.assertEquals(pathCigar, expected, "Cigar mismatch");
Assert.assertEquals(pathCigar, expected, "Cigar mismatch: ref = " + refString + " vs alt = " + new String(path.getBases()));
}
/**
 * Exercises {@code Path.leftAlignCigarSequentially} on cigars that contain two indels, each
 * placed in the middle of a run of a repeated unit ("X" repeated 10 times, "YZ" repeated
 * 12 times), for every combination of indel size (1-4 units) and direction (insertion or
 * deletion).  The expected cigar has each indel moved to the start of its repeat run, i.e.
 * directly after the preceding non-repeat anchor sequence.
 */
@Test(enabled = !DEBUG)
public void testLeftAlignCigarSequentially() {
    final String leadingRef = "GATCGATCGATC";
    final String trailingRef = "TTT";
    final String anchor = "ATCGAGGAGAGCGCCCCG";
    final String unit1 = "X";
    final String unit2 = "YZ";
    final int copies1 = 10;
    final int copies2 = 12;

    final int anchorLength = anchor.length();
    final int halfRun1 = copies1 / 2;
    final int halfRun2 = copies2 / 2;

    // The reference does not depend on the indel being tested, so build it once.
    final String theRef = leadingRef + anchor + Utils.dupString(unit1, copies1) + anchor + Utils.dupString(unit2, copies2) + anchor + trailingRef;

    final List<Integer> indelSizes = Arrays.asList(1, 2, 3, 4);
    final List<Integer> indelSigns = Arrays.asList(1, -1);

    for ( final int size1 : indelSizes ) {
        for ( final int sign1 : indelSigns ) {
            final boolean firstIsDeletion = sign1 < 0;
            final CigarOperator op1 = firstIsDeletion ? CigarOperator.D : CigarOperator.I;

            for ( final int size2 : indelSizes ) {
                for ( final int sign2 : indelSigns ) {
                    final boolean secondIsDeletion = sign2 < 0;
                    final CigarOperator op2 = secondIsDeletion ? CigarOperator.D : CigarOperator.I;

                    // Expected: each indel sits immediately after its anchor.  The second repeat
                    // unit is two bases long, hence the factors of two.
                    final int expectedRun1 = firstIsDeletion ? copies1 - size1 : copies1;
                    final int expectedRun2 = secondIsDeletion ? (copies2 - size2) * 2 : copies2 * 2;
                    final Cigar expectedCigar = new Cigar();
                    expectedCigar.add(new CigarElement(anchorLength, CigarOperator.M));
                    expectedCigar.add(new CigarElement(size1, op1));
                    expectedCigar.add(new CigarElement(expectedRun1, CigarOperator.M));
                    expectedCigar.add(new CigarElement(anchorLength, CigarOperator.M));
                    expectedCigar.add(new CigarElement(size2 * 2, op2));
                    expectedCigar.add(new CigarElement(expectedRun2, CigarOperator.M));
                    expectedCigar.add(new CigarElement(anchorLength, CigarOperator.M));

                    // Input: the same indels, but placed halfway through each repeat run.
                    final int givenMiddle = (firstIsDeletion ? (halfRun1 - size1) : halfRun1) + anchorLength + halfRun2 * 2;
                    final int givenTail = (secondIsDeletion ? (halfRun2 - size2) * 2 : halfRun2 * 2) + anchorLength;
                    final Cigar givenCigar = new Cigar();
                    givenCigar.add(new CigarElement(anchorLength + halfRun1, CigarOperator.M));
                    givenCigar.add(new CigarElement(size1, op1));
                    givenCigar.add(new CigarElement(givenMiddle, CigarOperator.M));
                    givenCigar.add(new CigarElement(size2 * 2, op2));
                    givenCigar.add(new CigarElement(givenTail, CigarOperator.M));

                    // The read carries the repeat runs lengthened or shortened by the indel sizes
                    // and has no leading/trailing reference padding.
                    final String theRead = anchor + Utils.dupString(unit1, copies1 + sign1 * size1) + anchor + Utils.dupString(unit2, copies2 + sign2 * size2) + anchor;

                    final Cigar consolidatedInput = AlignmentUtils.consolidateCigar(givenCigar);
                    final Cigar calculatedCigar = Path.leftAlignCigarSequentially(consolidatedInput, theRef.getBytes(), theRead.getBytes(), leadingRef.length(), 0);

                    final String actualText = AlignmentUtils.consolidateCigar(calculatedCigar).toString();
                    final String expectedText = AlignmentUtils.consolidateCigar(expectedCigar).toString();
                    Assert.assertEquals(actualText, expectedText, "Cigar strings do not match!");
                }
            }
        }
    }
}
/**
 * Left-aligns the cigar "18M4I12M4D2M" (an insertion and a deletion separated by 12 matching
 * bases) against a reference/haplotype pair made of CT and AT repeats, and checks that the
 * resulting cigar still spans the same number of reference bases as the input cigar.
 * Only the reference length is verified; the exact shape of the result is logged, not asserted.
 */
@Test(enabled = true)
public void testLeftAlignCigarSequentiallyAdjacentID() {
    final String ref = "GTCTCTCTCTCTCTCTCTATATATATATATATATTT";
    final String hap = "GTCTCTCTCTCTCTCTCTCTCTATATATATATATTT";
    final Cigar originalCigar = TextCigarCodec.getSingleton().decode("18M4I12M4D2M");

    final Cigar result = Path.leftAlignCigarSequentially(originalCigar, ref.getBytes(), hap.getBytes(), 0, 0);
    logger.warn("Result is " + result);

    // TestNG's assertEquals takes (actual, expected, message): the computed result is the
    // actual value and the input cigar supplies the expectation, so that a failure reports
    // the two lengths the right way round.
    Assert.assertEquals(result.getReferenceLength(), originalCigar.getReferenceLength(), "Reference lengths are different");
}
}

View File

@ -0,0 +1,163 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.util.*;
/**
 * Unit tests for {@link LowWeightChainPruner}.
 *
 * Each test case builds a small {@link SeqGraph}, prunes it with a given prune factor and then
 * checks which vertices are still present in the graph.
 */
public class LowWeightChainPrunerUnitTest extends BaseTest {
    /**
     * Builds the rows consumed by {@link #testPruneChains}.
     *
     * Every row is {test name, graph to prune, prune factor, vertices expected to remain}.
     * Several rows pass graph.vertexSet() itself as the expectation, meaning "nothing is removed";
     * the test snapshots the expected set before it prunes (presumably because vertexSet() is
     * backed by the graph -- confirm against BaseGraph).
     *
     * @return the test rows
     */
    @DataProvider(name = "pruneChainsData")
    public Object[][] makePruneChainsData() {
        final List<Object[]> tests = new ArrayList<>();

        // the same vertex objects are reused across all of the graphs built below
        final SeqVertex v1 = new SeqVertex("A");
        final SeqVertex v2 = new SeqVertex("C");
        final SeqVertex v3 = new SeqVertex("G");
        final SeqVertex v4 = new SeqVertex("T");
        final SeqVertex v5 = new SeqVertex("AA");
        final SeqVertex v6 = new SeqVertex("CC");

        for ( final int edgeWeight : Arrays.asList(1, 2, 3) ) {
            for ( final int pruneFactor : Arrays.asList(1, 2, 3, 4) ) {
                for ( final boolean isRef : Arrays.asList(true, false) ) {
                    // just an isolated chain: expected to vanish entirely only when it is a
                    // non-reference chain whose edge weight is below the prune factor
                    final boolean expectPruned = edgeWeight < pruneFactor && ! isRef;
                    final SeqGraph graph = new SeqGraph();
                    graph.addVertices(v1, v2, v3);
                    graph.addEdges(new BaseEdge(isRef, edgeWeight), v1, v2, v3);
                    tests.add(new Object[]{"combinatorial", graph, pruneFactor, expectPruned ? Collections.emptySet() : graph.vertexSet()});
                }
            }
        }

        { // low-weight non-ref branch that connects to a ref chain: only the ref vertices v4 and v5 are expected to remain
            final SeqGraph graph = new SeqGraph();
            graph.addVertices(v1, v2, v3);
            graph.addVertices(v4, v5);
            graph.addEdges(new BaseEdge(true, 1), v4, v5);
            graph.addEdges(new BaseEdge(false, 1), v4, v1, v2, v3, v5);
            tests.add(new Object[]{"bad internal branch", graph, 2, new HashSet<>(Arrays.asList(v4, v5))});
        }

        { // has bad cycle
            final SeqGraph graph = new SeqGraph();
            graph.addVertices(v1, v2, v3, v4);
            graph.addEdges(new BaseEdge(false, 1), v4, v1, v2, v3, v1);
            // every edge has weight 1, below the prune factor of 2, so the whole graph
            // (including the entry vertex v4) is expected to be removed
            tests.add(new Object[]{"has bad cycle", graph, 2, Collections.emptySet()});
        }

        { // has good cycle
            final SeqGraph graph = new SeqGraph();
            graph.addVertices(v1, v2, v3, v4);
            graph.addEdges(new BaseEdge(false, 3), v4, v1, v2, v3, v1);
            // every edge has weight 3, above the prune factor of 2, so no vertex
            // (v4 included) is expected to be removed
            tests.add(new Object[]{"has good cycle", graph, 2, graph.vertexSet()});
        }

        { // two low-weight branches: everything is expected to be removed
            final SeqGraph graph = new SeqGraph();
            graph.addVertices(v1, v2, v3, v4, v5, v6);
            graph.addEdges(new BaseEdge(false, 1), v1, v2, v3, v4, v6);
            graph.addEdges(new BaseEdge(false, 1), v1, v2, v3, v5, v6);
            tests.add(new Object[]{"has two bad branches", graph, 2, Collections.emptySet()});
        }

        { // middle edge (v2 -> v3) above threshold => no one can be removed
            final SeqGraph graph = new SeqGraph();
            graph.addVertices(v1, v2, v3, v4, v5);
            graph.addEdges(new BaseEdge(false, 1), v1, v2);
            graph.addEdges(new BaseEdge(false, 3), v2, v3);
            graph.addEdges(new BaseEdge(false, 1), v3, v4, v5);
            tests.add(new Object[]{"middle vertex above factor", graph, 2, graph.vertexSet()});
        }

        { // the branching node has value > pruneFactor: nothing is expected to be removed
            final SeqGraph graph = new SeqGraph();
            graph.addVertices(v1, v2, v3, v4, v5, v6);
            graph.addEdges(new BaseEdge(false, 3), v1, v2);
            graph.addEdges(new BaseEdge(false, 3), v2, v3);
            graph.addEdges(new BaseEdge(false, 1), v3, v4, v6);
            graph.addEdges(new BaseEdge(false, 3), v2, v5, v6);
            tests.add(new Object[]{"branch node greater than pruneFactor", graph, 2, graph.vertexSet()});
        }

        { // A single isolated chain with weights all below pruning should be pruned; the separate weight-5 chain v4 -> v5 stays
            final SeqGraph graph = new SeqGraph();
            graph.addVertices(v1, v2, v3, v4, v5);
            graph.addEdges(new BaseEdge(false, 1), v1, v2, v3);
            graph.addEdges(new BaseEdge(false, 5), v4, v5);
            tests.add(new Object[]{"isolated chain", graph, 2, new LinkedHashSet<>(Arrays.asList(v4, v5))});
        }

        { // A chain with weights all below pruning should be pruned, even if it connects to another good chain
            final SeqGraph graph = new SeqGraph();
            graph.addVertices(v1, v2, v3, v4, v5, v6);
            graph.addEdges(new BaseEdge(false, 1), v1, v2, v3, v5);
            graph.addEdges(new BaseEdge(false, 5), v4, v5, v6);
            tests.add(new Object[]{"bad chain branching into good one", graph, 2, new HashSet<>(Arrays.asList(v4, v5, v6))});
        }

        return tests.toArray(new Object[][]{});
    }

    /**
     * Prunes the graph with the given prune factor and checks that exactly the expected vertices remain.
     *
     * @param name              human-readable name of the test case, reported when the assertion fails
     * @param graph             the graph to prune; it is modified by this test
     * @param pruneFactor       the factor handed to the {@link LowWeightChainPruner} constructor
     * @param remainingVertices the vertices expected to be in the graph after pruning
     */
    @Test(dataProvider = "pruneChainsData", enabled = true)
    public void testPruneChains(final String name, final SeqGraph graph, final int pruneFactor, final Set<SeqVertex> remainingVertices) {
        // snapshot the expectation before pruning: some rows pass graph.vertexSet() itself
        final Set<SeqVertex> expected = new HashSet<>(remainingVertices);

        // debugging aid: uncomment to dump the graph before pruning
        // graph.printGraph(new File("in.dot"), 0);
        final LowWeightChainPruner<SeqVertex, BaseEdge> pruner = new LowWeightChainPruner<>(pruneFactor);
        pruner.pruneLowWeightChains(graph);
        // debugging aid: uncomment to dump the graph after pruning
        // graph.printGraph(new File("out.dot"), 0);

        Assert.assertEquals(graph.vertexSet(), expected,
                "Unexpected vertices remaining after pruning in test case '" + name + "' with pruneFactor " + pruneFactor);
    }
}

View File

@ -0,0 +1,103 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Unit tests for the multiplicity counters exposed by {@link MultiSampleEdge}.
 */
public class MultiSampleEdgeUnitTest extends BaseTest {
    /**
     * Builds the rows consumed by {@link #testMultiplicity}.
     *
     * Each row holds a single list of per-sample counts, taken from what
     * Utils.makePermutations produces over the values 0..5 for list sizes 1 through 5.
     *
     * @return the test rows
     */
    @DataProvider(name = "MultiplicityData")
    public Object[][] makeMultiplicityData() {
        final List<Integer> possibleCounts = Arrays.asList(0, 1, 2, 3, 4, 5);
        final List<Object[]> rows = new ArrayList<Object[]>();

        for ( int nSamples = 1; nSamples <= 5; nSamples++ ) {
            for ( final List<Integer> countsForSamples : Utils.makePermutations(possibleCounts, nSamples, false) ) {
                rows.add(new Object[]{countsForSamples});
            }
        }

        return rows.toArray(new Object[rows.size()][]);
    }

    /**
     * Increments a fresh edge one unit at a time, sample by sample, flushing the single-sample
     * counter after each sample, and checks the counters along the way and at the end.
     *
     * @param countsPerSample number of increments to apply for each sample, in order
     */
    @Test(dataProvider = "MultiplicityData")
    public void testMultiplicity(final List<Integer> countsPerSample) {
        final MultiSampleEdge edge = new MultiSampleEdge(false, 0);

        // a freshly created edge reports zero for both multiplicities
        Assert.assertEquals(edge.getMultiplicity(), 0);
        Assert.assertEquals(edge.getPruningMultiplicity(), 0);

        int runningTotal = 0;
        for ( final int targetCount : countsPerSample ) {
            for ( int seenForSample = 1; seenForSample <= targetCount; seenForSample++ ) {
                edge.incMultiplicity(1);
                runningTotal++;
                // the overall count tracks every increment, the single-sample count only this sample's
                Assert.assertEquals(edge.getMultiplicity(), runningTotal);
                Assert.assertEquals(edge.getCurrentSingleSampleMultiplicity(), seenForSample);
            }
            // done with this sample
            edge.flushSingleSampleMultiplicity();
        }

        // the pruning and max-single-sample multiplicities should equal the largest per-sample count
        final Integer[] boxedCounts = countsPerSample.toArray(new Integer[countsPerSample.size()]);
        final int largestSampleCount = MathUtils.arrayMax(ArrayUtils.toPrimitive(boxedCounts));

        Assert.assertEquals(edge.getMultiplicity(), runningTotal);
        Assert.assertEquals(edge.getPruningMultiplicity(), largestSampleCount);
        Assert.assertEquals(edge.getMaxSingleSampleMultiplicity(), largestSampleCount);
    }
}

View File

@ -0,0 +1,80 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import net.sf.samtools.Cigar;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.Test;
/**
 * Unit tests for {@link Path#calculateCigar} when a short haplotype is aligned against a much
 * longer reference.
 */
public class PathUnitTest extends BaseTest {
    /**
     * For this reference/haplotype pair calculateCigar is expected to return null rather than a cigar.
     */
    @Test(enabled = true)
    public void testAlignReallyLongDeletion() {
        final String ref = "ATGGTGGCTCATACCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGAACATCACCTGAGGCCAGGAGTTCAAAACCAGCCTGGCTAACATAGCAAAACCCCATCTCTAATGAAAATACAAAAATTAGCTGGGTGTGGTGGTGTCCGCCTGTAGTCCCAGCTACTCAGGAGACTAAGGCATGAGAATCACTTGAACCCAGGATGCAGAGGCTGTAGTGAGCCGAGATTGCACCACGGCTGCACTCCAGCCTGGGCAACAGAGCGAGACTCTGTCTCAAATAAAATAGCGTAACGTAACATAACATAACATAACATAACATAACATAACATAACATAACATAACATAACATAACACAACAACAAAATAAAATAACATAAATCATGTTGTTAGGAAAAAAATCAGTTATGCAGCTACATGCTATTTACAAGAGATATACCTTAAAATATAAGACACAGAGGCCGGGCGCGGTAGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAAGCGGATCATGAGGTCAGGAGATCGAGACCATCC";
        final String hap = "ATGGTGGCTCATACCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAAGCGGATCATGAGGTCAGGAGATCGAGACCATCCT";

        final Cigar cigar = calculateCigarForSingleVertexPath(ref, hap);
        Assert.assertNull(cigar, "Should have failed gracefully");
    }

    /**
     * For this reference/haplotype pair calculateCigar is expected to report a single 419-base deletion.
     */
    @Test(enabled = true)
    public void testAlignReallyLongDeletion2() {
        final String ref = "CGGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGCCAGGCTGGTCTTGAACTCCTGACCTCAGGTGATCCACTCGCCTCGGTCTCCCAAAGTGTTGGGATTACAGGCATGAACCACTGCACCTGGCCTAGTGTTTGGGAAAACTATACTAGGAAAAGAATAGTTGCTTTAAGTCATTCTTTGATTATTCTGAGAATTGGCATATAGCTGCCATTATAACCTACTTTTGCTAAATATAATAATAATAATCATTATTTTTATTTTTTGAGACAGGGTCTTGTTTTGTCACCCCGGCTGGAGTGAAGTGGCGCAATCTCGGCTCACTGCAACCTCCACCTCCGGGTGCAAGCAATTCTCCTGCCTCAGCCTCTTGAGTAGCTAGGATTACAGGCACAAGCCATCATGCCCAGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGTCAGGCTGGTCTTGAACTCCTGACCTCAGGT";
        final String hap = "CGGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGTCAGGCTGGTCTTGAACTCCTGACCTCAGGT";

        final Cigar cigar = calculateCigarForSingleVertexPath(ref, hap);
        // calculateCigar can return null (see testAlignReallyLongDeletion); fail with a clear
        // assertion message instead of a NullPointerException from cigar.toString()
        Assert.assertNotNull(cigar, "calculateCigar returned null but a cigar with a long deletion was expected");
        Assert.assertEquals(cigar.toString(), "48M419D30M");
    }

    /**
     * Builds a graph holding a single vertex with the haplotype bases, wraps that vertex in a
     * path and asks the path for its cigar against the reference.
     *
     * @param ref the reference bases
     * @param hap the haplotype bases, used as the sequence of the only vertex in the graph
     * @return whatever Path.calculateCigar returns for the reference bytes, which may be null
     */
    private static Cigar calculateCigarForSingleVertexPath(final String ref, final String hap) {
        final SeqGraph graph = new SeqGraph();
        final SeqVertex v = new SeqVertex(hap);
        graph.addVertex(v);
        final Path<SeqVertex,BaseEdge> path = new Path<SeqVertex,BaseEdge>(v, graph);
        return path.calculateCigar(ref.getBytes());
    }
}

View File

@ -280,16 +280,15 @@ public class SeqGraphUnitTest extends BaseTest {
all.addEdges(pre2, top, middle2, bottom, tail2);
final SeqGraph expected = new SeqGraph();
SeqVertex newPre1 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "C");
SeqVertex newPre2 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "G");
final SeqVertex newTop = new SeqVertex("TA");
final SeqVertex newMiddle1 = new SeqVertex("G");
final SeqVertex newMiddle2 = new SeqVertex("T");
final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString());
final SeqVertex newTop = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES));
final SeqVertex newTopDown1 = new SeqVertex("G");
final SeqVertex newTopDown2 = new SeqVertex("C");
final SeqVertex newTopBottomMerged = new SeqVertex("TA");
expected.addVertices(newTop, newTopDown1, newTopDown2, newTopBottomMerged, newMiddle1, newMiddle2, newBottom, tail1, tail2);
expected.addEdges(newTop, newTopDown1, newTopBottomMerged, newMiddle1, newBottom, tail1);
expected.addEdges(newTop, newTopDown2, newTopBottomMerged, newMiddle2, newBottom, tail2);
expected.addVertices(newPre1, newPre2, newTop, newMiddle1, newMiddle2, newBottom, tail1, tail2);
expected.addEdges(newPre1, newTop, newMiddle1, newBottom, tail1);
expected.addEdges(newPre2, newTop, newMiddle2, newBottom, tail2);
tests.add(new Object[]{all.clone(), expected.clone()});
}

View File

@ -227,8 +227,8 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest {
}
final Set<String> haplotypes = new HashSet<String>();
final List<Path<SeqVertex>> originalPaths = new KBestPaths<SeqVertex>().getKBestPaths((SeqGraph)graph.clone());
for ( final Path<SeqVertex> path : originalPaths )
final List<Path<SeqVertex,BaseEdge>> originalPaths = new KBestPaths<SeqVertex,BaseEdge>().getKBestPaths((SeqGraph)graph.clone());
for ( final Path<SeqVertex,BaseEdge> path : originalPaths )
haplotypes.add(new String(path.getBases()));
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v);
@ -238,8 +238,8 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest {
splitter.updateGraph(top, bot);
if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0);
final List<Path<SeqVertex>> splitPaths = new KBestPaths<SeqVertex>().getKBestPaths(graph);
for ( final Path<SeqVertex> path : splitPaths ) {
final List<Path<SeqVertex,BaseEdge>> splitPaths = new KBestPaths<SeqVertex,BaseEdge>().getKBestPaths(graph);
for ( final Path<SeqVertex,BaseEdge> path : splitPaths ) {
final String h = new String(path.getBases());
Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
}

View File

@ -0,0 +1,213 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.File;
import java.util.*;
/**
 * Unit tests that feed a reference sequence plus zero or more read sequences to a
 * {@link ReadThreadingAssembler} and check the shape of the resulting {@link SeqGraph}
 * (a single linear vertex, or exactly two paths forming one bubble).
 */
public class ReadThreadingAssemblerUnitTest extends BaseTest {
    /** When true, every test declared with {@code enabled = !DEBUG} is switched off and assembled graphs are dumped to test.dot. */
    private final static boolean DEBUG = false;

    /**
     * Thin harness around a {@link ReadThreadingAssembler} configured with a single kmer size.
     * It collects one reference haplotype and any number of artificial reads, then assembles them.
     */
    private static class TestAssembler {
        final ReadThreadingAssembler assembler;
        Haplotype refHaplotype;
        final List<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();

        private TestAssembler(final int kmerSize) {
            this.assembler = new ReadThreadingAssembler(100000, Arrays.asList(kmerSize));
            assembler.setJustReturnRawGraph(true);
            assembler.setPruneFactor(0);
        }

        /**
         * Register a sequence with this harness.
         *
         * @param bases the bases of the sequence
         * @param isRef if true the bases become the reference haplotype; otherwise they are wrapped
         *              in an artificial read (all base qualities 30, cigar of {@code <length>M})
         */
        public void addSequence(final byte[] bases, final boolean isRef) {
            if ( ! isRef ) {
                final GATKSAMRecord artificialRead =
                        ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte)30,bases.length), bases.length + "M");
                reads.add(artificialRead);
                return;
            }

            refHaplotype = new Haplotype(bases, true);
        }

        /**
         * Run the assembler over the collected reads and reference.
         *
         * @return the first graph returned by the assembler
         */
        public SeqGraph assemble() {
            assembler.removePathsNotConnectedToRef = false; // need to pass some of the tests
            assembler.setDebugGraphTransformations(true);
            final SeqGraph firstGraph = assembler.assemble(reads, refHaplotype).get(0);
            if ( DEBUG ) {
                firstGraph.printGraph(new File("test.dot"), 0);
            }
            return firstGraph;
        }
    }

    /**
     * Build a harness with the given kmer size, loading {@code ref} as the reference and each
     * of {@code reads}, in order, as a read. Bases are taken from {@link String#getBytes()}.
     */
    private static TestAssembler assemblerFor(final int kmerSize, final String ref, final String... reads) {
        final TestAssembler harness = new TestAssembler(kmerSize);
        harness.addSequence(ref.getBytes(), true);
        for ( final String readBases : reads ) {
            harness.addSequence(readBases.getBytes(), false);
        }
        return harness;
    }

    /** Assemble, simplify, and require the graph to consist of exactly one vertex spelling {@code seq}. */
    private void assertLinearGraph(final TestAssembler assembler, final String seq) {
        final SeqGraph simplified = assembler.assemble();
        simplified.simplifyGraph();

        Assert.assertEquals(simplified.vertexSet().size(), 1);
        Assert.assertEquals(simplified.vertexSet().iterator().next().getSequenceString(), seq);
    }

    /**
     * Assemble, simplify, and require the k-best-paths search to return exactly two paths
     * whose bases spell {@code one} and {@code two} (in either order).
     */
    private void assertSingleBubble(final TestAssembler assembler, final String one, final String two) {
        final SeqGraph simplified = assembler.assemble();
        simplified.simplifyGraph();

        final List<Path<SeqVertex,BaseEdge>> bestPaths = new KBestPaths<SeqVertex,BaseEdge>().getKBestPaths(simplified);
        Assert.assertEquals(bestPaths.size(), 2);

        final Set<String> stillExpected = new HashSet<String>(Arrays.asList(one, two));
        for ( final Path<SeqVertex,BaseEdge> candidate : bestPaths ) {
            // Set.remove reports whether the element was present, so a single call both
            // checks membership and consumes the expectation
            Assert.assertTrue(stillExpected.remove(new String(candidate.getBases())));
        }
    }

    @Test(enabled = ! DEBUG)
    public void testRefCreation() {
        final String ref = "ACGTAACCGGTT";
        assertLinearGraph(assemblerFor(3, ref), ref);
    }

    @Test(enabled = ! DEBUG)
    public void testRefNonUniqueCreation() {
        final String ref = "GAAAAT";
        assertLinearGraph(assemblerFor(3, ref), ref);
    }

    @Test(enabled = ! DEBUG)
    public void testRefAltCreation() {
        final String ref = "ACAACTGA";
        final String alt = "ACAGCTGA";
        assertSingleBubble(assemblerFor(3, ref, alt), ref, alt);
    }

    @Test(enabled = ! DEBUG)
    public void testPartialReadsCreation() {
        final String ref = "ACAACTGA";
        final String alt1 = "ACAGCT";
        final String alt2 = "GCTGA";
        assertSingleBubble(assemblerFor(3, ref, alt1, alt2), ref, "ACAGCTGA");
    }

    @Test(enabled = ! DEBUG)
    public void testStartInMiddle() {
        final String ref = "CAAAATG";
        final String read = "AAATG";
        assertLinearGraph(assemblerFor(3, ref, read), ref);
    }

    @Test(enabled = ! DEBUG)
    public void testStartInMiddleWithBubble() {
        final String ref = "CAAAATGGGG";
        final String read = "AAATCGGG";
        assertSingleBubble(assemblerFor(3, ref, read), ref, "CAAAATCGGG");
    }

    // NOTE(review): inputs and expectation are identical to testStartInMiddleWithBubble -- confirm intended
    @Test(enabled = ! DEBUG)
    public void testNoGoodStarts() {
        final String ref = "CAAAATGGGG";
        final String read = "AAATCGGG";
        assertSingleBubble(assemblerFor(3, ref, read), ref, "CAAAATCGGG");
    }

    @Test(enabled = !DEBUG)
    public void testCreateWithBasesBeforeRefSource() {
        final String ref = "ACTG";
        final String read = "CTGGGACT";

        final TestAssembler harness = new TestAssembler(3);
        harness.addSequence(ReadThreadingGraphUnitTest.getBytes(ref), true);
        harness.addSequence(ReadThreadingGraphUnitTest.getBytes(read), false);

        assertLinearGraph(harness, "ACTGGGACT");
    }

    @Test(enabled = !DEBUG)
    public void testSingleIndelAsDoubleIndel3Reads() {
        // The single indel spans two repetitive structures
        final String ref = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCTCTCTGTGTGTGTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG";
        final String read1 = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCT----------GTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG";
        final String read2 = "GTTTTTCCTAGGCAAATGGTTTCTATAAAATTATGTGTGTGTGTCTCT----------GTGTGTGTGTGTGTGTGTATACCTAATCTCACACTCTTTTTTCTGG";

        final TestAssembler harness = new TestAssembler(25);
        harness.addSequence(ReadThreadingGraphUnitTest.getBytes(ref), true);
        harness.addSequence(ReadThreadingGraphUnitTest.getBytes(read1), false);
        harness.addSequence(ReadThreadingGraphUnitTest.getBytes(read2), false);

        final SeqGraph rawGraph = harness.assemble();
        final List<Path<SeqVertex,BaseEdge>> bestPaths = new KBestPaths<SeqVertex,BaseEdge>().getKBestPaths(rawGraph);
        Assert.assertEquals(bestPaths.size(), 2);

        // whichever path has the reference's length is treated as the reference path; the other is the alt
        final boolean firstIsRef = bestPaths.get(0).getBases().length == ref.length();
        final byte[] refPath = bestPaths.get(firstIsRef ? 0 : 1).getBases();
        final byte[] altPath = bestPaths.get(firstIsRef ? 1 : 0).getBases();

        Assert.assertEquals(refPath, ReadThreadingGraphUnitTest.getBytes(ref));
        Assert.assertEquals(altPath, ReadThreadingGraphUnitTest.getBytes(read1));
    }
}

View File

@ -0,0 +1,191 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.File;
import java.util.*;
/**
 * Unit tests for {@link ReadThreadingGraph}: detection of non-unique kmers and the
 * multiplicities assigned to edges when reads start part-way into the reference.
 */
public class ReadThreadingGraphUnitTest extends BaseTest {
    /** When true, every test declared with {@code enabled = !DEBUG} is switched off and debug graph dumps are written. */
    private final static boolean DEBUG = false;

    /**
     * Convert an alignment-style string to bases by dropping every {@code '-'} character.
     *
     * @param alignment a sequence string that may contain {@code '-'} padding
     * @return the bytes of {@code alignment} with all {@code '-'} removed (platform default charset)
     */
    public static byte[] getBytes(final String alignment) {
        return alignment.replace("-","").getBytes();
    }

    /**
     * Build the graph if needed and require its set of non-unique kmers to equal {@code nonUniques}.
     *
     * @param assembler  the graph under test
     * @param nonUniques the exact kmer strings expected to be reported as non-unique
     */
    private void assertNonUniques(final ReadThreadingGraph assembler, String ... nonUniques) {
        final Set<String> actual = new HashSet<>();
        assembler.buildGraphIfNecessary();
        for ( final Kmer kmer : assembler.getNonUniqueKmers() ) actual.add(kmer.baseString());
        final Set<String> expected = new HashSet<>(Arrays.asList(nonUniques));
        Assert.assertEquals(actual, expected);
    }

    @Test(enabled = ! DEBUG)
    public void testNonUniqueMiddle() {
        final ReadThreadingGraph assembler = new ReadThreadingGraph(3);
        final String ref = "GACACACAGTCA";
        final String read1 = "GACAC---GTCA";
        final String read2 = "CAC---GTCA";
        assembler.addSequence(getBytes(ref), true);
        assembler.addSequence(getBytes(read1), false);
        assembler.addSequence(getBytes(read2), false);
        assertNonUniques(assembler, "ACA", "CAC");
    }

    @Test(enabled = ! DEBUG)
    public void testReadsCreateNonUnique() {
        final ReadThreadingGraph assembler = new ReadThreadingGraph(3);
        final String ref = "GCAC--GTCA"; // CAC is unique
        final String read1 = "GCACACGTCA"; // makes CAC non unique because it has a duplication
        final String read2 = "CACGTCA"; // shouldn't be allowed to match CAC as start
        assembler.addSequence(getBytes(ref), true);
        assembler.addSequence(getBytes(read1), false);
        assembler.addSequence(getBytes(read2), false);
        // assembler.convertToSequenceGraph().printGraph(new File("test.dot"), 0);
        assertNonUniques(assembler, "CAC");
        //assertSingleBubble(assembler, ref, "CAAAATCGGG");
    }

    /**
     * Edges touching a vertex whose suffix is 'N' must have multiplicity 1;
     * every other edge must have multiplicity 2.
     */
    @Test(enabled = ! DEBUG)
    public void testCountingOfStartEdges() {
        final ReadThreadingGraph assembler = new ReadThreadingGraph(3);
        final String ref = "NNNGTCAAA"; // ref has some bases before start
        final String read1 = "GTCAAA"; // starts at first non N base
        assembler.addSequence(getBytes(ref), true);
        assembler.addSequence(getBytes(read1), false);
        assembler.buildGraphIfNecessary();
        // assembler.printGraph(new File("test.dot"), 0);
        for ( final MultiSampleEdge edge : assembler.edgeSet() ) {
            final MultiDeBruijnVertex source = assembler.getEdgeSource(edge);
            final MultiDeBruijnVertex target = assembler.getEdgeTarget(edge);
            final boolean headerVertex = source.getSuffix() == 'N' || target.getSuffix() == 'N';
            if ( headerVertex ) {
                Assert.assertEquals(edge.getMultiplicity(), 1, "Bases in the unique reference header should have multiplicity of 1");
            } else {
                Assert.assertEquals(edge.getMultiplicity(), 2, "Should have multiplicity of 2 for any edge outside the ref header but got " + edge + " " + source + " -> " + target);
            }
        }
    }

    /**
     * With {@code increaseCountsThroughBranches} enabled, edges into the vertices listed in
     * {@code oneCountVertices} must have multiplicity 1, edges into those listed in
     * {@code threeCountVertices} must have multiplicity 3, and every other edge multiplicity 2.
     */
    @Test(enabled = !DEBUG)
    public void testCountingOfStartEdgesWithMultiplePrefixes() {
        final ReadThreadingGraph assembler = new ReadThreadingGraph(3);
        assembler.increaseCountsThroughBranches = true;
        final String ref = "NNNGTCAXX"; // ref has some bases before start
        final String alt1 = "NNNCTCAXX"; // alt1 has SNP right after N
        final String read = "TCAXX"; // starts right after SNP, but merges right before branch
        assembler.addSequence(getBytes(ref), true);
        assembler.addSequence(getBytes(alt1), false);
        assembler.addSequence(getBytes(read), false);
        assembler.buildGraphIfNecessary();
        // Only dump the graph when debugging: an unconditional dump writes test.dot into the
        // working directory on every test run
        if ( DEBUG ) assembler.printGraph(new File("test.dot"), 0);
        final List<String> oneCountVertices = Arrays.asList("NNN", "NNG", "NNC", "NGT", "NCT");
        final List<String> threeCountVertices = Arrays.asList("CAX", "AXX");
        for ( final MultiSampleEdge edge : assembler.edgeSet() ) {
            final MultiDeBruijnVertex source = assembler.getEdgeSource(edge);
            final MultiDeBruijnVertex target = assembler.getEdgeTarget(edge);
            final int expected = oneCountVertices.contains(target.getSequenceString()) ? 1 : (threeCountVertices.contains(target.getSequenceString()) ? 3 : 2);
            Assert.assertEquals(edge.getMultiplicity(), expected, "Bases at edge " + edge + " from " + source + " to " + target + " has bad multiplicity");
        }
    }

    // TODO -- update to use determineKmerSizeAndNonUniques directly
//    @DataProvider(name = "KmerSizeData")
//    public Object[][] makeKmerSizeDataProvider() {
//        List<Object[]> tests = new ArrayList<Object[]>();
//
//        // this functionality can be adapted to provide input data for whatever you might want in your data
//        tests.add(new Object[]{3, 3, 3, Arrays.asList("ACG"), Arrays.asList()});
//        tests.add(new Object[]{3, 4, 3, Arrays.asList("CAGACG"), Arrays.asList()});
//
//        tests.add(new Object[]{3, 3, 3, Arrays.asList("AAAAC"), Arrays.asList("AAA")});
//        tests.add(new Object[]{3, 4, 4, Arrays.asList("AAAAC"), Arrays.asList()});
//        tests.add(new Object[]{3, 5, 4, Arrays.asList("AAAAC"), Arrays.asList()});
//        tests.add(new Object[]{3, 4, 3, Arrays.asList("CAAA"), Arrays.asList()});
//        tests.add(new Object[]{3, 4, 4, Arrays.asList("CAAAA"), Arrays.asList()});
//        tests.add(new Object[]{3, 5, 4, Arrays.asList("CAAAA"), Arrays.asList()});
//        tests.add(new Object[]{3, 5, 5, Arrays.asList("ACGAAAAACG"), Arrays.asList()});
//
//        for ( int maxSize = 3; maxSize < 20; maxSize++ ) {
//            for ( int dupSize = 3; dupSize < 20; dupSize++ ) {
//                final int expectedSize = Math.min(maxSize, dupSize);
//                final String dup = Utils.dupString("C", dupSize);
//                final List<String> nonUnique = dupSize > maxSize ? Arrays.asList(Utils.dupString("C", maxSize)) : Collections.<String>emptyList();
//                tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("ACGT", "A" + dup + "GT"), nonUnique});
//                tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("A" + dup + "GT", "ACGT"), nonUnique});
//            }
//        }
//
//        return tests.toArray(new Object[][]{});
//    }
//
//    /**
//     * Example testng test using MyDataProvider
//     */
//    @Test(dataProvider = "KmerSizeData")
//    public void testDynamicKmerSizing(final int min, final int max, final int expectKmer, final List<String> seqs, final List<String> expectedNonUniques) {
//        final ReadThreadingGraph assembler = new ReadThreadingGraph(min, max);
//        for ( String seq : seqs ) assembler.addSequence(seq.getBytes(), false);
//        assembler.buildGraphIfNecessary();
//        Assert.assertEquals(assembler.getKmerSize(), expectKmer);
//        assertNonUniques(assembler, expectedNonUniques.toArray(new String[]{}));
//    }
}

View File

@ -0,0 +1,80 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.testng.Assert;
import org.testng.annotations.Test;
/**
 * Unit tests for {@link SequenceForKmers}: checks that the constructor arguments are exposed
 * unchanged through the public fields and that {@code getCount} reports the expected per-position count.
 */
public class SequenceForKmersUnitTest extends BaseTest {
    /**
     * With a {@code null} counts array, every position of the sequence is expected to report a count of 1.
     */
    @Test
    public void testNoCount() {
        final byte[] bases = "ACGT".getBytes();
        final SequenceForKmers sequenceForKmers = new SequenceForKmers("foo", bases, 0, bases.length, null, true);

        // the constructor arguments must be visible, unmodified, through the public fields
        Assert.assertEquals(sequenceForKmers.name, "foo");
        Assert.assertEquals(sequenceForKmers.sequence, bases);
        Assert.assertEquals(sequenceForKmers.start, 0);
        Assert.assertEquals(sequenceForKmers.stop, bases.length);
        Assert.assertEquals(sequenceForKmers.isRef, true);

        for ( int position = 0; position < bases.length; position++ ) {
            Assert.assertEquals(sequenceForKmers.getCount(position), 1);
        }
    }

    /**
     * With an explicit counts array, {@code getCount(i)} is expected to return the i-th element of that array.
     */
    @Test
    public void testWithCounts() {
        final int length = 256;

        // counts[i] == i, so every position carries a distinct, easily predicted count
        final int[] expectedCounts = new int[length];
        for ( int position = 0; position < length; position++ ) {
            expectedCounts[position] = position;
        }

        final byte[] bases = Utils.dupBytes((byte)'A', length);
        final SequenceForKmers sequenceForKmers = new SequenceForKmers("foo", bases, 0, bases.length, expectedCounts, true);

        for ( int position = 0; position < bases.length; position++ ) {
            Assert.assertEquals(sequenceForKmers.getCount(position), position);
        }
    }
}

View File

@ -0,0 +1,108 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.List;
/**
 * Unit tests for {@link ConstrainedMateFixingManager}.
 */
public class ConstrainedMateFixingManagerUnitTest extends BaseTest {
    /** SAM flags 99 = 0x1 (paired) + 0x2 (proper pair) + 0x20 (mate on reverse strand) + 0x40 (first of pair). */
    private static final int FIRST_OF_PROPER_PAIR_MATE_REVERSE_FLAGS = 99;

    /** SAM flags 147 = 0x1 (paired) + 0x2 (proper pair) + 0x10 (read on reverse strand) + 0x80 (second of pair). */
    private static final int SECOND_OF_PROPER_PAIR_REVERSE_FLAGS = 147;

    /** SAM flags 393 = 0x1 (paired) + 0x8 (mate unmapped) + 0x80 (second of pair) + 0x100 (not primary alignment). */
    private static final int SECOND_OF_PAIR_MATE_UNMAPPED_NOT_PRIMARY_FLAGS = 393;

    private static SAMFileHeader header;
    private static GenomeLocParser genomeLocParser;

    /**
     * Creates the artificial SAM header and the matching genome loc parser shared by the tests in this class.
     */
    @BeforeClass
    public void beforeClass() {
        header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 100);
        genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
    }

    /**
     * Adds a moved first-of-pair read, a non-primary copy of its mate, and the primary mate to the manager,
     * then checks the queued reads: the first read and the primary mate must end up with insert sizes
     * 23 / -23, while the non-primary copy must keep its original flags and its original insert size of -21.
     */
    @Test
    public void testSecondaryAlignmentsDoNotInterfere() {
        final List<GATKSAMRecord> properReads = ArtificialSAMUtils.createPair(header, "foo", 1, 10, 30, true, false);

        final GATKSAMRecord read1 = properReads.get(0);
        read1.setAlignmentStart(8); // move the read
        read1.setFlags(FIRST_OF_PROPER_PAIR_MATE_REVERSE_FLAGS);

        final GATKSAMRecord read2Primary = properReads.get(1);
        read2Primary.setFlags(SECOND_OF_PROPER_PAIR_REVERSE_FLAGS);

        // moving read1 did not touch the stored insert size, so it still carries the value from pair creation
        Assert.assertEquals(read1.getInferredInsertSize(), 21);

        // copy of the mate that is flagged as a non-primary alignment
        final GATKSAMRecord read2NonPrimary = new GATKSAMRecord(read2Primary);
        read2NonPrimary.setFlags(SECOND_OF_PAIR_MATE_UNMAPPED_NOT_PRIMARY_FLAGS);

        final ConstrainedMateFixingManager manager = new ConstrainedMateFixingManager(null, genomeLocParser, 1000, 1000, 1000);
        manager.addRead(read1, true, false);
        // the non-primary copy is added before the primary mate on purpose
        manager.addRead(read2NonPrimary, false, false);
        manager.addRead(read2Primary, false, false);

        Assert.assertEquals(manager.getNReadsInQueue(), 3);

        for ( final SAMRecord read : manager.getReadsInQueueForTesting() ) {
            if ( read.getFirstOfPairFlag() ) {
                // read1: flags untouched, insert size updated
                Assert.assertEquals(read.getFlags(), FIRST_OF_PROPER_PAIR_MATE_REVERSE_FLAGS);
                Assert.assertEquals(read.getInferredInsertSize(), 23);
            } else if ( read.getNotPrimaryAlignmentFlag() ) {
                // non-primary copy: neither flags nor insert size may change
                Assert.assertEquals(read.getFlags(), SECOND_OF_PAIR_MATE_UNMAPPED_NOT_PRIMARY_FLAGS);
                Assert.assertEquals(read.getInferredInsertSize(), -21);
            } else {
                // primary mate: flags untouched, insert size updated
                Assert.assertEquals(read.getFlags(), SECOND_OF_PROPER_PAIR_REVERSE_FLAGS);
                Assert.assertEquals(read.getInferredInsertSize(), -23);
            }
        }
    }
}

View File

@ -75,26 +75,37 @@ public class ReadGroupCovariateUnitTest {
final String expected = "SAMPLE.1";
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID");
rg.setPlatformUnit(expected);
runTest(rg, expected);
runTest(rg, expected, covariate);
}
@Test(enabled = true)
public void testMissingPlatformUnit() {
final String expected = "MY.7";
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected);
runTest(rg, expected);
runTest(rg, expected, covariate);
}
private void runTest(GATKSAMReadGroupRecord rg, String expected) {
/**
 * A covariate initialized with FORCE_READGROUP set must report the forced name for a read
 * whose own read group has a different ID.
 */
@Test(enabled = true)
public void testForceReadgroup() {
    final String forcedName = "FOO";

    // argument collection that forces the read group
    final RecalibrationArgumentCollection argumentsWithForcedGroup = new RecalibrationArgumentCollection();
    argumentsWithForcedGroup.FORCE_READGROUP = forcedName;

    // dedicated covariate, initialized from the forcing arguments
    final ReadGroupCovariate covariateUnderTest = new ReadGroupCovariate();
    covariateUnderTest.initialize(argumentsWithForcedGroup);

    // the read's own group deliberately differs from the forced name
    final GATKSAMReadGroupRecord differentGroup = new GATKSAMReadGroupRecord("NOT_FOO");
    runTest(differentGroup, forcedName, covariateUnderTest);
}
private static void runTest(final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) {
GATKSAMRecord read = ReadUtils.createRandomRead(10);
read.setReadGroup(rg);
ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1);
covariate.recordValues(read, readCovariates);
verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected);
verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected, covariate);
}
private void verifyCovariateArray(int[][] values, String expected) {
private static void verifyCovariateArray(final int[][] values, final String expected, final ReadGroupCovariate covariate) {
for (int[] value : values) {
String actual = covariate.formatKey(value[0]);
Assert.assertEquals(actual, expected);

View File

@ -0,0 +1,259 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.utils.smithwaterman;
import net.sf.samtools.TextCigarCodec;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Unit tests for {@link GlobalEdgeGreedySWPairwiseAlignment}.
 *
 * Each test aligns a second sequence (read / alt) against a first sequence (reference) and compares the
 * resulting CIGAR, and where relevant the alignment start, against a hand-derived expectation.
 */
public class GlobalEdgeGreedySWPairwiseAlignmentUnitTest extends BaseTest {
    /** When true, all regular tests are disabled and only {@link #testDebugging()} is enabled. */
    private final static boolean DEBUG = false;

    /** A read needing a single-base deletion near its start must align as 1M1D11M. */
    @Test(enabled = !DEBUG)
    public void testReadAlignedToRefComplexAlignment() {
        final String reference = "AAAGGACTGACTG";
        final String read = "ACTGACTGACTG";
        final GlobalEdgeGreedySWPairwiseAlignment sw = new GlobalEdgeGreedySWPairwiseAlignment(reference.getBytes(), read.getBytes());
        Assert.assertEquals(sw.getCigar().toString(), "1M1D11M");
    }

    /** Reference-only prefix and read-only suffix must be reported as a leading deletion and a trailing insertion. */
    @Test(enabled = !DEBUG)
    public void testIndelsAtStartAndEnd() {
        final String match = "CCCCC";
        final String reference = "AAA" + match;
        final String read = match + "GGG";
        final int expectedStart = 0;
        final String expectedCigar = "3D5M3I";
        final GlobalEdgeGreedySWPairwiseAlignment sw = new GlobalEdgeGreedySWPairwiseAlignment(reference.getBytes(), read.getBytes());
        Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart);
        Assert.assertEquals(sw.getCigar().toString(), expectedCigar);
    }

    /** Repetitive sequences where the alt is 6 bases longer than the ref; expected alignment is 6I45M at start 0. */
    @Test(enabled = !DEBUG)
    public void testDegenerateAlignmentWithIndelsAtBothEnds() {
        logger.warn("testDegenerateAlignmentWithIndelsAtBothEnds");
        final String ref = "TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA";
        final String alt = "ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA";
        final int expectedStart = 0;
        final String expectedCigar = "6I45M";
        final GlobalEdgeGreedySWPairwiseAlignment sw = new GlobalEdgeGreedySWPairwiseAlignment(ref.getBytes(), alt.getBytes(), SWParameterSet.STANDARD_NGS);
        Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart);
        Assert.assertEquals(sw.getCigar().toString(), expectedCigar);
    }

    /** An alt missing a 419-base stretch of the reference must be aligned with a single long deletion. */
    @Test(enabled = !DEBUG)
    public void testAlignReallyLongDeletion() {
        final String ref = "CGGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGCCAGGCTGGTCTTGAACTCCTGACCTCAGGTGATCCACTCGCCTCGGTCTCCCAAAGTGTTGGGATTACAGGCATGAACCACTGCACCTGGCCTAGTGTTTGGGAAAACTATACTAGGAAAAGAATAGTTGCTTTAAGTCATTCTTTGATTATTCTGAGAATTGGCATATAGCTGCCATTATAACCTACTTTTGCTAAATATAATAATAATAATCATTATTTTTATTTTTTGAGACAGGGTCTTGTTTTGTCACCCCGGCTGGAGTGAAGTGGCGCAATCTCGGCTCACTGCAACCTCCACCTCCGGGTGCAAGCAATTCTCCTGCCTCAGCCTCTTGAGTAGCTAGGATTACAGGCACAAGCCATCATGCCCAGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGTCAGGCTGGTCTTGAACTCCTGACCTCAGGT";
        final String alt = "CGGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGTCAGGCTGGTCTTGAACTCCTGACCTCAGGT";
        final GlobalEdgeGreedySWPairwiseAlignment sw = new GlobalEdgeGreedySWPairwiseAlignment(ref.getBytes(), alt.getBytes(), SWParameterSet.STANDARD_NGS);
        Assert.assertEquals(sw.getAlignmentStart2wrt1(), 0);
        Assert.assertEquals(sw.getCigar().toString(), "47M419D31M");
    }

    /**
     * Alignment parameter set exposed as a public constant. It is not referenced by any test in this class;
     * NOTE(review): presumably kept for use by other test classes -- confirm before removing.
     */
    public static final Parameters params = new Parameters(20.0, -10.0, -26.0, -1.1);

    /**
     * Builds the test cases for {@link #testSW(String, String, String)}.
     *
     * @return rows of {first sequence, second sequence, expected CIGAR string}; the expected CIGAR may
     *         contain zero-length or adjacent same-operator elements, since the consumer consolidates it
     *         before comparing
     */
    @DataProvider(name = "SWData")
    public Object[][] makeSWData() {
        List<Object[]> tests = new ArrayList<Object[]>();

        // simple cases
        tests.add(new Object[]{"A", "C", "1M"});
        tests.add(new Object[]{"AAA", "AAA", "3M"});
        tests.add(new Object[]{"AAA", "AGA", "3M"});
        tests.add(new Object[]{"AAA", "GAA", "3M"});
        tests.add(new Object[]{"AAA", "AAG", "3M"});

        // small single indels
        tests.add(new Object[]{"ACACACAC", "ACACAC", "6M2D"});
        tests.add(new Object[]{"ACACAC", "ACACACAC", "6M2I"});
        tests.add(new Object[]{"XXACACACXX", "XXACACACACXX", "8M2I2M"});
        tests.add(new Object[]{"XXACACACXX", "XXACACXX", "6M2D2M"});
        tests.add(new Object[]{"ACGT", "AACGT", "1I4M"});
        tests.add(new Object[]{"ACGT", "ACCGT", "2M1I2M"});
        tests.add(new Object[]{"ACGT", "ACGGT", "3M1I1M"});
        tests.add(new Object[]{"ACGT", "ACGTT", "4M1I"});
        tests.add(new Object[]{"ACGT", "CGT", "1D3M"});
        tests.add(new Object[]{"ACGT", "AGT", "1M1D2M"});
        tests.add(new Object[]{"ACGT", "ACT", "2M1D1M"});
        tests.add(new Object[]{"ACGT", "ACG", "3M1D"});

        // mismatches through out the sequences
        final String ref = "ACGTAACCGGTT";
        for ( int diff = 0; diff < ref.length(); diff++ ) {
            final byte[] altBases = ref.getBytes();
            altBases[diff] = 'N';
            tests.add(new Object[]{ref, new String(altBases), ref.length() + "M"});
        }
        // every pair of mismatch positions (diff1 == diff2 degenerates to a single mismatch)
        for ( int diff1 = 0; diff1 < ref.length(); diff1++ ) {
            for ( int diff2 = 0; diff2 < ref.length(); diff2++ ) {
                final byte[] altBases = ref.getBytes();
                altBases[diff1] = 'N';
                altBases[diff2] = 'N';
                tests.add(new Object[]{ref, new String(altBases), ref.length() + "M"});
            }
        }

        // prefixes and suffixes matching
        final String totalPrefix = "ACG";
        final String totalSuffix = "GCT";
        for ( int prefixSize = 0; prefixSize < totalPrefix.length(); prefixSize++) {
            // bound the suffix size by the suffix itself, so the substring below stays valid
            // even if the prefix and suffix constants are ever given different lengths
            for ( int suffixSize = 0; suffixSize < totalSuffix.length(); suffixSize++) {
                if ( prefixSize + suffixSize == 0 )
                    continue;
                for ( int indelSize = 1; indelSize < 50; indelSize++ ) {
                    final String prefix = totalPrefix.substring(0, prefixSize);
                    final String suffix = totalSuffix.substring(0, suffixSize);
                    final String insert = Utils.dupString("N", indelSize);
                    tests.add(new Object[]{prefix + suffix, prefix + insert + suffix, prefix.length() + "M" + indelSize + "I" + suffix.length() + "M"});
                    tests.add(new Object[]{prefix + insert + suffix, prefix + suffix, prefix.length() + "M" + indelSize + "D" + suffix.length() + "M"});
                }
            }
        }

        // larger indels with prefixes/suffixes
        tests.add(new Object[]{"ACTGTTTTGAACATCAGTTATTTTAAACTTTTAAGTTGTTAGCACAGCAAAAGCAACAAAATTCTAAGTGCAGTAATCACTTTACTGCGTGGTCATATGAAATCAAGGCAATGTTATGAGTATTACTGGAAAGCTGGACAGAGTAACGGGAAAAGTGACTAAAACTATGC", "CCTGTTTTGAACATCAGTTATTTTAAACTTTTAAGTTGTTAGCACAGCAAAAGCAACAAAATTCTAAGTGCAGTAATCACTTTACTGCGTGGTCATATGAAATCAAGGCAATGTTATGAGTATTACTGGAAAGCTGGACAGAGTAACGGGAAAAGTGACT", "160M10D"});
        tests.add(new Object[]{"LLLLLTATTAAGTAGTGCTCTATGTTGTCAACTAATTTATTTCCCATTTCAAACATTAGTTGACATGTTTTCATTTCTCTTTTGGAAGGAAACAACTAAATATGTTATCAATCCATCATTTACTTGTACAATAAATAAAGTTCTAAATCACTGCACAGTGTAAAATGGCAAATAGACTTCCCCATAACACAAAGCCATCCTGAAAAGTTTTGTTCATTTTAGAAGRRRRR", "LLLLLARRRRR", "5M219D6M"});
        tests.add(new Object[]{"LLLLLTATTTTTTRRRRR", "LLLLLARRRRR", "5M7D6M"});

        // systematic testing
        for ( final int forwardMatches : Arrays.asList(0, 1, 5, 10)) {
            for ( final int forwardMismatches : Arrays.asList(0, 1, 2)) {
                for ( final int middleMatches : Arrays.asList(0, 1, 5, 10)) {
                    for ( final int delSize : Arrays.asList(0, 1, 2, 3 )) {
                        for ( final int insSize : Arrays.asList(0, 1, 2, 3 )) {
                            for ( final int reverseMismatches : Arrays.asList(0, 1, 2)) {
                                for ( final int reverseMatches : Arrays.asList(0, 1, 5, 10)) {
                                    // if there is an insertion and deletion, they should cancel each other out (at least partially)
                                    final int overlap = Math.min(delSize, insSize);
                                    final int myDelSize = delSize - overlap;
                                    final int myInsSize = insSize - overlap;

                                    // this case is too difficult to create a CIGAR for because SW will (legitimately) prefer to switch the indel and mismatches
                                    final int totalMismatches = forwardMismatches + reverseMismatches;
                                    if ( (myDelSize > 0 || myInsSize > 0 ) && (totalMismatches >= myDelSize || totalMismatches >= myInsSize) )
                                        continue;

                                    final StringBuilder refBuilder = new StringBuilder();
                                    final StringBuilder altBuilder = new StringBuilder();
                                    final StringBuilder cigarBuilder = new StringBuilder();

                                    // leading block: matches, mismatches (C in the alt), then more matches
                                    refBuilder.append(Utils.dupString('A', forwardMatches + forwardMismatches + middleMatches));
                                    altBuilder.append(Utils.dupString('A', forwardMatches));
                                    altBuilder.append(Utils.dupString('C', forwardMismatches));
                                    altBuilder.append(Utils.dupString('A', middleMatches));
                                    cigarBuilder.append(forwardMatches + forwardMismatches + middleMatches);
                                    cigarBuilder.append("M");

                                    // residual deletion: bases present only in the ref
                                    if ( myDelSize > 0 ) {
                                        refBuilder.append(Utils.dupString('G', myDelSize));
                                        cigarBuilder.append(myDelSize);
                                        cigarBuilder.append("D");
                                    }

                                    // residual insertion: bases present only in the alt
                                    if ( myInsSize > 0 ) {
                                        altBuilder.append(Utils.dupString('T', myInsSize));
                                        cigarBuilder.append(myInsSize);
                                        cigarBuilder.append("I");
                                    }

                                    // the cancelled-out part of the indels shows up as mismatching (G vs T) aligned bases
                                    if ( overlap > 0 ) {
                                        refBuilder.append(Utils.dupString('G', overlap));
                                        altBuilder.append(Utils.dupString('T', overlap));
                                        cigarBuilder.append(overlap);
                                        cigarBuilder.append("M");
                                    }

                                    // a second run of middle matches is only added when an indel was requested
                                    if ( delSize > 0 || insSize > 0 ) {
                                        refBuilder.append(Utils.dupString('A', middleMatches));
                                        altBuilder.append(Utils.dupString('A', middleMatches));
                                        cigarBuilder.append(middleMatches);
                                        cigarBuilder.append("M");
                                    }

                                    // trailing block: mismatches (C in the alt) followed by matches
                                    refBuilder.append(Utils.dupString('A', reverseMismatches + reverseMatches));
                                    altBuilder.append(Utils.dupString('C', reverseMismatches));
                                    altBuilder.append(Utils.dupString('A', reverseMatches));
                                    cigarBuilder.append(reverseMismatches + reverseMatches);
                                    cigarBuilder.append("M");

                                    // empty sequences are not valid test inputs
                                    if ( refBuilder.length() > 0 && altBuilder.length() > 0 )
                                        tests.add(new Object[]{refBuilder.toString(), altBuilder.toString(), cigarBuilder.toString()});
                                }
                            }
                        }
                    }
                }
            }
        }

        return tests.toArray(new Object[][]{});
    }

    /**
     * Aligns seq2 against seq1 and compares the resulting CIGAR with the consolidated form of the expected one.
     *
     * @param seq1          first sequence (reference)
     * @param seq2          second sequence (read / alt)
     * @param expectedCigar expected CIGAR string, consolidated before comparison
     */
    @Test(dataProvider = "SWData", enabled = !DEBUG)
    public void testSW(final String seq1, final String seq2, final String expectedCigar) {
        final GlobalEdgeGreedySWPairwiseAlignment alignment = new GlobalEdgeGreedySWPairwiseAlignment(seq1.getBytes(), seq2.getBytes(), new Parameters(5.0, -5.0, -25.0, -1.0));
        Assert.assertEquals(alignment.getCigar(), AlignmentUtils.consolidateCigar(TextCigarCodec.getSingleton().decode(expectedCigar)));
    }

    /**
     * For debugging purposes only
     */
    @Test(enabled = DEBUG)
    public void testDebugging() {
        final String ref = "A";
        final String alt = "C";
        final GlobalEdgeGreedySWPairwiseAlignment sw = new GlobalEdgeGreedySWPairwiseAlignment(ref.getBytes(), alt.getBytes(), new Parameters(5.0, -5.0, -25.0, -1.0));
        Assert.assertEquals(sw.getCigar().toString(), "1M");
    }
}

View File

@ -3,6 +3,7 @@ library(ggplot2)
library(gplots)
library(tools)
library(reshape)
library(plyr)
#
# Standard command line switch. Can be loaded interactively for development
@ -14,7 +15,7 @@ if ( onCMDLine ) {
inputFileName = args[1]
outputPDF = args[2]
} else {
inputFileName = "Q-26618@gsa4.jobreport.txt"
inputFileName = "~/Desktop/broadLocal/projects/pipelinePerformance/FullProcessingPipeline.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
outputPDF = NA
@ -35,13 +36,11 @@ allJobsFromReport <- function(report) {
#
# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
#
plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
plotJobsGantt <- function(gatkReport, sortOverall, title, includeText) {
allJobs = allJobsFromReport(gatkReport)
if ( sortOverall ) {
title = "All jobs, by analysis, by start time"
allJobs = allJobs[order(allJobs$analysisName, allJobs$startTime, decreasing=T), ]
} else {
title = "All jobs, sorted by start time"
allJobs = allJobs[order(allJobs$startTime, decreasing=T), ]
}
allJobs$index = 1:nrow(allJobs)
@ -54,11 +53,11 @@ plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
p <- p + theme_bw()
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm")))
if ( includeText )
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
p <- p + xlim(0, maxRelTime * 1.1)
p <- p + geom_text(aes(x=relStartTime, label=ganttName, hjust=0, vjust=-1), size=2)
p <- p + xlim(0, maxRelTime * 1.3)
p <- p + xlab(paste("Start time, relative to first job", RUNTIME_UNITS))
p <- p + ylab("Job number")
p <- p + opts(title=title)
p <- p + ggtitle(title)
print(p)
}
@ -182,6 +181,27 @@ plotTimeByHost <- function(gatkReportData) {
plotMe("Jittered points", geom_jitter)
}
# Collapse the rows of one analysis table into a single summary row per
# (analysisName, iteration) group, using plyr's ddply + summarize.
#
# table: a data frame that has (at least) the columns referenced below:
#        analysisName, iteration, exechosts, intermediate, startTime, runtime
# Returns a data frame with one row per group and the columns built below.
mergeScattersForAnalysis <- function(table) {
# jobName and exechosts are emitted so the merged rows still carry those columns;
# presumably the Gantt label is built from them downstream -- verify against allJobsFromReport
ddply(table, .(analysisName, iteration), summarize,
jobName = analysisName[1],
# NOTE(review): length(exechosts) counts the rows in the group, not the number of distinct hosts -- confirm intent
exechosts = paste(length(exechosts), "hosts"),
# the formatted times are placeholders; only the numeric times are recomputed
formattedStartTime = "NA",
formattedDoneTime = "NA",
intermediate = intermediate[1],
# the merged job starts at the earliest start in the group ...
startTime = min(startTime),
# ... and ends after the summed runtime of every row, i.e. as if the rows ran back to back
doneTime = min(startTime) + sum(runtime),
# runtime is summarized last, after doneTime has used the per-row runtimes
runtime = sum(runtime))
}
# Build a new report in which every named table of `report` has been collapsed
# by mergeScattersForAnalysis.
#
# report: a named list of analysis tables (e.g. the value of gsa.read.gatkreport)
# Returns a named list with the same names as `report`; an empty list when
# `report` has no names.
#
# Works purely from the `report` argument rather than the global gatkReportData,
# so it gives the right answer for whatever report the caller passes in.
mergeScatters <- function(report) {
  newReport = list()
  for ( name in names(report) ) {
    newReport[[name]] = mergeScattersForAnalysis(report[[name]])
  }
  newReport
}
# read the table
gatkReportData <- gsa.read.gatkreport(inputFileName)
@ -192,13 +212,24 @@ if ( ! is.na(outputPDF) ) {
pdf(outputPDF, height=8.5, width=11)
}
plotJobsGantt(gatkReportData, T, F)
plotJobsGantt(gatkReportData, F, F)
plotJobsGantt(gatkReportData, T, "All jobs, by analysis, by start time", F)
plotJobsGantt(gatkReportData, F, "All jobs, sorted by start time", F)
plotProgressByTime(gatkReportData)
# plots summarizing overall costs, merging scattered counts
merged.by.scatter = mergeScatters(gatkReportData)
plotJobsGantt(merged.by.scatter, F, "Jobs merged by scatter by start time", T)
merged.as.df = do.call(rbind.data.frame, merged.by.scatter)[,c("analysisName", "runtime")]
merged.as.df$percent = merged.as.df$runtime / sum(merged.as.df$runtime) * 100
merged.as.df.formatted = data.frame(analysisName=merged.as.df$analysisName,runtime=prettyNum(merged.as.df$runtime), percent=prettyNum(merged.as.df$percent,digits=2))
textplot(merged.as.df.formatted[order(merged.as.df$runtime),], show.rownames=F)
title("Total runtime for each analysis")
plotTimeByHost(gatkReportData)
for ( group in gatkReportData ) {
print(group)
plotGroup(group)
#print(group)
plotGroup(group)
}
if ( ! is.na(outputPDF) ) {

View File

@ -570,9 +570,9 @@ public class GenomeAnalysisEngine {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
if(intervals == null)
return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer());
return readsDataSource.createShardIteratorOverMappedReads(new ActiveRegionShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer());
return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer());
}
else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
// Apply special validation to read pair walkers.

View File

@ -0,0 +1,85 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
/**
 * ActiveRegionShardBalancer
 *
 * Merges all of the file pointer information for a single contig index into a single
 * combined shard.  The purpose of doing this is to ensure that the HaplotypeCaller, which
 * doesn't support TreeReduction by construction, gets all of the data on a single
 * contig together so that the NanoScheduler runs efficiently
 */
public class ActiveRegionShardBalancer extends ShardBalancer {
    /**
     * Convert iterators of file pointers into balanced iterators of shards.
     * @return An iterator over balanced shards.
     */
    public Iterator<Shard> iterator() {
        return new Iterator<Shard>() {
            @Override
            public boolean hasNext() {
                return filePointers.hasNext();
            }

            /**
             * Builds the next shard from every consecutive file pointer that belongs together
             * (see getCombinedFilePointersOnSingleContig).
             *
             * @throws java.util.NoSuchElementException if there are no file pointers left
             */
            @Override
            public Shard next() {
                // honor the Iterator contract explicitly instead of relying on how the
                // underlying file pointer iterator behaves once it is exhausted
                if ( ! filePointers.hasNext() )
                    throw new java.util.NoSuchElementException("No more file pointers are available to build a shard from");

                final FilePointer combined = getCombinedFilePointersOnSingleContig();
                return new LocusShard(parser, readsDataSource, combined.getLocations(), combined.fileSpans);
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
            }
        };
    }

    /**
     * Combine all of the file pointers in the filePointers iterator into a single combined
     * FilePointer that spans all of the file pointers on a single contig
     *
     * Starting from the next file pointer, keeps consuming file pointers for as long as the upcoming
     * one has the same mapped / unmapped state as the first and either sits on the same contig index
     * or the first one is unmapped.
     *
     * @return a non-null FilePointer
     */
    private FilePointer getCombinedFilePointersOnSingleContig() {
        final FilePointer first = filePointers.next();

        final List<FilePointer> toCombine = new LinkedList<>();
        toCombine.add(first);

        while ( filePointers.hasNext() ) {
            final FilePointer upcoming = filePointers.peek();

            // never mix mapped and unmapped regions in one shard
            if ( first.isRegionUnmapped != upcoming.isRegionUnmapped )
                break;
            // mapped regions are only merged while they stay on the same contig
            if ( first.getContigIndex() != upcoming.getContigIndex() && ! first.isRegionUnmapped )
                break;

            toCombine.add(filePointers.next());
        }

        return FilePointer.union(toCombine, parser);
    }
}

View File

@ -407,10 +407,10 @@ public class FilePointer {
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("FilePointer:%n");
builder.append("FilePointer:\n");
builder.append("\tlocations = {");
builder.append(Utils.join(";",locations));
builder.append("}%n\tregions = %n");
builder.append("}\n\tregions = \n");
for(Map.Entry<SAMReaderID,SAMFileSpan> entry: fileSpans.entrySet()) {
builder.append(entry.getKey());
builder.append("= {");

View File

@ -245,7 +245,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
} else if (walker instanceof ReadPairWalker) {
return new TraverseReadPairs();
} else if (walker instanceof ActiveRegionWalker) {
return new TraverseActiveRegions();
return new TraverseActiveRegions(threadAllocation.getNumCPUThreadsPerDataThread());
} else {
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
}

View File

@ -41,12 +41,22 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.activeregion.*;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileState;
import org.broadinstitute.sting.utils.activeregion.BandPassActivityProfile;
import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
import java.util.*;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
/**
* Implement active region traversal
@ -67,7 +77,8 @@ import java.util.*;
* variable spanOfLastReadSeen
*
*/
public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegionWalker<M,T>,LocusShardDataProvider> {
public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegionWalker<M,T>,LocusShardDataProvider> {
private final static boolean DEBUG = false;
protected final static Logger logger = Logger.getLogger(TraversalEngine.class);
protected final static boolean LOG_READ_CARRYING = false;
@ -84,7 +95,32 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
private GenomeLoc spanOfLastReadSeen = null;
private ActivityProfile activityProfile = null;
int maxReadsInMemory = 0;
ActiveRegionWalker walker;
ActiveRegionWalker<M, T> walker;
final NanoScheduler<ActiveRegion, M, T> nanoScheduler;
/**
 * Create a single threaded active region traverser
 *
 * Equivalent to calling the int constructor with nThreads == 1
 */
public TraverseActiveRegions() {
this(1);
}
/**
 * Create an active region traverser whose work is executed by a NanoScheduler built with nThreads
 *
 * Also installs a progress callback on the scheduler that reports the stop location of the
 * most recently handled active region.
 *
 * @param nThreads number of threads handed to the NanoScheduler
 */
public TraverseActiveRegions(final int nThreads) {
    nanoScheduler = new NanoScheduler<>(nThreads);

    final NSProgressFunction<ActiveRegion> progressReporter = new NSProgressFunction<ActiveRegion>() {
        @Override
        public void progress(final ActiveRegion mostRecentRegion) {
            if ( mostRecentRegion == null )
                return;
            // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon
            printProgress(mostRecentRegion.getLocation().getStopLocation());
        }
    };
    nanoScheduler.setProgressFunction(progressReporter);
}
/**
* Have the debugging output streams been initialized already?
@ -98,7 +134,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) {
super.initialize(engine, walker, progressMeter);
this.walker = (ActiveRegionWalker)walker;
this.walker = (ActiveRegionWalker<M,T>)walker;
if ( this.walker.wantsExtendedReads() && ! this.walker.wantsNonPrimaryReads() ) {
throw new IllegalArgumentException("Active region walker " + this.walker + " requested extended events but not " +
"non-primary reads, an inconsistent state. Please modify the walker");
@ -217,58 +253,108 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
if ( LOG_READ_CARRYING || logger.isDebugEnabled() )
logger.info(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
final LocusView locusView = new AllLocusView(dataProvider);
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
// We keep processing while the next reference location is within the interval
final GenomeLoc locOfLastReadAtTraversalStart = spanOfLastSeenRead();
while( locusView.hasNext() ) {
final AlignmentContext locus = locusView.next();
final GenomeLoc location = locus.getLocation();
rememberLastLocusLocation(location);
// get all of the new reads that appear in the current pileup, and them to our list of reads
// provided we haven't seen them before
final Collection<GATKSAMRecord> reads = locusView.getLIBS().transferReadsFromAllPreviousPileups();
for( final GATKSAMRecord read : reads ) {
if ( ! appearedInLastShard(locOfLastReadAtTraversalStart, read) ) {
rememberLastReadLocation(read);
myReads.add(read);
}
}
// skip this location -- it's not part of our engine intervals
if ( outsideEngineIntervals(location) )
continue;
// we've move across some interval boundary, restart profile
final boolean flushProfile = ! activityProfile.isEmpty()
&& ( activityProfile.getContigIndex() != location.getContigIndex()
|| location.getStart() != activityProfile.getStop() + 1);
sum = processActiveRegions(walker, sum, flushProfile, false);
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
final ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later
addIsActiveResult(walker, tracker, refContext, locus);
maxReadsInMemory = Math.max(myReads.size(), maxReadsInMemory);
printProgress(location);
}
nanoScheduler.setDebug(false);
final Iterator<ActiveRegion> activeRegionIterator = new ActiveRegionIterator(dataProvider);
final TraverseActiveRegionMap myMap = new TraverseActiveRegionMap();
final TraverseActiveRegionReduce myReduce = new TraverseActiveRegionReduce();
final T result = nanoScheduler.execute(activeRegionIterator, myMap, sum, myReduce);
updateCumulativeMetrics(dataProvider.getShard());
return sum;
return result;
}
/**
 * Lazily walks the loci of one shard and hands out the active regions that become ready for processing.
 *
 * hasNext() advances the locus view until at least one region is ready (or the view is exhausted);
 * next() pops regions from the queue that hasNext() filled.  Regions still pending when the
 * view runs out are not produced here.
 */
private class ActiveRegionIterator implements Iterator<ActiveRegion> {
    private final LocusShardDataProvider dataProvider;
    private final LinkedList<ActiveRegion> readyActiveRegions = new LinkedList<ActiveRegion>();
    // set once the locus view has been exhausted, so later hasNext() calls return immediately
    private boolean done = false;
    private final LocusView locusView;
    private final LocusReferenceView referenceView;
    private final ReferenceOrderedView referenceOrderedDataView;
    private final GenomeLoc locOfLastReadAtTraversalStart;

    public ActiveRegionIterator( final LocusShardDataProvider dataProvider ) {
        this.dataProvider = dataProvider;
        locusView = new AllLocusView(dataProvider);
        referenceView = new LocusReferenceView( walker, dataProvider );
        referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);

        // We keep processing while the next reference location is within the interval
        locOfLastReadAtTraversalStart = spanOfLastSeenRead();
    }

    @Override public void remove() { throw new UnsupportedOperationException("Cannot remove from ActiveRegionIterator"); }

    /**
     * @return the next ready active region
     * @throws java.util.NoSuchElementException if no further active region can be produced
     */
    @Override
    public ActiveRegion next() {
        // go through hasNext() so that next() also works when the queue is currently empty
        // but more regions can still be produced from the remaining loci
        if ( ! hasNext() )
            throw new java.util.NoSuchElementException("No more active regions are available from this shard");
        return readyActiveRegions.pop();
    }

    @Override
    public boolean hasNext() {
        if ( ! readyActiveRegions.isEmpty() )
            return true;
        if ( done )
            return false;

        while( locusView.hasNext() ) {
            final AlignmentContext locus = locusView.next();
            final GenomeLoc location = locus.getLocation();

            rememberLastLocusLocation(location);

            // get all of the new reads that appear in the current pileup, and add them to our list of reads
            // provided we haven't seen them before
            final Collection<GATKSAMRecord> reads = locusView.getLIBS().transferReadsFromAllPreviousPileups();
            for( final GATKSAMRecord read : reads ) {
                // note that ActiveRegionShards span entire contigs, so this check is in some
                // sense no longer necessary, as any read that appeared in the last shard would now
                // by definition be on a different contig.  However, the logic here doesn't hurt anything
                // and makes us robust should we decide to provide shards that don't fully span
                // contigs at some point in the future
                if ( ! appearedInLastShard(locOfLastReadAtTraversalStart, read) ) {
                    rememberLastReadLocation(read);
                    myReads.add(read);
                }
            }

            // skip this location -- it's not part of our engine intervals
            if ( outsideEngineIntervals(location) )
                continue;

            // we've moved across some interval boundary, restart profile
            final boolean flushProfile = ! activityProfile.isEmpty()
                    && ( activityProfile.getContigIndex() != location.getContigIndex()
                    || location.getStart() != activityProfile.getStop() + 1);
            final List<ActiveRegion> newActiveRegions = prepActiveRegionsForProcessing(walker, flushProfile, false);

            dataProvider.getShard().getReadMetrics().incrementNumIterations();

            // create reference context. Note that if we have a pileup of "extended events", the context will
            // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
            final ReferenceContext refContext = referenceView.getReferenceContext(location);

            // Iterate forward to get all reference ordered data covering this location
            final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);

            // Call the walkers isActive function for this locus and add them to the list to be integrated later
            addIsActiveResult(walker, tracker, refContext, locus);

            maxReadsInMemory = Math.max(myReads.size(), maxReadsInMemory);
            printProgress(location);

            if ( ! newActiveRegions.isEmpty() ) {
                readyActiveRegions.addAll(newActiveRegions);
                if ( DEBUG )
                    for ( final ActiveRegion region : newActiveRegions )
                        logger.info("Adding region to queue for processing " + region);
                return true;
            }
        }

        // the locus view is exhausted and nothing became ready: remember that so we never poll it again
        done = true;
        return false;
    }
}
/**
@ -276,7 +362,11 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
* Ugly for now but will be cleaned up when we push this functionality more into the engine
*/
public T endTraversal(final Walker<M, T> walker, T sum) {
return processActiveRegions((ActiveRegionWalker<M, T>)walker, sum, true, true);
for ( final ActiveRegion region : prepActiveRegionsForProcessing((ActiveRegionWalker<M, T>)walker, true, true) ) {
final M x = ((ActiveRegionWalker<M, T>) walker).map(region, null);
sum = walker.reduce( x, sum );
}
return sum;
}
// -------------------------------------------------------------------------------------
@ -438,7 +528,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
// note that start and stop are 0 based, but the stop is exclusive so we don't subtract 1
out.printf("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart() - 1, loc.getStop(), featureName);
for ( final double value : values )
out.print(String.format("\t%.3f", value));
out.print(String.format("\t%.5f", value));
out.println();
}
@ -504,7 +594,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
* add these blocks of work to the work queue
* band-pass filter the list of isActive probabilities and turn into active regions
*/
private T processActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean flushActivityProfile, final boolean forceAllRegionsToBeActive) {
private List<ActiveRegion> prepActiveRegionsForProcessing(final ActiveRegionWalker<M, T> walker, final boolean flushActivityProfile, final boolean forceAllRegionsToBeActive) {
if ( ! walkerHasPresetRegions ) {
// We don't have preset regions, so we get our regions from the activity profile
final Collection<ActiveRegion> activeRegions = activityProfile.popReadyActiveRegions(getActiveRegionExtension(), getMinRegionSize(), getMaxRegionSize(), flushActivityProfile);
@ -513,21 +603,23 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
}
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
final LinkedList<ActiveRegion> readyRegions = new LinkedList<ActiveRegion>();
while( workQueue.peek() != null ) {
final ActiveRegion activeRegion = workQueue.peek();
if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) {
writeActivityProfile(activeRegion.getSupportingStates());
writeActiveRegion(activeRegion);
sum = processActiveRegion( workQueue.remove(), sum, walker );
readyRegions.add(prepActiveRegionForProcessing(workQueue.remove(), walker));
} else {
break;
}
}
return sum;
return readyRegions;
}
private T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {
private ActiveRegion prepActiveRegionForProcessing(final ActiveRegion activeRegion, final ActiveRegionWalker<M, T> walker) {
final List<GATKSAMRecord> stillLive = new LinkedList<GATKSAMRecord>();
for ( final GATKSAMRecord read : myReads.popCurrentReads() ) {
boolean killed = false;
@ -561,7 +653,21 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
logger.info(String.format("Processing region %20s span=%3d active?=%5b with %4d reads. Overall max reads carried is %s",
activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive(), activeRegion.size(), maxReadsInMemory));
final M x = walker.map(activeRegion, null);
return walker.reduce( x, sum );
return activeRegion;
}
/**
 * Map step handed to the NanoScheduler: applies the walker's map function to one active region.
 */
private class TraverseActiveRegionMap implements NSMapFunction<ActiveRegion, M> {
    @Override
    public M apply(final ActiveRegion activeRegion) {
        if ( DEBUG ) {
            final String threadName = Thread.currentThread().getName();
            logger.info("Executing walker.map for " + activeRegion + " in thread " + threadName);
        }

        final M mapped = walker.map(activeRegion, null);
        return mapped;
    }
}
/**
 * Reduce step handed to the NanoScheduler: folds one map result into the running sum
 * using the walker's reduce function.
 */
private class TraverseActiveRegionReduce implements NSReduceFunction<M, T> {
    @Override
    public T apply(M one, T sum) {
        final T updatedSum = walker.reduce(one, sum);
        return updatedSum;
    }
}
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMProgramRecord;
import org.apache.log4j.Logger;
@ -748,4 +749,60 @@ public class Utils {
if ( suffix == null ) throw new IllegalArgumentException("suffix cannot be null");
return new String(big).endsWith(new String(suffix));
}
/**
 * Count how many leading bytes seq1 and seq2 have in common, looking at no more than maxLength bytes
 *
 * @param seq1 non-null byte array
 * @param seq2 non-null byte array
 * @param maxLength upper bound on the value returned, must be >= 0
 * @return the length of the longest common prefix of seq1 and seq2, >= 0 and never more than
 *         maxLength or the length of the shorter array
 * @throws IllegalArgumentException if seq1 or seq2 is null, or maxLength is negative
 */
public static int longestCommonPrefix(final byte[] seq1, final byte[] seq2, final int maxLength) {
    if ( seq1 == null ) throw new IllegalArgumentException("seq1 is null");
    if ( seq2 == null ) throw new IllegalArgumentException("seq2 is null");
    if ( maxLength < 0 ) throw new IllegalArgumentException("maxLength < 0 " + maxLength);

    final int limit = Math.min(maxLength, Math.min(seq1.length, seq2.length));

    // walk forward from the first byte until the sequences disagree or the limit is hit
    int matched = 0;
    while ( matched < limit && seq1[matched] == seq2[matched] )
        matched++;
    return matched;
}
/**
 * Count how many trailing bytes seq1 and seq2 have in common, looking at no more than maxLength bytes
 *
 * @param seq1 non-null byte array
 * @param seq2 non-null byte array
 * @param maxLength upper bound on the value returned, must be >= 0
 * @return the length of the longest common suffix of seq1 and seq2, >= 0 and never more than
 *         maxLength or the length of the shorter array
 * @throws IllegalArgumentException if seq1 or seq2 is null, or maxLength is negative
 */
public static int longestCommonSuffix(final byte[] seq1, final byte[] seq2, final int maxLength) {
    if ( seq1 == null ) throw new IllegalArgumentException("seq1 is null");
    if ( seq2 == null ) throw new IllegalArgumentException("seq2 is null");
    if ( maxLength < 0 ) throw new IllegalArgumentException("maxLength < 0 " + maxLength);

    final int limit = Math.min(maxLength, Math.min(seq1.length, seq2.length));
    final int last1 = seq1.length - 1;
    final int last2 = seq2.length - 1;

    // walk backwards from the last byte of each array until they disagree or the limit is hit
    int matched = 0;
    while ( matched < limit && seq1[last1 - matched] == seq2[last2 - matched] )
        matched++;
    return matched;
}
/**
 * Trim any number of bases from the front and/or back of an array
 *
 * @param seq the sequence to trim, must not be null
 * @param trimFromFront how much to trim from the front, must be >= 0
 * @param trimFromBack how much to trim from the back, must be >= 0
 * @return a non-null array; the original array itself (i.e. not a copy) when nothing is trimmed
 * @throws IllegalArgumentException if seq is null, if either trim amount is negative, or if the
 *                                  two trim amounts together exceed the length of seq
 */
public static byte[] trimArray(final byte[] seq, final int trimFromFront, final int trimFromBack) {
    if ( seq == null ) throw new IllegalArgumentException("seq is null");
    // a negative amount would otherwise slip past the total check and make copyOfRange
    // either fail with an unrelated exception or silently pad the result with zeros
    if ( trimFromFront < 0 ) throw new IllegalArgumentException("trimFromFront < 0 " + trimFromFront);
    if ( trimFromBack < 0 ) throw new IllegalArgumentException("trimFromBack < 0 " + trimFromBack);
    // compare by subtraction so two very large trim amounts cannot overflow an int sum
    if ( trimFromFront > seq.length - trimFromBack )
        throw new IllegalArgumentException("trimming total is larger than the original array");

    // nothing to trim: hand back the original array rather than paying for a copy
    if ( trimFromFront == 0 && trimFromBack == 0 )
        return seq;

    return Arrays.copyOfRange(seq, trimFromFront, seq.length - trimFromBack);
}
}

View File

@ -248,4 +248,62 @@ public class JVMUtils {
interfaces.add(interfaceClass.getSimpleName());
return Utils.join(", ", interfaces);
}
/**
 * Returns the Class that invoked the specified "callee" class by examining the runtime stack.
 * The calling class is defined as the first class below the callee class on the stack.
 *
 * For example, given callee == MyClass and the following runtime stack:
 *
 * JVMUtils.getCallingClass(MyClass) <-- top
 * MyClass.foo()
 * MyClass.bar()
 * OtherClass.foo()
 * OtherClass.bar()
 * etc.
 *
 * this method would return OtherClass, since its methods invoked the methods in MyClass.
 *
 * Considers only the occurrence of the callee class on the stack that is closest to the top
 * (even if there are multiple, non-contiguous occurrences).
 *
 * @param callee Class object for the class whose calling class we want to locate, must not be null
 * @return Class object for the class that invoked the callee class, or null if
 *         no calling class was found
 * @throws IllegalArgumentException if callee is null or the callee class is not found on the runtime stack
 * @throws IllegalStateException if we get an error while trying to load the Class object for the calling
 *                               class reported on the runtime stack; the original
 *                               ClassNotFoundException is attached as the cause
 */
public static Class getCallingClass( final Class callee ) {
    if ( callee == null ) throw new IllegalArgumentException("callee cannot be null");

    final StackTraceElement[] stackTrace = new Throwable().getStackTrace();
    final String calleeClassName = callee.getName();

    // Start examining the stack at the second-from-the-top position, to remove
    // this method call (ie., the call to getCallingClass() itself) from consideration.
    int stackTraceIndex = 1;

    // Find the first occurrence of the callee on the runtime stack. Need to use String comparison
    // unfortunately, due to limitations of the StackTraceElement class.
    while ( stackTraceIndex < stackTrace.length && ! stackTrace[stackTraceIndex].getClassName().equals(calleeClassName) ) {
        stackTraceIndex++;
    }

    // Make sure we actually found the callee class on the stack
    if ( stackTraceIndex == stackTrace.length ) {
        throw new IllegalArgumentException(String.format("Specified callee %s is not present on the call stack", callee.getSimpleName()));
    }

    // Now find the caller class, which will be the class below the callee on the stack
    while ( stackTraceIndex < stackTrace.length && stackTrace[stackTraceIndex].getClassName().equals(calleeClassName) ) {
        stackTraceIndex++;
    }

    // The callee frames reached the bottom of the stack: nothing called it
    if ( stackTraceIndex >= stackTrace.length ) {
        return null;
    }

    final String callerClassName = stackTrace[stackTraceIndex].getClassName();
    try {
        return Class.forName(callerClassName);
    }
    catch ( ClassNotFoundException e ) {
        // keep the underlying exception as the cause so the class loading failure stays diagnosable
        throw new IllegalStateException(String.format("Could not find caller class %s from the runtime stack in the classpath",
                callerClassName), e);
    }
}
}

View File

@ -0,0 +1,217 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.smithwaterman;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import java.util.*;
/**
* Pairwise discrete Smith-Waterman alignment with an edge greedy implementation
*
* ************************************************************************
* **** IMPORTANT NOTE: ****
* **** This class assumes that all bytes come from UPPERCASED chars! ****
* ************************************************************************
*
* User: ebanks
*/
public final class GlobalEdgeGreedySWPairwiseAlignment extends SWPairwiseAlignment {
private final static boolean DEBUG_MODE = false;
/**
 * Create a new greedy SW pairwise aligner
 *
 * All three arguments are passed unchanged to the SWPairwiseAlignment constructor.
 *
 * @param reference the reference sequence we want to align
 * @param alternate the alternate sequence we want to align
 * @param parameters the SW parameters to use
 */
public GlobalEdgeGreedySWPairwiseAlignment(final byte[] reference, final byte[] alternate, final Parameters parameters) {
super(reference, alternate, parameters);
}
/**
 * Create a new greedy SW pairwise aligner from a named parameter set
 *
 * After creating the object the two sequences are aligned with an internal call to align(seq1, seq2)
 *
 * @param reference the reference sequence we want to align
 * @param alternate the alternate sequence we want to align
 * @param namedParameters the named parameter set to get our parameters from; its
 *                        {@code parameters} field is what is handed to the Parameters-based constructor
 */
public GlobalEdgeGreedySWPairwiseAlignment(final byte[] reference, final byte[] alternate, final SWParameterSet namedParameters) {
this(reference, alternate, namedParameters.parameters);
}
/**
 * Create a new greedy SW pairwise aligner that uses the {@link SWParameterSet#ORIGINAL_DEFAULT} parameters
 *
 * @param reference the reference sequence we want to align
 * @param alternate the alternate sequence we want to align
 * @see #GlobalEdgeGreedySWPairwiseAlignment(byte[], byte[], SWParameterSet)
 */
public GlobalEdgeGreedySWPairwiseAlignment(byte[] reference, byte[] alternate) {
this(reference, alternate, SWParameterSet.ORIGINAL_DEFAULT);
}
/**
 * Aligns the alternate sequence to the reference sequence
 *
 * The bases shared at the start and at the end of the two sequences are set aside first; only the
 * remaining middle parts go through the scoring matrix calculation.  When the shared edges
 * already account for all of one sequence, the CIGAR is built directly from the edge sizes.
 * The outcome is stored in alignmentResult.
 *
 * @param reference ref sequence, non-null and non-empty
 * @param alternate alt sequence, non-null and non-empty
 * @throws IllegalArgumentException if either sequence is null or empty
 */
@Override
protected void align(final byte[] reference, final byte[] alternate) {
    if ( reference == null || reference.length == 0 )
        throw new IllegalArgumentException("Non-null, non-empty reference sequences are required for the Smith-Waterman calculation");
    if ( alternate == null || alternate.length == 0 )
        throw new IllegalArgumentException("Non-null, non-empty alternate sequences are required for the Smith-Waterman calculation");

    // edge case: one sequence is a strict prefix of the other
    final int prefixMatch = Utils.longestCommonPrefix(reference, alternate, Integer.MAX_VALUE);
    final boolean prefixCoversASequence = prefixMatch == reference.length || prefixMatch == alternate.length;
    if ( prefixCoversASequence ) {
        alignmentResult = new SWPairwiseAlignmentResult(makeCigarForStrictPrefixAndSuffix(reference, alternate, prefixMatch, 0), 0);
        return;
    }

    // edge case: one sequence is a strict suffix of the other
    int suffixMatch = Utils.longestCommonSuffix(reference, alternate, Integer.MAX_VALUE);
    final boolean suffixCoversASequence = suffixMatch == reference.length || suffixMatch == alternate.length;
    if ( suffixCoversASequence ) {
        alignmentResult = new SWPairwiseAlignmentResult(makeCigarForStrictPrefixAndSuffix(reference, alternate, 0, suffixMatch), 0);
        return;
    }

    final int refCoreLength = reference.length - prefixMatch - suffixMatch;
    final int altCoreLength = alternate.length - prefixMatch - suffixMatch;

    // edge case: one sequence is a strict subset of the other accounting for both prefix and suffix.
    // A negative core length means prefix and suffix overlap, so give the overlap back from the suffix.
    final int shortestCore = Math.min(refCoreLength, altCoreLength);
    if ( shortestCore < 0 )
        suffixMatch += shortestCore;
    if ( refCoreLength <= 0 || altCoreLength <= 0 ) {
        alignmentResult = new SWPairwiseAlignmentResult(makeCigarForStrictPrefixAndSuffix(reference, alternate, prefixMatch, suffixMatch), 0);
        return;
    }

    final byte[] refCore = Utils.trimArray(reference, prefixMatch, suffixMatch);
    final byte[] altCore = Utils.trimArray(alternate, prefixMatch, suffixMatch);

    // both matrices are flattened (refCoreLength + 1) x (altCoreLength + 1) arrays
    final int matrixSize = (refCoreLength + 1) * (altCoreLength + 1);
    final double[] scores = new double[matrixSize];
    if ( keepScoringMatrix ) SW = scores;
    final int[] backtrack = new int[matrixSize];

    calculateMatrix(refCore, altCore, scores, backtrack, OVERHANG_STRATEGY.INDEL);

    if ( DEBUG_MODE ) {
        System.out.println(new String(refCore) + " vs. " + new String(altCore));
        debugMatrix(scores, refCoreLength + 1, altCoreLength + 1);
        System.out.println("----");
        debugMatrix(backtrack, refCoreLength + 1, altCoreLength + 1);
        System.out.println();
    }

    alignmentResult = calculateCigar(prefixMatch, suffixMatch, refCoreLength, altCoreLength, scores, backtrack);
}
/**
 * Prints a linearized, row-major matrix of doubles to stdout, one matrix row per output line,
 * each value formatted with one decimal place and followed by a space.
 *
 * @param matrix the linearized matrix
 * @param dim1   number of rows
 * @param dim2   number of columns
 */
private void debugMatrix(final double[] matrix, final int dim1, final int dim2) {
    int cell = 0; // running index, equal to row * dim2 + col
    for ( int row = 0; row < dim1; row++ ) {
        for ( int col = 0; col < dim2; col++, cell++ ) {
            System.out.printf("%.1f ", matrix[cell]);
        }
        System.out.println();
    }
}
/**
 * Prints a linearized, row-major matrix of ints to stdout, one matrix row per output line,
 * each value followed by a space.
 *
 * @param matrix the linearized matrix
 * @param dim1   number of rows
 * @param dim2   number of columns
 */
private void debugMatrix(final int[] matrix, final int dim1, final int dim2) {
    for ( int row = 0; row < dim1; row++ ) {
        final int rowStart = row * dim2;
        for ( int col = 0; col < dim2; col++ ) {
            System.out.print(matrix[rowStart + col] + " ");
        }
        System.out.println();
    }
}
/**
 * Builds the CIGAR directly (no matrix needed) for the case where the matching prefix and suffix
 * between them cover one of the sequences completely.
 *
 * Equal-length sequences yield a single M element of size prefix + suffix. Otherwise the result is
 * [prefix M] + one I (alternate longer) or D (reference longer) sized by the length difference +
 * [suffix M], where the M elements are omitted when their size is zero.
 *
 * @param reference      the reference sequence
 * @param alternate      the alternate sequence
 * @param matchingPrefix the prefix match size
 * @param matchingSuffix the suffix match size
 * @return non-null CIGAR
 */
private Cigar makeCigarForStrictPrefixAndSuffix(final byte[] reference, final byte[] alternate, final int matchingPrefix, final int matchingSuffix) {
    final List<CigarElement> elements = new ArrayList<CigarElement>();
    final int lengthDelta = alternate.length - reference.length;

    // same length means there is no room for an indel: everything is one match block
    if ( lengthDelta == 0 ) {
        elements.add(makeElement(State.MATCH, matchingPrefix + matchingSuffix));
        return new Cigar(elements);
    }

    if ( matchingPrefix > 0 ) {
        elements.add(makeElement(State.MATCH, matchingPrefix));
    }

    // extra alternate bases are an insertion; missing ones are a deletion
    final State indelState = lengthDelta > 0 ? State.INSERTION : State.DELETION;
    elements.add(makeElement(indelState, Math.abs(lengthDelta)));

    if ( matchingSuffix > 0 ) {
        elements.add(makeElement(State.MATCH, matchingSuffix));
    }

    return new Cigar(elements);
}
/**
 * Calculates the CIGAR for the trimmed (middle) alignment from the back track matrix and then
 * re-attaches the matching prefix and suffix as M elements on either side.
 *
 * The middle alignment is computed with the INDEL overhang strategy; the combined CIGAR is
 * consolidated before being returned, and the reported alignment offset is always 0.
 *
 * @param matchingPrefix the prefix match size
 * @param matchingSuffix the suffix match size
 * @param refLength      length of the (trimmed) reference sequence
 * @param altLength      length of the (trimmed) alternate sequence
 * @param sw             the Smith-Waterman matrix to use
 * @param btrack         the back track matrix to use
 * @return non-null SWPairwiseAlignmentResult object
 */
protected SWPairwiseAlignmentResult calculateCigar(final int matchingPrefix, final int matchingSuffix,
                                                   final int refLength, final int altLength,
                                                   final double[] sw, final int[] btrack) {
    final List<CigarElement> middle =
            calculateCigar(refLength, altLength, sw, btrack, OVERHANG_STRATEGY.INDEL).cigar.getCigarElements();

    // assemble prefix + middle + suffix in order
    final List<CigarElement> flanked = new ArrayList<CigarElement>(middle.size() + 2);
    if ( matchingPrefix > 0 ) {
        flanked.add(makeElement(State.MATCH, matchingPrefix));
    }
    flanked.addAll(middle);
    if ( matchingSuffix > 0 ) {
        flanked.add(makeElement(State.MATCH, matchingSuffix));
    }

    final Cigar consolidated = AlignmentUtils.consolidateCigar(new Cigar(flanked));
    return new SWPairwiseAlignmentResult(consolidated, 0);
}
}

View File

@ -45,19 +45,43 @@ import java.util.*;
* Date: Mar 23, 2009
* Time: 1:54:54 PM
*/
public final class SWPairwiseAlignment {
private int alignment_offset; // offset of s2 w/respect to s1
private Cigar alignmentCigar;
public class SWPairwiseAlignment implements SmithWaterman {
private final Parameters parameters;
protected SWPairwiseAlignmentResult alignmentResult;
private static final int MSTATE = 0;
private static final int ISTATE = 1;
private static final int DSTATE = 2;
private static final int CLIP = 3;
protected final Parameters parameters;
/**
* The state of a trace step through the matrix
*
* makeElement() translates each state into the CIGAR operator noted on the constants below.
*/
protected enum State {
/** aligned position; emitted as CigarOperator.M */
MATCH,
/** insertion; emitted as CigarOperator.I */
INSERTION,
/** deletion; emitted as CigarOperator.D */
DELETION,
/** soft-clipped overhang; emitted as CigarOperator.S */
CLIP
}
/**
* What strategy should we use when the best path does not start/end at the corners of the matrix?
*/
public enum OVERHANG_STRATEGY {
/**
* Add softclips for the overhangs (overhanging bases are emitted as CLIP elements in the CIGAR)
*/
SOFTCLIP,
/**
* Treat the overhangs as proper insertions/deletions (the backtrack starts from the corner of the matrix,
* and the first row/column of the SW matrix are initialized with gap penalties)
*/
INDEL,
/**
* Just ignore the overhangs (the leading overhang is folded into the first CIGAR segment and the
* alignment offset is adjusted to compensate)
*/
IGNORE
}
protected static boolean cutoff = false;
private static boolean DO_SOFTCLIP = true;
protected OVERHANG_STRATEGY overhang_strategy = OVERHANG_STRATEGY.SOFTCLIP;
/**
* The SW scoring matrix, stored for debugging purposes if keepScoringMatrix is true
@ -90,10 +114,19 @@ public final class SWPairwiseAlignment {
* @param parameters the SW parameters to use
*/
public SWPairwiseAlignment(byte[] seq1, byte[] seq2, Parameters parameters) {
this.parameters = parameters;
this(parameters);
align(seq1,seq2);
}
/**
* Create a new SW pairwise aligner, without actually doing any alignment yet
*
* This constructor only stores the parameters; the (seq1, seq2, parameters) constructor chains to it
* and then calls align() itself.
*
* @param parameters the SW parameters to use
*/
protected SWPairwiseAlignment(Parameters parameters) {
this.parameters = parameters;
}
/**
* Create a new SW pairwise aligner
*
@ -111,41 +144,94 @@ public final class SWPairwiseAlignment {
this(seq1,seq2,SWParameterSet.ORIGINAL_DEFAULT);
}
public Cigar getCigar() { return alignmentCigar ; }
@Override
public Cigar getCigar() { return alignmentResult.cigar ; }
public int getAlignmentStart2wrt1() { return alignment_offset; }
@Override
public int getAlignmentStart2wrt1() { return alignmentResult.alignment_offset; }
public void align(final byte[] a, final byte[] b) {
final int n = a.length;
final int m = b.length;
/**
* Aligns the alternate sequence to the reference sequence
*
* @param reference ref sequence
* @param alternate alt sequence
*/
protected void align(final byte[] reference, final byte[] alternate) {
if ( reference == null || reference.length == 0 || alternate == null || alternate.length == 0 )
throw new IllegalArgumentException("Non-null, non-empty sequences are required for the Smith-Waterman calculation");
final int n = reference.length;
final int m = alternate.length;
double [] sw = new double[(n+1)*(m+1)];
if ( keepScoringMatrix ) SW = sw;
int [] btrack = new int[(n+1)*(m+1)];
calculateMatrix(a, b, sw, btrack);
calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions)
calculateMatrix(reference, alternate, sw, btrack);
alignmentResult = calculateCigar(n, m, sw, btrack, overhang_strategy); // length of the segment (continuous matches, insertions or deletions)
}
/**
* Calculates the SW matrices for the given sequences, using this aligner's own overhang strategy
* (the overhang_strategy field)
*
* @param reference ref sequence
* @param alternate alt sequence
* @param sw the Smith-Waterman matrix to populate
* @param btrack the back track matrix to populate
*/
protected void calculateMatrix(final byte[] reference, final byte[] alternate, double[] sw, int[] btrack) {
// delegate to the overload that takes the strategy explicitly
calculateMatrix(reference, alternate, sw, btrack, overhang_strategy);
}
private void calculateMatrix(final byte[] a, final byte[] b, double [] sw, int [] btrack ) {
final int n = a.length+1;
final int m = b.length+1;
/**
* Calculates the SW matrices for the given sequences
*
* @param reference ref sequence
* @param alternate alt sequence
* @param sw the Smith-Waterman matrix to populate
* @param btrack the back track matrix to populate
* @param overhang_strategy the strategy to use for dealing with overhangs
*/
protected void calculateMatrix(final byte[] reference, final byte[] alternate, double[] sw, int[] btrack, final OVERHANG_STRATEGY overhang_strategy) {
if ( reference.length == 0 || alternate.length == 0 )
throw new IllegalArgumentException("Non-null, non-empty sequences are required for the Smith-Waterman calculation");
final int n = reference.length+1;
final int m = alternate.length+1;
//final double MATRIX_MIN_CUTOFF=-1e100; // never let matrix elements drop below this cutoff
final double MATRIX_MIN_CUTOFF; // never let matrix elements drop below this cutoff
if ( cutoff ) MATRIX_MIN_CUTOFF = 0.0;
else MATRIX_MIN_CUTOFF = -1e100;
double [] best_gap_v = new double[m+1];
Arrays.fill(best_gap_v,-1.0e40);
int [] gap_size_v = new int[m+1];
double [] best_gap_h = new double[n+1];
final double[] best_gap_v = new double[m+1];
Arrays.fill(best_gap_v, -1.0e40);
final int[] gap_size_v = new int[m+1];
final double[] best_gap_h = new double[n+1];
Arrays.fill(best_gap_h,-1.0e40);
int [] gap_size_h = new int[n+1];
final int[] gap_size_h = new int[n+1];
// we need to initialize the SW matrix with gap penalties if we want to keep track of indels at the edges of alignments
if ( overhang_strategy == OVERHANG_STRATEGY.INDEL ) {
// initialize the first row
sw[1] = parameters.w_open;
double currentValue = parameters.w_open;
for ( int i = 2; i < m; i++ ) {
currentValue += parameters.w_extend;
sw[i] = currentValue;
}
// initialize the first column
sw[m] = parameters.w_open;
currentValue = parameters.w_open;
for ( int i = 2; i < n; i++ ) {
currentValue += parameters.w_extend;
sw[i*m] = currentValue;
}
}
// build smith-waterman matrix and keep backtrack info:
for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at the end of this outer loop
byte a_base = a[i-1]; // letter in a at the current pos
byte a_base = reference[i-1]; // letter in a at the current pos
final int row_offset = row_offset_1 + m;
@ -157,10 +243,10 @@ public final class SWPairwiseAlignment {
// data_offset_1 is linearized offset of element [i-1][j-1]
final byte b_base = b[j-1]; // letter in b at the current pos
final byte b_base = alternate[j-1]; // letter in b at the current pos
// in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base);
double step_diag = sw[data_offset_1] + wd(a_base,b_base);
final double step_diag = sw[data_offset_1] + wd(a_base,b_base);
// optimized "traversal" of all the matrix cells above the current one (i.e. traversing
// all 'step down' events that would end in the current cell. The optimized code
@ -236,65 +322,92 @@ public final class SWPairwiseAlignment {
}
}
/*
* Class to store the result of calculating the CIGAR from the back track matrix
*/
protected final class SWPairwiseAlignmentResult {
public final Cigar cigar;
public final int alignment_offset;
public SWPairwiseAlignmentResult(final Cigar cigar, final int alignment_offset) {
this.cigar = cigar;
this.alignment_offset = alignment_offset;
}
}
private void calculateCigar(int n, int m, double [] sw, int [] btrack) {
/**
* Calculates the CIGAR for the alignment from the back track matrix
*
* @param refLength length of the reference sequence
* @param altLength length of the alternate sequence
* @param sw the Smith-Waterman matrix to use
* @param btrack the back track matrix to use
* @param overhang_strategy the strategy to use for dealing with overhangs
* @return non-null SWPairwiseAlignmentResult object
*/
protected SWPairwiseAlignmentResult calculateCigar(final int refLength, final int altLength, final double[] sw, final int[] btrack, final OVERHANG_STRATEGY overhang_strategy) {
// p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
int p1 = 0, p2 = 0;
double maxscore = Double.NEGATIVE_INFINITY; // sw scores are allowed to be negative
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
// look for largest score. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1, data_offset = m+1+m ; i < n+1 ; i++, data_offset += (m+1) ) {
// data_offset is the offset of [i][m]
if ( sw[data_offset] >= maxscore ) {
p1 = i; p2 = m ; maxscore = sw[data_offset];
// if we want to consider overhangs as legitimate operators, then just start from the corner of the matrix
if ( overhang_strategy == OVERHANG_STRATEGY.INDEL ) {
p1 = refLength;
p2 = altLength;
} else {
// look for largest score. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1, data_offset = altLength+1+altLength ; i < refLength+1 ; i++, data_offset += (altLength+1) ) {
// data_offset is the offset of [i][m]
if ( sw[data_offset] >= maxscore ) {
p1 = i; p2 = altLength ; maxscore = sw[data_offset];
}
}
}
for ( int j = 1, data_offset = n*(m+1)+1 ; j < m+1 ; j++, data_offset++ ) {
// data_offset is the offset of [n][j]
if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) {
p1 = n;
p2 = j ;
maxscore = sw[data_offset];
segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
for ( int j = 1, data_offset = refLength*(altLength+1)+1 ; j < altLength+1 ; j++, data_offset++ ) {
// data_offset is the offset of [n][j]
if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(refLength-j) < Math.abs(p1 - p2)) {
p1 = refLength;
p2 = j ;
maxscore = sw[data_offset];
segment_length = altLength - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
}
List<CigarElement> lce = new ArrayList<CigarElement>(5);
if ( segment_length > 0 && DO_SOFTCLIP ) {
lce.add(makeElement(CLIP, segment_length));
if ( segment_length > 0 && overhang_strategy == OVERHANG_STRATEGY.SOFTCLIP ) {
lce.add(makeElement(State.CLIP, segment_length));
segment_length = 0;
}
// we will be placing all insertions and deletions into sequence b, so the states are named w/regard
// to that sequence
int state = MSTATE;
State state = State.MATCH;
int data_offset = p1*(m+1)+p2; // offset of element [p1][p2]
int data_offset = p1*(altLength+1)+p2; // offset of element [p1][p2]
do {
int btr = btrack[data_offset];
int new_state;
State new_state;
int step_length = 1;
if ( btr > 0 ) {
new_state = DSTATE;
new_state = State.DELETION;
step_length = btr;
} else if ( btr < 0 ) {
new_state = ISTATE;
new_state = State.INSERTION;
step_length = (-btr);
} else new_state = MSTATE; // and step_length =1, already set above
} else new_state = State.MATCH; // and step_length =1, already set above
// move to next best location in the sw matrix:
switch( new_state ) {
case MSTATE: data_offset -= (m+2); p1--; p2--; break; // move back along the diag in the sw matrix
case ISTATE: data_offset -= step_length; p2 -= step_length; break; // move left
case DSTATE: data_offset -= (m+1)*step_length; p1 -= step_length; break; // move up
case MATCH: data_offset -= (altLength+2); p1--; p2--; break; // move back along the diag in the sw matrix
case INSERTION: data_offset -= step_length; p2 -= step_length; break; // move left
case DELETION: data_offset -= (altLength+1)*step_length; p1 -= step_length; break; // move up
}
// now let's see if the state actually changed:
@ -305,7 +418,7 @@ public final class SWPairwiseAlignment {
segment_length = step_length;
state = new_state;
}
// next condition is equivalent to while ( sw[p1][p2] != 0 ) (with modified p1 and/or p2:
// next condition is equivalent to while ( sw[p1][p2] != 0 ) (with modified p1 and/or p2:
} while ( p1 > 0 && p2 > 0 );
// post-process the last segment we are still keeping;
@ -316,28 +429,41 @@ public final class SWPairwiseAlignment {
// last 3 bases of the read overlap with/align to the ref), the cigar will be still 5M if
// overhang_strategy is IGNORE or 2S3M if overhang_strategy is SOFTCLIP.
// The consumers need to check for the alignment offset and deal with it properly.
if (DO_SOFTCLIP ) {
final int alignment_offset;
if ( overhang_strategy == OVERHANG_STRATEGY.SOFTCLIP ) {
lce.add(makeElement(state, segment_length));
if ( p2> 0 ) lce.add(makeElement(CLIP, p2));
alignment_offset = p1 ;
} else {
if ( p2 > 0 ) lce.add(makeElement(State.CLIP, p2));
alignment_offset = p1;
} else if ( overhang_strategy == OVERHANG_STRATEGY.IGNORE ) {
lce.add(makeElement(state, segment_length + p2));
alignment_offset = p1 - p2;
} else { // overhang_strategy == OVERHANG_STRATEGY.INDEL
// take care of the actual alignment
lce.add(makeElement(state, segment_length));
// take care of overhangs at the beginning of the alignment
if ( p1 > 0 )
lce.add(makeElement(State.DELETION, p1));
else if ( p2 > 0 )
lce.add(makeElement(State.INSERTION, p2));
alignment_offset = 0;
}
Collections.reverse(lce);
alignmentCigar = AlignmentUtils.consolidateCigar(new Cigar(lce));
return new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), alignment_offset);
}
private CigarElement makeElement(int state, int segment_length) {
CigarOperator o = null;
switch(state) {
case MSTATE: o = CigarOperator.M; break;
case ISTATE: o = CigarOperator.I; break;
case DSTATE: o = CigarOperator.D; break;
case CLIP: o = CigarOperator.S; break;
protected CigarElement makeElement(final State state, final int length) {
CigarOperator op = null;
switch (state) {
case MATCH: op = CigarOperator.M; break;
case INSERTION: op = CigarOperator.I; break;
case DELETION: op = CigarOperator.D; break;
case CLIP: op = CigarOperator.S; break;
}
return new CigarElement(segment_length,o);
return new CigarElement(length, op);
}
private double wd(byte x, byte y) {
@ -360,7 +486,7 @@ public final class SWPairwiseAlignment {
Cigar cigar = getCigar();
if ( ! DO_SOFTCLIP ) {
if ( overhang_strategy != OVERHANG_STRATEGY.SOFTCLIP ) {
// we need to go through all the hassle below only if we do not do softclipping;
// otherwise offset is never negative

View File

@ -0,0 +1,57 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.smithwaterman;
import net.sf.samtools.Cigar;
/**
 * Generic interface for SmithWaterman calculations
 *
 * This interface allows clients to hold a generic SmithWaterman variable, without propagating the
 * specific implementation of SmithWaterman throughout their code:
 *
 * SmithWaterman sw = new SpecificSmithWatermanImplementation(ref, read, params)
 * sw.getCigar()
 * sw.getAlignmentStart2wrt1()
 *
 * User: depristo
 * Date: 4/26/13
 * Time: 8:24 AM
 */
public interface SmithWaterman {

    /**
     * Get the cigar string for the alignment of this SmithWaterman class
     *
     * @return a non-null cigar
     */
    Cigar getCigar();

    /**
     * Get the starting position of the read sequence in the reference sequence
     *
     * @return a non-negative integer (>= 0).
     *         NOTE(review): implementations that do not soft-clip overhangs may compute this as a
     *         difference of two positions - confirm the >= 0 guarantee against each implementation.
     */
    int getAlignmentStart2wrt1();
}

View File

@ -97,7 +97,12 @@ public class MD5DB {
if ( ! dir.exists() ) {
System.out.printf("##### Creating MD5 db %s%n", LOCAL_MD5_DB_DIR);
if ( ! dir.mkdir() ) {
throw new ReviewedStingException("Infrastructure failure: failed to create md5 directory " + LOCAL_MD5_DB_DIR);
// Need to check AGAIN whether the dir exists, because we might be doing multi-process parallelism
// within the same working directory, and another GATK instance may have come along and created the
// directory between the calls to exists() and mkdir() above.
if ( ! dir.exists() ) {
throw new ReviewedStingException("Infrastructure failure: failed to create md5 directory " + LOCAL_MD5_DB_DIR);
}
}
}
}
@ -203,98 +208,106 @@ public class MD5DB {
}
public static class MD5Match {
final String actualMD5, expectedMD5;
final String failMessage;
boolean failed;
public final String actualMD5, expectedMD5;
public final String failMessage;
public final String diffEngineOutput;
public final boolean failed;
public MD5Match(final String actualMD5, final String expectedMD5, final String failMessage, final boolean failed) {
public MD5Match(final String actualMD5, final String expectedMD5, final String failMessage, final String diffEngineOutput, final boolean failed) {
this.actualMD5 = actualMD5;
this.expectedMD5 = expectedMD5;
this.failMessage = failMessage;
this.diffEngineOutput = diffEngineOutput;
this.failed = failed;
}
}
/**
* Tests a file MD5 against an expected value, returning the MD5. NOTE: This function WILL throw an exception if the MD5s are different.
* @param name Name of the test.
* Tests a file MD5 against an expected value, returning an MD5Match object containing a description of the
* match or mismatch. In case of a mismatch, outputs a description of the mismatch to various log files/streams.
*
* NOTE: This function WILL NOT throw an exception if the MD5s are different.
*
* @param testName Name of the test.
* @param testClassName Name of the class that contains the test.
* @param resultsFile File to MD5.
* @param expectedMD5 Expected MD5 value.
* @param parameterize If true or if expectedMD5 is an empty string, will print out the calculated MD5 instead of error text.
* @return The calculated MD5.
* @return an MD5Match object containing a description of the match/mismatch. Will have its "failed" field set
* to true if there was a mismatch (unless we're using the "parameterize" argument)
*/
public MD5Match assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) {
final String actualMD5 = testFileMD5(name, resultsFile, expectedMD5, parameterize);
String failMessage = null;
public MD5Match testFileMD5(final String testName, final String testClassName, final File resultsFile, final String expectedMD5, final boolean parameterize) {
final String actualMD5 = calculateFileMD5(resultsFile);
String diffEngineOutput = "";
String failMessage = "";
boolean failed = false;
// copy md5 to integrationtests
updateMD5Db(actualMD5, resultsFile);
if (parameterize || expectedMD5.equals("")) {
// Don't assert
} else if ( actualMD5.equals(expectedMD5) ) {
//BaseTest.log(String.format(" => %s PASSED (expected=%s)", name, expectedMD5));
} else {
BaseTest.log(String.format("PARAMETERIZATION: file %s has md5 = %s", resultsFile, actualMD5));
} else if ( ! expectedMD5.equals(actualMD5) ) {
failed = true;
failMessage = String.format("%s has mismatching MD5s: expected=%s observed=%s", name, expectedMD5, actualMD5);
failMessage = String.format("%s:%s has mismatching MD5s: expected=%s observed=%s", testClassName, testName, expectedMD5, actualMD5);
diffEngineOutput = logMD5MismatchAndGetDiffEngineOutput(testName, testClassName, expectedMD5, actualMD5);
}
return new MD5Match(actualMD5, expectedMD5, failMessage, failed);
return new MD5Match(actualMD5, expectedMD5, failMessage, diffEngineOutput, failed);
}
/**
* Tests a file MD5 against an expected value, returning the MD5. NOTE: This function WILL NOT throw an exception if the MD5s are different.
* @param name Name of the test.
* @param resultsFile File to MD5.
* @param expectedMD5 Expected MD5 value.
* @param parameterize If true or if expectedMD5 is an empty string, will print out the calculated MD5 instead of error text.
* @return The calculated MD5.
* Calculates the MD5 for the specified file and returns it as a String
*
* @param file file whose MD5 to calculate
* @return file's MD5 in String form
* @throws RuntimeException if the file could not be read
*/
public String testFileMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) {
public String calculateFileMD5( final File file ) {
try {
final String filemd5sum = Utils.calcMD5(getBytesFromFile(resultsFile));
//
// copy md5 to integrationtests
//
updateMD5Db(filemd5sum, resultsFile);
if (parameterize || expectedMD5.equals("")) {
BaseTest.log(String.format("PARAMETERIZATION: file %s has md5 = %s", resultsFile, filemd5sum));
} else {
//System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5));
//System.out.flush();
if ( ! expectedMD5.equals(filemd5sum) ) {
// we are going to fail for real in assertEquals (so we are counted by the testing framework).
// prepare ourselves for the comparison
System.out.printf("##### Test %s is going to fail #####%n", name);
String pathToExpectedMD5File = getMD5FilePath(expectedMD5, "[No DB file found]");
String pathToFileMD5File = getMD5FilePath(filemd5sum, "[No DB file found]");
BaseTest.log(String.format("expected %s", expectedMD5));
BaseTest.log(String.format("calculated %s", filemd5sum));
BaseTest.log(String.format("diff %s %s", pathToExpectedMD5File, pathToFileMD5File));
md5MismatchStream.printf("%s\t%s\t%s%n", expectedMD5, filemd5sum, name);
md5MismatchStream.flush();
// inline differences
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
final PrintStream ps = new PrintStream(baos);
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE, false);
boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params);
if ( success ) {
final String content = baos.toString();
BaseTest.log(content);
System.out.printf("Note that the above list is not comprehensive. At most 20 lines of output, and 10 specific differences will be listed. Please use -T DiffObjects -R public/testdata/exampleFASTA.fasta -m %s -t %s to explore the differences more freely%n",
pathToExpectedMD5File, pathToFileMD5File);
}
ps.close();
}
}
return filemd5sum;
} catch (Exception e) {
throw new RuntimeException("Failed to read bytes from calls file: " + resultsFile, e);
return Utils.calcMD5(getBytesFromFile(file));
}
catch ( Exception e ) {
throw new RuntimeException("Failed to read bytes from file: " + file + " for MD5 calculation", e);
}
}
/**
 * Logs a description (including diff engine output) of the MD5 mismatch between the expectedMD5
 * and actualMD5 to a combination of BaseTest.log(), the md5MismatchStream, and stdout, then returns
 * the diff engine output.
 *
 * The diff engine writes into an in-memory stream; that stream is flushed before its contents are
 * read and is always closed, even if the diff engine throws.
 *
 * @param testName name of the test that generated the mismatch
 * @param testClassName name of the class containing the test that generated the mismatch
 * @param expectedMD5 the MD5 we were expecting from this test
 * @param actualMD5 the MD5 we actually calculated from the test output
 * @return the diff engine output produced while logging the description of the mismatch, or an
 *         empty string if the diff engine did not report success
 */
private String logMD5MismatchAndGetDiffEngineOutput(final String testName, final String testClassName, final String expectedMD5, final String actualMD5) {
    System.out.printf("##### Test %s:%s is going to fail #####%n", testClassName, testName);

    // the placeholder string is used as the "path" when no file for the MD5 is found in the DB
    final String pathToExpectedMD5File = getMD5FilePath(expectedMD5, "[No DB file found]");
    final String pathToFileMD5File = getMD5FilePath(actualMD5, "[No DB file found]");

    BaseTest.log(String.format("expected   %s", expectedMD5));
    BaseTest.log(String.format("calculated %s", actualMD5));
    BaseTest.log(String.format("diff %s %s", pathToExpectedMD5File, pathToFileMD5File));

    // tab-separated record of the mismatch: expected, actual, test name
    md5MismatchStream.printf("%s\t%s\t%s%n", expectedMD5, actualMD5, testName);
    md5MismatchStream.flush();

    // inline differences
    String diffEngineOutput = "";
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final PrintStream ps = new PrintStream(baos);
    try {
        final DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE, false);
        final boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params);
        if ( success ) {
            // make sure everything the diff engine printed has reached the byte buffer before reading it
            ps.flush();
            diffEngineOutput = baos.toString();
            BaseTest.log(diffEngineOutput);
            System.out.printf("Note that the above list is not comprehensive.  At most 20 lines of output, and 10 specific differences will be listed.  Please use -T DiffObjects -R public/testdata/exampleFASTA.fasta -m %s -t %s to explore the differences more freely%n",
                    pathToExpectedMD5File, pathToFileMD5File);
        }
    } finally {
        // release the stream on every path, including when the diff engine throws
        ps.close();
    }

    return diffEngineOutput;
}
}

View File

@ -35,29 +35,32 @@ import java.util.List;
* @since Date created
*/
public class MD5Mismatch extends Exception {
final List<String> actuals, expecteds;
final List<String> actuals, expecteds, diffEngineOutputs;
public MD5Mismatch(final String actual, final String expected) {
this(Collections.singletonList(actual), Collections.singletonList(expected));
public MD5Mismatch(final String actual, final String expected, final String diffEngineOutput) {
this(Collections.singletonList(actual), Collections.singletonList(expected), Collections.singletonList(diffEngineOutput));
}
public MD5Mismatch(final List<String> actuals, final List<String> expecteds) {
super(formatMessage(actuals, expecteds));
public MD5Mismatch(final List<String> actuals, final List<String> expecteds, final List<String> diffEngineOutputs) {
super(formatMessage(actuals, expecteds, diffEngineOutputs));
this.actuals = actuals;
this.expecteds = expecteds;
this.diffEngineOutputs = diffEngineOutputs;
}
@Override
public String toString() {
return formatMessage(actuals, expecteds);
return formatMessage(actuals, expecteds, diffEngineOutputs);
}
private final static String formatMessage(final List<String> actuals, final List<String> expecteds) {
private static String formatMessage(final List<String> actuals, final List<String> expecteds, final List<String> diffEngineOutputs) {
final StringBuilder b = new StringBuilder("MD5 mismatch: ");
for ( int i = 0; i < actuals.size(); i++ ) {
if ( i > 1 ) b.append("\t\t\n");
if ( i >= 1 ) b.append("\t\t\n\n");
b.append("actual ").append(actuals.get(i));
b.append(" expected ").append(expecteds.get(i));
b.append("\nDiff Engine Output:\n");
b.append(diffEngineOutputs.get(i));
}
return b.toString();
}

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.JVMUtils;
import org.broadinstitute.variant.bcf2.BCF2Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.variant.vcf.VCFCodec;
@ -73,10 +74,6 @@ public class WalkerTest extends BaseTest {
return md5DB;
}
public MD5DB.MD5Match assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5) {
return getMd5DB().assertMatchingMD5(name, resultsFile, expectedMD5, parameterize());
}
public void validateOutputBCFIfPossible(final String name, final File resultFile) {
final File bcfFile = BCF2Utils.shadowBCF(resultFile);
if ( bcfFile != null && bcfFile.exists() ) {
@ -114,15 +111,15 @@ public class WalkerTest extends BaseTest {
}
}
public List<String> assertMatchingMD5s(final String name, List<File> resultFiles, List<String> expectedMD5s) {
public List<String> assertMatchingMD5s(final String testName, final String testClassName, List<File> resultFiles, List<String> expectedMD5s) {
List<String> md5s = new ArrayList<String>();
List<MD5DB.MD5Match> fails = new ArrayList<MD5DB.MD5Match>();
for (int i = 0; i < resultFiles.size(); i++) {
MD5DB.MD5Match result = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i));
validateOutputBCFIfPossible(name, resultFiles.get(i));
MD5DB.MD5Match result = getMd5DB().testFileMD5(testName, testClassName, resultFiles.get(i), expectedMD5s.get(i), parameterize());
validateOutputBCFIfPossible(testName, resultFiles.get(i));
if ( ! result.failed ) {
validateOutputIndex(name, resultFiles.get(i));
validateOutputIndex(testName, resultFiles.get(i));
md5s.add(result.expectedMD5);
} else {
fails.add(result);
@ -132,14 +129,17 @@ public class WalkerTest extends BaseTest {
if ( ! fails.isEmpty() ) {
List<String> actuals = new ArrayList<String>();
List<String> expecteds = new ArrayList<String>();
List<String> diffEngineOutputs = new ArrayList<String>();
for ( final MD5DB.MD5Match fail : fails ) {
actuals.add(fail.actualMD5);
expecteds.add(fail.expectedMD5);
diffEngineOutputs.add(fail.diffEngineOutput);
logger.warn("Fail: " + fail.failMessage);
}
final MD5Mismatch failure = new MD5Mismatch(actuals, expecteds);
Assert.fail(failure.toString(), failure);
final MD5Mismatch failure = new MD5Mismatch(actuals, expecteds, diffEngineOutputs);
Assert.fail(failure.toString());
}
return md5s;
@ -170,6 +170,9 @@ public class WalkerTest extends BaseTest {
boolean includeImplicitArgs = true;
boolean includeShadowBCF = true;
// Name of the test class that created this test case
private Class testClass;
// the default output path for the integration test
private File outputFileLocation = null;
@ -183,6 +186,7 @@ public class WalkerTest extends BaseTest {
this.args = args;
this.nOutputFiles = md5s.size();
this.md5s = md5s;
this.testClass = getCallingTestClass();
}
public WalkerTestSpec(String args, List<String> exts, List<String> md5s) {
@ -194,12 +198,22 @@ public class WalkerTest extends BaseTest {
this.nOutputFiles = md5s.size();
this.md5s = md5s;
this.exts = exts;
this.testClass = getCallingTestClass();
}
public WalkerTestSpec(String args, int nOutputFiles, Class expectedException) {
this.args = args;
this.nOutputFiles = nOutputFiles;
this.expectedException = expectedException;
this.testClass = getCallingTestClass();
}
/**
 * Looks up the class that invoked this spec's class by delegating to
 * JVMUtils.getCallingClass with this object's runtime class as the callee.
 *
 * @return whatever JVMUtils.getCallingClass reports as the caller of this object's class
 */
private Class getCallingTestClass() {
    final Class specClass = getClass();
    return JVMUtils.getCallingClass(specClass);
}
/**
 * @return the simple (unqualified) name of the class recorded in testClass
 */
public String getTestClassName() {
    final String simpleName = testClass.getSimpleName();
    return simpleName;
}
public String getArgsWithImplicitArgs() {
@ -306,7 +320,7 @@ public class WalkerTest extends BaseTest {
if ( spec.expectsException() ) {
// this branch handles the case were we are testing that a walker will fail as expected
return executeTest(name, spec.getOutputFileLocation(), null, tmpFiles, args, spec.getExpectedException());
return executeTest(name, spec.getTestClassName(), spec.getOutputFileLocation(), null, tmpFiles, args, spec.getExpectedException());
} else {
List<String> md5s = new LinkedList<String>();
md5s.addAll(spec.md5s);
@ -316,7 +330,7 @@ public class WalkerTest extends BaseTest {
md5s.add(md5);
tmpFiles.add(spec.auxillaryFiles.get(md5));
}
return executeTest(name, spec.getOutputFileLocation(), md5s, tmpFiles, args, null);
return executeTest(name, spec.getTestClassName(), spec.getOutputFileLocation(), md5s, tmpFiles, args, null);
}
}
@ -337,35 +351,37 @@ public class WalkerTest extends BaseTest {
/**
* execute the test, given the following:
* @param name the name of the test
* @param testName the name of the test
* @param testClassName the name of the class that contains the test
* @param md5s the list of md5s
* @param tmpFiles the temp file corresponding to the md5 list
* @param args the argument list
* @param expectedException the expected exception or null
* @return a pair of file and string lists
*/
private Pair<List<File>, List<String>> executeTest(String name, File outputFileLocation, List<String> md5s, List<File> tmpFiles, String args, Class expectedException) {
if ( md5s != null ) qcMD5s(name, md5s);
private Pair<List<File>, List<String>> executeTest(String testName, String testClassName, File outputFileLocation, List<String> md5s, List<File> tmpFiles, String args, Class expectedException) {
if ( md5s != null ) qcMD5s(testName, md5s);
if (outputFileLocation != null)
args += " -o " + outputFileLocation.getAbsolutePath();
executeTest(name, args, expectedException);
executeTest(testName, testClassName, args, expectedException);
if ( expectedException != null ) {
return null;
} else {
// we need to check MD5s
return new Pair<List<File>, List<String>>(tmpFiles, assertMatchingMD5s(name, tmpFiles, md5s));
return new Pair<List<File>, List<String>>(tmpFiles, assertMatchingMD5s(testName, testClassName, tmpFiles, md5s));
}
}
/**
* execute the test, given the following:
* @param name the name of the test
* @param args the argument list
* @param testName the name of the test
* @param testClassName the name of the class that contains the test
* @param args the argument list
* @param expectedException the expected exception or null
*/
private void executeTest(String name, String args, Class expectedException) {
private void executeTest(String testName, String testClassName, String args, Class expectedException) {
CommandLineGATK instance = new CommandLineGATK();
String[] command = Utils.escapeExpressions(args);
@ -374,7 +390,7 @@ public class WalkerTest extends BaseTest {
try {
final String now = new SimpleDateFormat("HH:mm:ss").format(new Date());
final String cmdline = Utils.join(" ",command);
System.out.println(String.format("[%s] Executing test %s with GATK arguments: %s", now, name, cmdline));
System.out.println(String.format("[%s] Executing test %s:%s with GATK arguments: %s", now, testClassName, testName, cmdline));
// also write the command line to the HTML log for convenient follow-up
// do the replaceAll so paths become relative to the current
BaseTest.log(cmdline.replaceAll(publicTestDirRoot, "").replaceAll(privateTestDirRoot, ""));
@ -388,8 +404,8 @@ public class WalkerTest extends BaseTest {
// it's the type we expected
//System.out.println(String.format(" => %s PASSED", name));
} else {
final String message = String.format("Test %s expected exception %s but instead got %s with error message %s",
name, expectedException, e.getClass(), e.getMessage());
final String message = String.format("Test %s:%s expected exception %s but instead got %s with error message %s",
testClassName, testName, expectedException, e.getClass(), e.getMessage());
if ( e.getCause() != null ) {
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
final PrintStream ps = new PrintStream(baos);
@ -409,7 +425,7 @@ public class WalkerTest extends BaseTest {
if ( expectedException != null ) {
if ( ! gotAnException )
// we expected an exception but didn't see it
Assert.fail(String.format("Test %s expected exception %s but none was thrown", name, expectedException.toString()));
Assert.fail(String.format("Test %s:%s expected exception %s but none was thrown", testClassName, testName, expectedException.toString()));
} else {
if ( CommandLineExecutable.result != 0) {
throw new RuntimeException("Error running the GATK with arguments: " + args);

View File

@ -0,0 +1,101 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileSpan;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.io.FileNotFoundException;
import java.util.*;
/**
 * Unit tests for ActiveRegionShardBalancer: many small, adjacent file pointers are fed to the
 * balancer and the tests expect exactly one shard per contig, each spanning the whole tiled range.
 */
public class ActiveRegionShardBalancerUnitTest extends BaseTest {
    // built in setup() from the sequence dictionary of an artificial SAM header
    private GenomeLocParser genomeLocParser;
    // left null by setup(); handed as-is to balancer.initialize()
    protected SAMDataSource readsDataSource;

    @BeforeClass
    public void setup() throws FileNotFoundException {
        final SAMFileHeader artificialHeader = ArtificialSAMUtils.createArtificialSamHeader(10, 0, 10000);
        genomeLocParser = new GenomeLocParser(artificialHeader.getSequenceDictionary());
        readsDataSource = null;
    }

    @Test
    public void testMergingManyContigs() {
        final List<SAMSequenceRecord> everyContig = genomeLocParser.getContigs().getSequences();
        executeTest(everyContig);
    }

    @Test
    public void testMergingAllPointersOnSingleContig() {
        final SAMSequenceRecord secondContig = genomeLocParser.getContigs().getSequences().get(1);
        executeTest(Arrays.asList(secondContig));
    }

    @Test
    public void testMergingMultipleDiscontinuousContigs() {
        final List<SAMSequenceRecord> everyContig = genomeLocParser.getContigs().getSequences();
        final SAMSequenceRecord second = everyContig.get(1);
        final SAMSequenceRecord fourth = everyContig.get(3);
        executeTest(Arrays.asList(second, fourth));
    }

    /**
     * Tiles each record with consecutive 10-wide locations (one FilePointer per tile), runs the
     * balancer over all pointers, and checks that the i-th shard holds exactly the single location
     * covering the i-th record from 0 to the last tile's end, and that the number of shards equals
     * the number of records.
     *
     * @param records the contigs to tile and balance
     */
    private void executeTest(final Collection<SAMSequenceRecord> records) {
        final ActiveRegionShardBalancer balancer = new ActiveRegionShardBalancer();
        final int tileWidth = 10;

        final List<Set<GenomeLoc>> expectedLocs = new LinkedList<>();
        final List<FilePointer> pointers = new LinkedList<>();

        for ( final SAMSequenceRecord contig : records ) {
            int lastStop = 0;
            for ( int start = 0; start < contig.getSequenceLength(); start += tileWidth ) {
                lastStop = start + tileWidth - 1;
                final GenomeLoc tile = genomeLocParser.createGenomeLoc(contig.getSequenceName(), start, lastStop);
                final Map<SAMReaderID, SAMFileSpan> noSpans = Collections.emptyMap();
                pointers.add(new FilePointer(noSpans, Collections.singletonList(tile)));
            }
            // one merged location per contig, ending where the last tile ended
            final GenomeLoc merged = genomeLocParser.createGenomeLoc(contig.getSequenceName(), 0, lastStop);
            expectedLocs.add(Collections.singleton(merged));
        }

        balancer.initialize(readsDataSource, pointers.iterator(), genomeLocParser);

        int shardIndex = 0;
        for ( final Shard shard : balancer ) {
            Assert.assertEquals(new HashSet<>(shard.getGenomeLocs()), expectedLocs.get(shardIndex));
            shardIndex++;
        }

        Assert.assertEquals(shardIndex, records.size(), "Didn't find exactly one shard for each contig in the sequence dictionary");
    }
}

View File

@ -77,7 +77,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
@DataProvider(name = "TraversalEngineProvider")
public Object[][] makeTraversals() {
final List<Object[]> traversals = new LinkedList<Object[]>();
traversals.add(new Object[]{new TraverseActiveRegions<Integer, Integer>()});
traversals.add(new Object[]{new TraverseActiveRegions<>()});
return traversals.toArray(new Object[][]{});
}
@ -490,7 +490,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
traverseActiveRegions.initialize(engine, walker);
List<LocusShardDataProvider> providers = new ArrayList<LocusShardDataProvider>();
for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) {
for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ActiveRegionShardBalancer())) {
for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) {
providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList<ReferenceOrderedDataSource>()));
}
@ -523,8 +523,8 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
final int maxTests = Integer.MAX_VALUE;
int nTests = 0;
for ( final int readLength : Arrays.asList(10, 100) ) {
for ( final int skips : Arrays.asList(0, 1, 10) ) {
for ( final int readLength : Arrays.asList(100) ) {
for ( final int skips : Arrays.asList(0, 10) ) {
for ( final int start : starts ) {
for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) {
for ( final int nLoci : Arrays.asList(1, 1000) ) {
@ -536,7 +536,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())) {
nTests++;
if ( nTests < maxTests ) // && nTests == 1238 )
tests.add(new Object[]{nTests, activeRegions, readStates, bamBuilder});
tests.add(new Object[]{new TraverseActiveRegions<>(), nTests, activeRegions, readStates, bamBuilder});
}
}
}
@ -586,7 +586,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
@Test(enabled = true && ! DEBUG, dataProvider = "CombinatorialARTTilingProvider")
public void testARTReadsInActiveRegions(final int id, final GenomeLocSortedSet activeRegions, final EnumSet<ActiveRegionReadState> readStates, final ArtificialBAMBuilder bamBuilder) {
public void testARTReadsInActiveRegions(final TraverseActiveRegions<Integer, Integer> traversal, final int id, final GenomeLocSortedSet activeRegions, final EnumSet<ActiveRegionReadState> readStates, final ArtificialBAMBuilder bamBuilder) {
logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder);
final List<GenomeLoc> intervals = Arrays.asList(
genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())
@ -595,7 +595,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false);
walker.setStates(readStates);
final TraverseActiveRegions traversal = new TraverseActiveRegions<Integer, Integer>();
final Map<GenomeLoc, ActiveRegion> activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile());
final Set<String> alreadySeenReads = new HashSet<String>(); // for use with the primary / non-primary
@ -640,8 +639,8 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
//
// ---------------------------------------------------------------------------------------------------------
@Test(enabled = true && ! DEBUG)
public void ensureAllInsertionReadsAreInActiveRegions() {
@Test(dataProvider = "TraversalEngineProvider", enabled = true && ! DEBUG)
public void ensureAllInsertionReadsAreInActiveRegions(final TraverseActiveRegions<Integer, Integer> traversal) {
final int readLength = 10;
final int start = 20;
@ -667,7 +666,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false);
final TraverseActiveRegions traversal = new TraverseActiveRegions<Integer, Integer>();
final Map<GenomeLoc, ActiveRegion> activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile());
final ActiveRegion region = activeRegionsMap.values().iterator().next();

View File

@ -29,6 +29,7 @@ import org.apache.commons.io.FileUtils;
import org.broadinstitute.sting.utils.io.IOUtils;
import org.testng.Assert;
import org.broadinstitute.sting.BaseTest;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
@ -189,4 +190,50 @@ public class UtilsUnitTest extends BaseTest {
final String sourceString = FileUtils.readFileToString(source);
Assert.assertEquals(Utils.calcMD5(sourceString), sourceMD5);
}
/**
 * Checks Utils.longestCommonPrefix and Utils.longestCommonSuffix over a grid of inputs.
 *
 * Two sequences share a run of prefixLen 'A' bases and then diverge: the first continues with
 * extraSeq1Len 'C' bases, the second with extraSeq2Len 'G' bases (mirrored for the suffix case).
 * The expected answer is the shared run length capped at max.
 *
 * The second sequence is built with extraSeq2Len (it previously reused extraSeq1Len, leaving
 * extraSeq2Len unused), so sequences of different lengths are now covered as well.
 */
@Test
public void testLongestCommonOps() {
    for ( int prefixLen = 0; prefixLen < 20; prefixLen++ ) {
        // the shared run depends only on prefixLen, so build it once per outer iteration
        final String shared = Utils.dupString("A", prefixLen);
        for ( int extraSeq1Len = 0; extraSeq1Len < 10; extraSeq1Len++ ) {
            final String extra1 = Utils.dupString("C", extraSeq1Len);
            for ( int extraSeq2Len = 0; extraSeq2Len < 10; extraSeq2Len++ ) {
                final String extra2 = Utils.dupString("G", extraSeq2Len);

                // shared run at the front for the prefix check, at the back for the suffix check
                final String prefixSeq1 = shared + extra1;
                final String prefixSeq2 = shared + extra2;
                final String suffixSeq1 = extra1 + shared;
                final String suffixSeq2 = extra2 + shared;

                for ( int max = 0; max < 50; max++ ) {
                    final int expected = Math.min(prefixLen, max);
                    Assert.assertEquals(Utils.longestCommonPrefix(prefixSeq1.getBytes(), prefixSeq2.getBytes(), max), expected, "LongestCommonPrefix failed: seq1 " + prefixSeq1 + " seq2 " + prefixSeq2 + " max " + max);
                    Assert.assertEquals(Utils.longestCommonSuffix(suffixSeq1.getBytes(), suffixSeq2.getBytes(), max), expected, "longestCommonSuffix failed: seq1 " + suffixSeq1 + " seq2 " + suffixSeq2 + " max " + max);
                }
            }
        }
    }
}
/**
 * Supplies (string, frontTrim, backTrim) triples for the "trim" data provider: the string is
 * always "AAAA", and every front/back pair with both values below the string length and a
 * combined trim no larger than the string length is emitted.
 */
@DataProvider(name = "trim")
public Object[][] createTrimTestData() {
    final String bases = "AAAA";
    final int length = bases.length();
    final List<Object[]> cases = new ArrayList<Object[]>();

    for ( int front = 0; front < length; front++ ) {
        // the combined-trim bound only gets tighter as back grows, so it can end the inner loop
        for ( int back = 0; back < length && front + back <= length; back++ ) {
            cases.add(new Object[]{bases, front, back});
        }
    }

    return cases.toArray(new Object[cases.size()][]);
}
/**
 * Checks that Utils.trimArray returns an array whose length is the input length minus the
 * requested front and back trims.
 *
 * @param s         the string whose bytes are trimmed
 * @param frontTrim number of elements requested to be trimmed from the front
 * @param backTrim  number of elements requested to be trimmed from the back
 */
@Test(dataProvider = "trim", enabled = true)
public void testTrim(final String s, final int frontTrim, final int backTrim) {
    final int expectedLength = s.length() - frontTrim - backTrim;
    final int actualLength = Utils.trimArray(s.getBytes(), frontTrim, backTrim).length;
    // TestNG's assertEquals takes (actual, expected); passing them in that order keeps the
    // "expected X but found Y" failure text truthful
    Assert.assertEquals(actualLength, expectedLength,
            "Wrong trimmed length for s=" + s + " frontTrim=" + frontTrim + " backTrim=" + backTrim);
}
}

View File

@ -0,0 +1,75 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.classloader;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
 * Unit tests for JVMUtils.getCallingClass().
 *
 * Uses a fixed call chain JVMUtilsUnitTest -> DummyTestClass1 -> DummyTestClass2 ->
 * DummyTestClass3 -> JVMUtils.getCallingClass(callee) and checks which class is reported as the
 * caller of each link in that chain.
 */
public class JVMUtilsUnitTest {

    // Test classes used by the tests for JVMUtils.getCallingClass():

    private static class DummyTestClass1 {
        public static Class<?> getCaller( final Class<?> callee ) {
            return DummyTestClass2.getCaller(callee);
        }
    }

    private static class DummyTestClass2 {
        public static Class<?> getCaller( final Class<?> callee ) {
            return DummyTestClass3.getCaller(callee);
        }
    }

    private static class DummyTestClass3 {
        public static Class<?> getCaller( final Class<?> callee ) {
            return JVMUtils.getCallingClass(callee);
        }
    }

    /**
     * @return rows of { callee, expected caller }, one per link of the dummy call chain
     */
    @DataProvider( name = "TestGetCallingClassDataProvider" )
    public Object[][] getTestCallingClassTestData() {
        return new Object[][] {
            { DummyTestClass1.class, JVMUtilsUnitTest.class },
            { DummyTestClass2.class, DummyTestClass1.class },
            { DummyTestClass3.class, DummyTestClass2.class }
        };
    }

    /**
     * Runs the full dummy call chain and checks the class reported as the caller of callee.
     *
     * @param callee         the class whose caller is requested
     * @param expectedCaller the class that calls callee in the dummy chain
     */
    @Test( dataProvider = "TestGetCallingClassDataProvider" )
    public void testGetCallingClass( final Class<?> callee, final Class<?> expectedCaller ) {
        final Class<?> reportedCaller = DummyTestClass1.getCaller(callee);

        Assert.assertEquals(reportedCaller, expectedCaller,
                String.format("Wrong calling class returned from DummyTestClass1.getCaller(%s)", callee.getSimpleName()));
    }

    @Test( expectedExceptions = IllegalArgumentException.class )
    public void testGetCallingClassCalleeNotFound() {
        // Trying to get the calling class of a class not on the runtime stack should produce an exception.
        JVMUtils.getCallingClass(DummyTestClass1.class);
    }
}

View File

@ -33,8 +33,10 @@ import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
@ -86,6 +88,30 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
/**
 * Supplies (originalReadLength, nToClip) pairs for the "ClippedReadLengthData" provider: the
 * read length is fixed at 50 and nToClip runs from 1 up to, but not including, 49.
 */
@DataProvider(name = "ClippedReadLengthData")
public Object[][] makeClippedReadLengthData() {
    final int originalReadLength = 50;
    final int clipLimit = originalReadLength - 1; // exclusive upper bound on nToClip

    final List<Object[]> cases = new ArrayList<Object[]>();
    int nToClip = 1;
    while ( nToClip < clipLimit ) {
        cases.add(new Object[]{originalReadLength, nToClip});
        nToClip++;
    }

    return cases.toArray(new Object[cases.size()][]);
}
/**
 * Hard clips the first nToClip bases of an all-match read and checks that the clipped read
 * reports the shortened length.
 *
 * @param originalReadLength length of the read before clipping
 * @param nToClip            number of bases clipped, starting at read coordinate 0
 */
@Test(dataProvider = "ClippedReadLengthData", enabled = true)
public void testHardClipReadLengthIsRight(final int originalReadLength, final int nToClip) {
    final String cigarString = originalReadLength + "M";
    GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigarString);
    read.getReadLength(); // provoke the caching of the read length

    final int lastClippedOffset = nToClip - 1;
    GATKSAMRecord clipped = ReadClipper.hardClipByReadCoordinates(read, 0, lastClippedOffset);

    final int expectedReadLength = originalReadLength - nToClip;
    final int clippedLength = clipped.getReadLength();
    final String failureMessage = String.format("Clipped read length %d with cigar %s not equal to the expected read length %d after clipping %d bases from the left from a %d bp read with cigar %s",
            clippedLength, clipped.getCigar(), expectedReadLength, nToClip, read.getReadLength(), read.getCigar());

    Assert.assertEquals(clippedLength, expectedReadLength, failureMessage);
}
@Test(enabled = true)
public void testHardClipByReferenceCoordinates() {
for (Cigar cigar : cigarList) {

View File

@ -0,0 +1,88 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.smithwaterman;
import com.google.caliper.Param;
import com.google.caliper.SimpleBenchmark;
import org.broadinstitute.sting.utils.Utils;
/**
 * Caliper microbenchmark of Smith-Waterman pairwise alignment.
 *
 * Compares SWPairwiseAlignment ("Original") with GlobalEdgeGreedySWPairwiseAlignment ("Greedy")
 * on a reference/haplotype pair that is identical except for two single-base mismatches, for
 * several sizes of the flanking and middle regions.
 */
public class SmithWatermanBenchmark extends SimpleBenchmark {

    @Param({"Original", "Greedy"})
    String version; // set automatically by framework

    @Param({"10", "50", "100", "500"})
    int sizeOfMiddleRegion; // set automatically by framework

    @Param({"10", "50", "100", "500"})
    int sizeOfEndRegions; // set automatically by framework

    String refString;
    String hapString;

    /**
     * Builds the two sequences to align: end region, mismatch, middle region, mismatch, end
     * region. All matching bases are 'A'; the mismatching positions are 'X' in the reference
     * and 'Y' in the haplotype.
     */
    @Override protected void setUp() {
        final StringBuilder ref = new StringBuilder();
        final StringBuilder hap = new StringBuilder();

        ref.append(Utils.dupString('A', sizeOfEndRegions));
        hap.append(Utils.dupString('A', sizeOfEndRegions));

        // introduce a SNP
        ref.append("X");
        hap.append("Y");

        ref.append(Utils.dupString('A', sizeOfMiddleRegion));
        hap.append(Utils.dupString('A', sizeOfMiddleRegion));

        // introduce a SNP
        ref.append("X");
        hap.append("Y");

        ref.append(Utils.dupString('A', sizeOfEndRegions));
        hap.append(Utils.dupString('A', sizeOfEndRegions));

        refString = ref.toString();
        hapString = hap.toString();
    }

    /**
     * Timed body: aligns the haplotype against the reference rep times with the implementation
     * selected by version and asks each alignment for its cigar.
     *
     * @param rep number of repetitions requested by Caliper
     */
    public void timeSW(int rep) {
        // convert once up front so the timed loop is not charged for String-to-byte[] conversion
        final byte[] refBases = refString.getBytes();
        final byte[] hapBases = hapString.getBytes();

        for ( int i = 0; i < rep; i++ ) {
            final SmithWaterman sw;
            if ( version.equals("Greedy") )
                sw = new GlobalEdgeGreedySWPairwiseAlignment(refBases, hapBases);
            else
                sw = new SWPairwiseAlignment(refBases, hapBases);
            sw.getCigar();
        }
    }

    public static void main(String[] args) {
        com.google.caliper.Runner.main(SmithWatermanBenchmark.class, args);
    }
}

View File

@ -113,7 +113,7 @@ object PipelineTest extends BaseTest with Logging {
private def assertMatchingMD5s(name: String, fileMD5s: Traversable[(File, String)], parameterize: Boolean) {
var failed = 0
for ((file, expectedMD5) <- fileMD5s) {
val calculatedMD5 = md5DB.testFileMD5(name, file, expectedMD5, parameterize)
val calculatedMD5 = md5DB.testFileMD5(name, "", file, expectedMD5, parameterize).actualMD5
if (!parameterize && expectedMD5 != "" && expectedMD5 != calculatedMD5)
failed += 1
}

View File

@ -1,3 +0,0 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="picard" revision="1.90.1442" status="release" />
</ivy-module>

View File

@ -0,0 +1,3 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="picard" revision="1.91.1453" status="release" />
</ivy-module>

View File

@ -1,3 +0,0 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="sam" revision="1.90.1442" status="release" />
</ivy-module>

View File

@ -0,0 +1,3 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="sam" revision="1.91.1453" status="release" />
</ivy-module>

View File

@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="org.broad" module="tribble" revision="1.90.1442" status="integration" />
<info organisation="org.broad" module="tribble" revision="1.91.1453" status="integration" />
</ivy-module>

View File

@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="org.broadinstitute" module="variant" revision="1.90.1446" status="integration" />
<info organisation="org.broadinstitute" module="variant" revision="1.91.1453" status="integration" />
</ivy-module>