diff --git a/build.xml b/build.xml
index 56bf4f0cd..2e9df4d5e 100644
--- a/build.xml
+++ b/build.xml
@@ -1031,6 +1031,7 @@
+
@@ -1043,6 +1044,7 @@
+
@@ -1078,6 +1080,7 @@
+
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
index 0a4899f1c..5a2cdc7a6 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
@@ -219,6 +219,10 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
+ @Hidden
+ @Argument(fullName = "force_readgroup", shortName = "fRG", required = false, doc = "If provided, the read group of EVERY read will be forced to be the provided String.")
+ public String FORCE_READGROUP = null;
+
@Hidden
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", defaultToStdout = false)
public PrintStream RECAL_TABLE_UPDATE_LOG = null;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
index 71910e566..eb55701ae 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
@@ -64,6 +64,7 @@ import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@@ -236,6 +237,15 @@ public class ReduceReads extends ReadWalker, Redu
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
public int downsampleCoverage = 250;
+ /**
+ * Generally, this tool is not meant to be run for more than 1 sample at a time. The one valid exception
+ * brought to our attention by colleagues is the specific case of tumor/normal pairs in cancer analysis.
+ * To prevent users from unintentionally running the tool in a less than ideal manner, we require them
+ * to explicitly enable multi-sample analysis with this argument.
+ */
+ @Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "enable multi-samples reduction for cancer analysis", required = false)
+ public boolean ALLOW_MULTIPLE_SAMPLES = false;
+
@Hidden
@Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
public boolean nwayout = false;
@@ -294,6 +304,9 @@ public class ReduceReads extends ReadWalker, Redu
if ( minAltProportionToTriggerVariant < 0.0 || minAltProportionToTriggerVariant > 1.0 )
throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
+ if ( SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()).size() > 1 && !ALLOW_MULTIPLE_SAMPLES )
+ throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis");
+
if ( known.isEmpty() )
knownSnpPositions = null;
else
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
index d3ca037be..8843d6270 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
@@ -877,6 +877,10 @@ public class SlidingWindow {
final int start = region.getStart() - windowHeaderStart;
int stop = region.getStop() - windowHeaderStart;
+ // make sure the bitset is complete given the region (it might not be in multi-sample mode)
+ if ( region.getStop() > markedSites.getStartLocation() + markedSites.getVariantSiteBitSet().length )
+ markSites(region.getStop());
+
CloseVariantRegionResult closeVariantRegionResult = closeVariantRegion(start, stop, knownSnpPositions);
allReads.addAll(closeVariantRegionResult.reads);
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java
new file mode 100644
index 000000000..063e3b218
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java
@@ -0,0 +1,142 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.TreeSet;
+
+/**
+ * Trim down an active region based on a set of variants found across the haplotypes within the region
+ *
+ * User: depristo
+ * Date: 4/27/13
+ * Time: 2:10 PM
+ */
+class ActiveRegionTrimmer {
+ private final static Logger logger = Logger.getLogger(ActiveRegionTrimmer.class);
+ private final boolean logTrimming;
+ private final int snpPadding, nonSnpPadding, maxDistanceInExtensionForGenotyping;
+ private final GenomeLocParser parser;
+
+ /**
+ * Create a new ActiveRegionTrimmer
+ *
+ * @param logTrimming should we log our trimming events?
+ * @param snpPadding how much bp context should we ensure around snps?
+ * @param nonSnpPadding how much bp context should we ensure around anything not a snp?
+ * @param maxDistanceInExtensionForGenotyping the max extent we are willing to go into the extended region of the
+ * original active region in order to properly genotype events in the
+ * non-extended active region
+ * @param parser a genome loc parser so we can create genome locs
+ */
+ ActiveRegionTrimmer(boolean logTrimming, int snpPadding, int nonSnpPadding, int maxDistanceInExtensionForGenotyping, GenomeLocParser parser) {
+ if ( snpPadding < 0 ) throw new IllegalArgumentException("snpPadding must be >= 0 but got " + snpPadding);
+ if ( nonSnpPadding < 0 ) throw new IllegalArgumentException("nonSnpPadding must be >= 0 but got " + nonSnpPadding);
+ if ( maxDistanceInExtensionForGenotyping < 0 ) throw new IllegalArgumentException("maxDistanceInExtensionForGenotyping must be >= 0 but got " + maxDistanceInExtensionForGenotyping);
+ if ( parser == null ) throw new IllegalArgumentException("parser cannot be null");
+
+ this.logTrimming = logTrimming;
+ this.snpPadding = snpPadding;
+ this.nonSnpPadding = nonSnpPadding;
+ this.maxDistanceInExtensionForGenotyping = maxDistanceInExtensionForGenotyping;
+ this.parser = parser;
+ }
+
+ /**
+ * Trim down the active region to a region large enough to properly genotype the events found within the active
+ * region span, excluding all variants that only occur within its extended span.
+ *
+ * This function merely creates the region, but it doesn't populate the reads back into the region.
+ *
+ * @param region our full active region
+ * @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position
+ * @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully
+ */
+ public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion) {
+ if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, so just return the current region
+ return null;
+
+ final List withinActiveRegion = new LinkedList();
+ int pad = snpPadding;
+ GenomeLoc trimLoc = null;
+ for ( final VariantContext vc : allVariantsWithinExtendedRegion ) {
+ final GenomeLoc vcLoc = parser.createGenomeLoc(vc);
+ if ( region.getLocation().overlapsP(vcLoc) ) {
+ if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding
+ pad = nonSnpPadding;
+ trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc);
+ withinActiveRegion.add(vc);
+ }
+ }
+
+ // we don't actually have anything in the region after removing variants that don't overlap the region's full location
+ if ( trimLoc == null ) return null;
+
+ final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping);
+ final GenomeLoc idealSpan = parser.createPaddedGenomeLoc(trimLoc, pad);
+ final GenomeLoc finalSpan = maxSpan.intersect(idealSpan);
+
+ final ActiveRegion trimmedRegion = region.trim(finalSpan);
+ if ( logTrimming ) {
+ logger.info("events : " + withinActiveRegion);
+ logger.info("trimLoc : " + trimLoc);
+ logger.info("pad : " + pad);
+ logger.info("idealSpan : " + idealSpan);
+ logger.info("maxSpan : " + maxSpan);
+ logger.info("finalSpan : " + finalSpan);
+ logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size());
+ }
+ return trimmedRegion;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
index 5a5946183..48972dfd5 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
@@ -46,101 +46,53 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
-import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
-import net.sf.samtools.Cigar;
-import net.sf.samtools.CigarElement;
-import net.sf.samtools.CigarOperator;
-import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
-import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
-import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
import org.broadinstitute.sting.utils.MathUtils;
-import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
-import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
-import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
-import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
-import org.broadinstitute.variant.variantcontext.Allele;
-import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
-import java.util.*;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
/**
- * Created by IntelliJ IDEA.
+ * DeBruijn assembler for the HaplotypeCaller
+ *
* User: ebanks, rpoplin
* Date: Mar 14, 2011
*/
-
public class DeBruijnAssembler extends LocalAssemblyEngine {
private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class);
- private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
-
// TODO -- this number is very low, and limits our ability to explore low-frequency variants. It should
// TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where
// TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases
- private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25;
+ private final static int NUM_PATHS_PER_GRAPH = 25;
+ private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
private static final int GRAPH_KMER_STEP = 6;
- private final boolean debug;
- private final boolean debugGraphTransformations;
private final int minKmer;
- private final boolean allowCyclesInKmerGraphToGeneratePaths;
-
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
-
protected DeBruijnAssembler() {
- this(false, -1, 11, false);
+ this(25, -1);
}
- public DeBruijnAssembler(final boolean debug,
- final int debugGraphTransformations,
- final int minKmer,
- final boolean allowCyclesInKmerGraphToGeneratePaths) {
- super();
- this.debug = debug;
- this.debugGraphTransformations = debugGraphTransformations > 0;
- this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations;
+ public DeBruijnAssembler(final int minKmer, final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) {
+ super(NUM_PATHS_PER_GRAPH);
this.minKmer = minKmer;
- this.allowCyclesInKmerGraphToGeneratePaths = allowCyclesInKmerGraphToGeneratePaths;
+ this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
}
- /**
- * Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
- * @param activeRegion ActiveRegion object holding the reads which are to be used during assembly
- * @param refHaplotype reference haplotype object
- * @param fullReferenceWithPadding byte array holding the reference sequence with padding
- * @param refLoc GenomeLoc object corresponding to the reference sequence with padding
- * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode
- * @return a non-empty list of all the haplotypes that are produced during assembly
- */
- @Ensures({"result.contains(refHaplotype)"})
- public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype ) {
- if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); }
- if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); }
- if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
- if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
-
- // create the graphs
- final List graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
-
- // print the graphs if the appropriate debug option has been turned on
- if( graphWriter != null ) {
- printGraphs(graphs);
- }
-
- // find the best paths in the graphs and return them as haplotypes
- return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
- }
-
- @Requires({"reads != null", "refHaplotype != null"})
- protected List createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) {
+ @Override
+ protected List assemble(final List reads, final Haplotype refHaplotype) {
final List graphs = new LinkedList();
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
@@ -165,10 +117,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
" future subsystem will actually go and error correct the reads");
}
- final SeqGraph seqGraph = toSeqGraph(graph);
+ final SeqGraph seqGraph = cleanupSeqGraph(graph.convertToSequenceGraph());
if ( seqGraph != null ) { // if the graph contains interesting variation from the reference
- sanityCheckReferenceGraph(seqGraph, refHaplotype);
graphs.add(seqGraph);
if ( debugGraphTransformations ) // we only want to use one graph size
@@ -181,69 +132,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
return graphs;
}
- private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) {
- final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph();
- if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
-
- // TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm
- // TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect
- // TODO -- to anything from one that's actually has good support along the chain but just happens
- // TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately
- // TODO -- the pruning algorithm really should be an error correction algorithm that knows more
- // TODO -- about the structure of the data and can differentiate between an infrequent path but
- // TODO -- without evidence against it (such as occurs when a region is hard to get any reads through)
- // TODO -- from a error with lots of weight going along another similar path
- // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
- seqGraph.zipLinearChains();
- if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor);
-
- // now go through and prune the graph, removing vertices no longer connected to the reference chain
- // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight
- // edges to maintain graph connectivity.
- seqGraph.pruneGraph(pruneFactor);
- seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
-
- if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor);
- seqGraph.simplifyGraph();
- if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor);
-
- // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
- // happen in cases where for example the reference somehow manages to acquire a cycle, or
- // where the entire assembly collapses back into the reference sequence.
- if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null )
- return null;
-
- seqGraph.removePathsNotConnectedToRef();
- seqGraph.simplifyGraph();
- if ( seqGraph.vertexSet().size() == 1 ) {
- // we've perfectly assembled into a single reference haplotype, add a empty seq vertex to stop
- // the code from blowing up.
- // TODO -- ref properties should really be on the vertices, not the graph itself
- final SeqVertex complete = seqGraph.vertexSet().iterator().next();
- final SeqVertex dummy = new SeqVertex("");
- seqGraph.addVertex(dummy);
- seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0));
- }
- if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor);
-
- return seqGraph;
- }
-
- protected void sanityCheckReferenceGraph(final BaseGraph graph, final Haplotype refHaplotype) {
- if( graph.getReferenceSourceVertex() == null ) {
- throw new IllegalStateException("All reference graphs must have a reference source vertex.");
- }
- if( graph.getReferenceSinkVertex() == null ) {
- throw new IllegalStateException("All reference graphs must have a reference sink vertex.");
- }
- if( !Arrays.equals(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true), refHaplotype.getBases()) ) {
- throw new IllegalStateException("Mismatch between the reference haplotype and the reference assembly graph path." +
- " graph = " + new String(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true)) +
- " haplotype = " + new String(refHaplotype.getBases())
- );
- }
- }
-
@Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"})
protected DeBruijnGraph createGraphFromSequences( final List reads, final int kmerLength, final Haplotype refHaplotype ) {
final DeBruijnGraph graph = new DeBruijnGraph(kmerLength);
@@ -344,290 +232,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
return true;
}
- protected void printGraphs(final List graphs) {
- final int writeFirstGraphWithSizeSmallerThan = 50;
-
- graphWriter.println("digraph assemblyGraphs {");
- for( final SeqGraph graph : graphs ) {
- if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) {
- logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize());
- continue;
- }
-
- graph.printGraph(graphWriter, false, pruneFactor);
-
- if ( debugGraphTransformations )
- break;
- }
-
- graphWriter.println("}");
- }
-
- @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
- @Ensures({"result.contains(refHaplotype)"})
- private List findBestPaths( final List graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
-
- // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
- // TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm
- final List returnHaplotypes = new ArrayList();
- refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
- final Cigar c = new Cigar();
- c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
- refHaplotype.setCigar(c);
- returnHaplotypes.add( refHaplotype );
-
- final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
- final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
-
- // for GGA mode, add the desired allele into the haplotype
- for( final VariantContext compVC : activeAllelesToGenotype ) {
- for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
- final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
- addHaplotypeForGGA( insertedRefHaplotype, refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true );
- }
- }
-
- for( final SeqGraph graph : graphs ) {
- final SeqVertex source = graph.getReferenceSourceVertex();
- final SeqVertex sink = graph.getReferenceSinkVertex();
- if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);
-
- final KBestPaths pathFinder = new KBestPaths(allowCyclesInKmerGraphToGeneratePaths);
- for ( final Path path : pathFinder.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH, source, sink) ) {
-// logger.info("Found path " + path);
- Haplotype h = new Haplotype( path.getBases() );
- if( !returnHaplotypes.contains(h) ) {
- final Cigar cigar = path.calculateCigar();
- if( cigar.isEmpty() ) {
- throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength());
- } else if ( pathIsTooDivergentFromReference(cigar) || cigar.getReferenceLength() < 60 ) { // N cigar elements means that a bubble was too divergent from the reference so skip over this path
- continue;
- } else if( cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // SW failure
- throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength());
- }
- h.setCigar(cigar);
-
- // extend partial haplotypes which are anchored in the reference to include the full active region
- h = extendPartialHaplotype(h, activeRegionStart, refWithPadding);
- final Cigar leftAlignedCigar = leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(h.getCigar()), refWithPadding, h.getBases(), activeRegionStart, 0);
- if( !returnHaplotypes.contains(h) ) {
- h.setAlignmentStartHapwrtRef(activeRegionStart);
- h.setCigar(leftAlignedCigar);
- h.setScore(path.getScore());
- returnHaplotypes.add(h);
-
- if ( debug )
- logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
-
- // for GGA mode, add the desired allele into the haplotype if it isn't already present
- if( !activeAllelesToGenotype.isEmpty() ) {
- final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
- for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
- final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
-
- // This if statement used to additionally have:
- // "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
- // but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
- // a haplotype that already contains a 1bp insertion (so practically it is reference but
- // falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
- if( vcOnHaplotype == null ) {
- for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
- addHaplotypeForGGA( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false );
- }
- }
- }
- }
- }
- }
- }
- }
-
- // add genome locs to the haplotypes
- for ( final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow);
-
- if ( returnHaplotypes.size() < returnHaplotypes.size() )
- logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
-
- if( debug ) {
- if( returnHaplotypes.size() > 1 ) {
- logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
- } else {
- logger.info("Found only the reference haplotype in the assembly graph.");
- }
- for( final Haplotype h : returnHaplotypes ) {
- logger.info( h.toString() );
- logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() );
- }
- }
-
- return returnHaplotypes;
- }
-
- /**
- * Extend partial haplotypes which are anchored in the reference to include the full active region
- * @param haplotype the haplotype to extend
- * @param activeRegionStart the place where the active region starts in the ref byte array
- * @param refWithPadding the full reference byte array with padding which encompasses the active region
- * @return a haplotype fully extended to encompass the active region
- */
- @Requires({"haplotype != null", "activeRegionStart >= 0", "refWithPadding != null", "refWithPadding.length > 0"})
- @Ensures({"result != null", "result.getCigar() != null"})
- private Haplotype extendPartialHaplotype( final Haplotype haplotype, final int activeRegionStart, final byte[] refWithPadding ) {
- final Cigar cigar = haplotype.getCigar();
- final Cigar newCigar = new Cigar();
- byte[] newHaplotypeBases = haplotype.getBases();
- int refPos = activeRegionStart;
- int hapPos = 0;
- for( int iii = 0; iii < cigar.getCigarElements().size(); iii++ ) {
- final CigarElement ce = cigar.getCigarElement(iii);
- switch (ce.getOperator()) {
- case M:
- refPos += ce.getLength();
- hapPos += ce.getLength();
- newCigar.add(ce);
- break;
- case I:
- hapPos += ce.getLength();
- newCigar.add(ce);
- break;
- case D:
- if( iii == 0 || iii == cigar.getCigarElements().size() - 1 ) {
- newHaplotypeBases = ArrayUtils.addAll( Arrays.copyOfRange(newHaplotypeBases, 0, hapPos),
- ArrayUtils.addAll(Arrays.copyOfRange(refWithPadding, refPos, refPos + ce.getLength()),
- Arrays.copyOfRange(newHaplotypeBases, hapPos, newHaplotypeBases.length)));
- hapPos += ce.getLength();
- refPos += ce.getLength();
- newCigar.add(new CigarElement(ce.getLength(), CigarOperator.M));
- } else {
- refPos += ce.getLength();
- newCigar.add(ce);
- }
- break;
- default:
- throw new IllegalStateException("Unsupported cigar operator detected: " + ce.getOperator());
- }
- }
- final Haplotype returnHaplotype = new Haplotype(newHaplotypeBases, haplotype.isReference());
- returnHaplotype.setCigar( newCigar );
- return returnHaplotype;
- }
-
- /**
- * We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal
- * @param c the cigar to test
- * @return true if we should skip over this path
- */
- @Requires("c != null")
- private boolean pathIsTooDivergentFromReference( final Cigar c ) {
- for( final CigarElement ce : c.getCigarElements() ) {
- if( ce.getOperator().equals(CigarOperator.N) ) {
- return true;
- }
- }
- return false;
- }
-
- /**
- * Left align the given cigar sequentially. This is needed because AlignmentUtils doesn't accept cigars with more than one indel in them.
- * This is a target of future work to incorporate and generalize into AlignmentUtils for use by others.
- * @param cigar the cigar to left align
- * @param refSeq the reference byte array
- * @param readSeq the read byte array
- * @param refIndex 0-based alignment start position on ref
- * @param readIndex 0-based alignment start position on read
- * @return the left-aligned cigar
- */
- @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"})
- protected Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
- final Cigar cigarToReturn = new Cigar();
- Cigar cigarToAlign = new Cigar();
- for (int i = 0; i < cigar.numCigarElements(); i++) {
- final CigarElement ce = cigar.getCigarElement(i);
- if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) {
- cigarToAlign.add(ce);
- final Cigar leftAligned = AlignmentUtils.leftAlignSingleIndel(cigarToAlign, refSeq, readSeq, refIndex, readIndex, false);
- for ( final CigarElement toAdd : leftAligned.getCigarElements() ) { cigarToReturn.add(toAdd); }
- refIndex += cigarToAlign.getReferenceLength();
- readIndex += cigarToAlign.getReadLength();
- cigarToAlign = new Cigar();
- } else {
- cigarToAlign.add(ce);
- }
- }
- if( !cigarToAlign.isEmpty() ) {
- for( final CigarElement toAdd : cigarToAlign.getCigarElements() ) {
- cigarToReturn.add(toAdd);
- }
- }
-
- final Cigar result = AlignmentUtils.consolidateCigar(cigarToReturn);
- if( result.getReferenceLength() != cigar.getReferenceLength() )
- throw new IllegalStateException("leftAlignCigarSequentially failed to produce a valid CIGAR. Reference lengths differ. Initial cigar " + cigar + " left aligned into " + result);
- return result;
- }
-
- /**
- * Take a haplotype which was generated by injecting an allele into a string of bases and run SW against the reference to determine the variants on the haplotype.
- * Unfortunately since this haplotype didn't come from the assembly graph you can't straightforwardly use the bubble traversal algorithm to get this information.
- * This is a target for future work as we rewrite the HaplotypeCaller to be more bubble-caller based.
- * @param haplotype the candidate haplotype
- * @param ref the reference bases to align against
- * @param haplotypeList the current list of haplotypes
- * @param activeRegionStart the start of the active region in the reference byte array
- * @param activeRegionStop the stop of the active region in the reference byte array
- * @param FORCE_INCLUSION_FOR_GGA_MODE if true will include in the list even if it already exists
- * @return true if the candidate haplotype was successfully incorporated into the haplotype list
- */
- @Requires({"ref != null", "ref.length >= activeRegionStop - activeRegionStart"})
- private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final List haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) {
- if( haplotype == null ) { return false; }
-
- final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS );
- haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
-
- if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
- return false;
- }
-
- haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) );
-
- final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate(haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true);
- int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true );
- if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) {
- hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal
- }
- byte[] newHaplotypeBases;
- // extend partial haplotypes to contain the full active region sequence
- if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
- newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()),
- haplotype.getBases()),
- ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
- } else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
- newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) );
- } else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
- newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
- } else {
- newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop);
- }
-
- final Haplotype h = new Haplotype( newHaplotypeBases );
- final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS );
-
- h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
- if ( haplotype.isArtificialHaplotype() ) {
- h.setArtificialEvent(haplotype.getArtificialEvent());
- }
- if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart || swConsensus2.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
- return false;
- }
-
- h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0, true) );
-
- if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) {
- haplotypeList.add(h);
- return true;
- } else {
- return false;
- }
+ @Override
+ public String toString() {
+ return "DeBruijnAssembler{" +
+ "minKmer=" + minKmer +
+ '}';
}
}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 6ea543f25..f065a0d7d 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -68,6 +68,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
@@ -135,10 +136,14 @@ import java.util.*;
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} )
@PartitionBy(PartitionType.LOCUS)
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
-@ActiveRegionTraversalParameters(extension=200, maxRegion=300)
+@ActiveRegionTraversalParameters(extension=100, maxRegion=300)
@ReadFilters({HCMappingQualityFilter.class})
@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250)
-public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible {
+public class HaplotypeCaller extends ActiveRegionWalker, Integer> implements AnnotatorCompatible, NanoSchedulable {
+ // -----------------------------------------------------------------------------------------------
+ // general haplotype caller arguments
+ // -----------------------------------------------------------------------------------------------
+
/**
* A raw, unfiltered, highly sensitive callset in VCF format.
*/
@@ -185,64 +190,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
@Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false)
public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES;
- /**
- * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
- */
- @Advanced
- @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
- public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
-
- @Hidden
- @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
- protected String keepRG = null;
-
- @Advanced
- @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
- protected int MIN_PRUNE_FACTOR = 0;
-
- @Advanced
- @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
- protected int gcpHMM = 10;
-
- @Advanced
- @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false)
- protected int maxNumHaplotypesInPopulation = 25;
-
- @Advanced
- @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false)
- protected int minKmer = 11;
-
- /**
- * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling
- * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the
- * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking
- * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads,
- * and may make use of them in assembly and calling, where possible.
- */
- @Hidden
- @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false)
- protected boolean includeUnmappedReads = false;
-
- @Advanced
- @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false)
- protected boolean USE_ALLELES_TRIGGER = false;
-
- @Advanced
- @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false)
- protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false;
-
- @Hidden
- @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
- protected boolean justDetermineActiveRegions = false;
-
- @Hidden
- @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false)
- protected boolean dontGenotype = false;
-
- @Hidden
- @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
- protected boolean errorCorrectKmers = false;
-
/**
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
* dbSNP is not used in any way for the calculations themselves.
@@ -282,10 +229,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"}));
- @Advanced
- @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false)
- protected boolean mergeVariantsViaLD = false;
-
/**
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
*/
@@ -295,13 +238,139 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
@ArgumentCollection
private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
+ // -----------------------------------------------------------------------------------------------
+ // arguments to control internal behavior of the debruijn assembler
+ // -----------------------------------------------------------------------------------------------
+
+ @Advanced
+ @Argument(fullName="useDebruijnAssembler", shortName="useDebruijnAssembler", doc="If specified, we will use the old DeBruijn assembler. Depreciated as of 2.6", required = false)
+ protected boolean useDebruijnAssembler = false;
+
+ @Advanced
+ @Argument(fullName="minKmerForDebruijnAssembler", shortName="minKmerForDebruijnAssembler", doc="Minimum kmer length to use in the debruijn assembly graph", required = false)
+ protected int minKmerForDebruijnAssembler = 11;
+
+ @Advanced
+ @Argument(fullName="onlyUseKmerSizeForDebruijnAssembler", shortName="onlyUseKmerSizeForDebruijnAssembler", doc="If specified, we will only build kmer graphs with this kmer size in the debruijn", required = false)
+ protected int onlyUseKmerSizeForDebruijnAssembler = -1;
+
+ // -----------------------------------------------------------------------------------------------
+ // arguments to control internal behavior of the read threading assembler
+ // -----------------------------------------------------------------------------------------------
+
+ @Advanced
+ @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false)
+ protected List kmerSizes = Arrays.asList(10, 25);
+
+ /**
+ * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype
+ * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the
+ * run of the haplotype caller we only take maxPathsPerSample * nSample paths from the graph, in order of their
+ * weights, no matter how many paths are possible to generate from the graph. Putting this number too low
+ * will result in dropping true variation because paths that include the real variant are not even considered.
+ */
+ @Advanced
+ @Argument(fullName="maxPathsPerSample", shortName="maxPathsPerSample", doc="Max number of paths to consider for the read threading assembler per sample.", required = false)
+ protected int maxPathsPerSample = 10;
+
+ /**
+ * The minimum number of paths to advance forward for genotyping, regardless of the
+ * number of samples
+ */
+ private final static int MIN_PATHS_PER_GRAPH = 128;
+
+ @Hidden
+ @Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false)
+ protected boolean dontRecoverDanglingTails = false;
+
+ // -----------------------------------------------------------------------------------------------
+ // general advanced arguments to control haplotype caller behavior
+ // -----------------------------------------------------------------------------------------------
+
+ @Advanced
+ @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
+ protected int MIN_PRUNE_FACTOR = 2;
+
+ @Advanced
+ @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
+ protected int gcpHMM = 10;
+
+ /**
+ * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling
+ * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the
+ * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking
+ * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads,
+ * and may make use of them in assembly and calling, where possible.
+ */
+ @Hidden
+ @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false)
+ protected boolean includeUnmappedReads = false;
+
+ @Advanced
+ @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false)
+ protected boolean USE_ALLELES_TRIGGER = false;
+
+ @Advanced
+ @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false)
+ protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false;
+
+ /**
+ * The phredScaledGlobalReadMismappingRate reflects the average global mismapping rate of all reads, regardless of their
+ * mapping quality. This term effects the probability that a read originated from the reference haplotype, regardless of
+ * its edit distance from the reference, in that the read could have originated from the reference haplotype but
+ * from another location in the genome. Suppose a read has many mismatches from the reference, say like 5, but
+ * has a very high mapping quality of 60. Without this parameter, the read would contribute 5 * Q30 evidence
+ * in favor of its 5 mismatch haplotype compared to reference, potentially enough to make a call off that single
+ * read for all of these events. With this parameter set to Q30, though, the maximum evidence against the reference
+ * that this (and any) read could contribute against reference is Q30.
+ *
+ * Set this term to any negative number to turn off the global mapping rate
+ */
+ @Advanced
+ @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false)
+ protected int phredScaledGlobalReadMismappingRate = 60;
+
+ @Advanced
+ @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false)
+ protected int maxNumHaplotypesInPopulation = 25;
+
+ @Advanced
+ @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false)
+ protected boolean mergeVariantsViaLD = false;
+
+ // -----------------------------------------------------------------------------------------------
+ // arguments for debugging / developing the haplotype caller
+ // -----------------------------------------------------------------------------------------------
+ /**
+ * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
+ */
+ @Hidden
+ @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
+ public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
+
+ @Hidden
+ @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
+ protected String keepRG = null;
+
+ @Hidden
+ @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
+ protected boolean justDetermineActiveRegions = false;
+
+ @Hidden
+ @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false)
+ protected boolean dontGenotype = false;
+
+ @Hidden
+ @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
+ protected boolean errorCorrectKmers = false;
+
@Advanced
@Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false)
protected boolean DEBUG;
- @Advanced
+ @Hidden
@Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false)
- protected int debugGraphTransformations = -1;
+ protected boolean debugGraphTransformations = false;
@Hidden // TODO -- not currently useful
@Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false)
@@ -311,10 +380,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
@Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false)
protected boolean dontTrimActiveRegions = false;
+ @Hidden
+ @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
+ protected boolean dontUseSoftClippedBases = false;
+
@Hidden
@Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false)
protected boolean allowCyclesInKmerGraphToGeneratePaths = false;
+ // -----------------------------------------------------------------------------------------------
+ // done with Haplotype caller parameters
+ // -----------------------------------------------------------------------------------------------
// the UG engines
private UnifiedGenotyperEngine UG_engine = null;
@@ -344,12 +420,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
// the maximum extent into the full active region extension that we're willing to go in genotyping our events
private final static int MAX_GENOTYPING_ACTIVE_REGION_EXTENSION = 25;
+ private ActiveRegionTrimmer trimmer = null;
+
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
// bases with quality less than or equal to this value are trimmed off the tails of the reads
private static final byte MIN_TAIL_QUALITY = 20;
+ // the minimum length of a read we'd consider using for genotyping
+ private final static int MIN_READ_LENGTH = 10;
+
private List samplesList = new ArrayList();
private final static double LOG_ONE_HALF = -Math.log10(2.0);
private final static double LOG_ONE_THIRD = -Math.log10(3.0);
@@ -373,6 +454,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
// get all of the unique sample names
Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
samplesList.addAll( samples );
+ final int nSamples = samples.size();
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
@@ -428,14 +510,36 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
}
- // setup the assembler
- assemblyEngine = new DeBruijnAssembler(DEBUG, debugGraphTransformations, minKmer, allowCyclesInKmerGraphToGeneratePaths);
+ // create and setup the assembler
+ final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
+ assemblyEngine = useDebruijnAssembler
+ ? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
+ : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes);
+
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
+ assemblyEngine.setDebug(DEBUG);
+ assemblyEngine.setDebugGraphTransformations(debugGraphTransformations);
+ assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths);
+ assemblyEngine.setRecoverDanglingTails(!dontRecoverDanglingTails);
+
if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter);
if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1);
- likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
+ // setup the likelihood calculation engine
+ if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1;
+
+ // configure the global mismapping rate
+ final double log10GlobalReadMismappingRate;
+ if ( phredScaledGlobalReadMismappingRate < 0 ) {
+ log10GlobalReadMismappingRate = - Double.MAX_VALUE;
+ } else {
+ log10GlobalReadMismappingRate = QualityUtils.qualToErrorProbLog10(phredScaledGlobalReadMismappingRate);
+ logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units");
+ }
+
+ // create our likelihood calculation engine
+ likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate );
final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes();
@@ -443,6 +547,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
if ( bamWriter != null )
haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader());
+
+ trimmer = new ActiveRegionTrimmer(DEBUG, PADDING_AROUND_SNPS_FOR_CALLING, PADDING_AROUND_OTHERS_FOR_CALLING,
+ MAX_GENOTYPING_ACTIVE_REGION_EXTENSION, getToolkit().getGenomeLocParser());
}
//---------------------------------------------------------------------------------------------------------------
@@ -538,15 +645,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
//
//---------------------------------------------------------------------------------------------------------------
+ private final static List NO_CALLS = Collections.emptyList();
@Override
- public Integer map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) {
+ public List map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) {
if ( justDetermineActiveRegions )
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
- return 1;
+ return NO_CALLS;
- if( !originalActiveRegion.isActive() ) { return 0; } // Not active so nothing to do!
+ if( !originalActiveRegion.isActive() ) { return NO_CALLS; } // Not active so nothing to do!
- final List activeAllelesToGenotype = new ArrayList();
+ final List activeAllelesToGenotype = new ArrayList<>();
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
for( final VariantContext vc : allelesToGenotype ) {
if( originalActiveRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) {
@@ -555,23 +663,23 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
}
allelesToGenotype.removeAll( activeAllelesToGenotype );
// No alleles found in this region so nothing to do!
- if ( activeAllelesToGenotype.isEmpty() ) { return 0; }
+ if ( activeAllelesToGenotype.isEmpty() ) { return NO_CALLS; }
} else {
- if( originalActiveRegion.size() == 0 ) { return 0; } // No reads here so nothing to do!
+ if( originalActiveRegion.size() == 0 ) { return NO_CALLS; } // No reads here so nothing to do!
}
// run the local assembler, getting back a collection of information on how we should proceed
final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
// abort early if something is out of the acceptable range
- if( assemblyResult.haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do!
- if (dontGenotype) return 1; // user requested we not proceed
+ if( ! assemblyResult.isVariationPresent() ) { return NO_CALLS; } // only the reference haplotype remains so nothing else to do!
+ if (dontGenotype) return NO_CALLS; // user requested we not proceed
// filter out reads from genotyping which fail mapping quality based criteria
final List filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping );
final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads );
- if( assemblyResult.regionForGenotyping.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
+ if( assemblyResult.regionForGenotyping.size() == 0 ) { return NO_CALLS; } // no reads remain after filtering so nothing else to do!
// evaluate each sample's reads against all haplotypes
//logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
@@ -590,12 +698,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
getToolkit().getGenomeLocParser(),
activeAllelesToGenotype );
- for( final VariantContext call : calledHaplotypes.getCalls() ) {
- // TODO -- uncomment this line once ART-based walkers have a proper RefMetaDataTracker.
- // annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call);
- vcfWriter.add( call );
- }
-
+ // TODO -- must disable if we are doing NCT, or set the output type to not presorted
if ( bamWriter != null ) {
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc,
bestHaplotypes,
@@ -605,7 +708,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
- return 1; // One active region was processed during this map call
+ return calledHaplotypes.getCalls();
}
private final static class AssemblyResult {
@@ -613,12 +716,18 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
final ActiveRegion regionForGenotyping;
final byte[] fullReferenceWithPadding;
final GenomeLoc paddedReferenceLoc;
+ final boolean variationPresent;
- private AssemblyResult(List haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc) {
+ private AssemblyResult(List haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc, boolean variationPresent) {
this.haplotypes = haplotypes;
this.regionForGenotyping = regionForGenotyping;
this.fullReferenceWithPadding = fullReferenceWithPadding;
this.paddedReferenceLoc = paddedReferenceLoc;
+ this.variationPresent = variationPresent;
+ }
+
+ public boolean isVariationPresent() {
+ return variationPresent && haplotypes.size() > 1;
}
}
@@ -644,63 +753,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
if ( ! dontTrimActiveRegions ) {
return trimActiveRegion(activeRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc);
} else {
- // we don't want to or cannot create a trimmed active region, so go ahead and use the old one
- return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc);
+ // we don't want to trim active regions, so go ahead and use the old one
+ return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
}
}
- /**
- * Trim down the active region to just enough to properly genotype the events among the haplotypes
- *
- * This function merely creates the region, but it doesn't populate the reads back into the region
- *
- * @param region our full active region
- * @param haplotypes the list of haplotypes we've created from assembly
- * @param ref the reference bases over the full padded location
- * @param refLoc the span of the reference bases
- * @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully
- */
- private ActiveRegion createTrimmedRegion(final ActiveRegion region, final List haplotypes, final byte[] ref, final GenomeLoc refLoc) {
- EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG);
- final TreeSet allContexts = EventMap.getAllVariantContexts(haplotypes);
- final GenomeLocParser parser = getToolkit().getGenomeLocParser();
-
- if ( allContexts.isEmpty() ) // no variants, so just return the current region
- return null;
-
- final List withinActiveRegion = new LinkedList();
- int pad = PADDING_AROUND_SNPS_FOR_CALLING;
- GenomeLoc trimLoc = null;
- for ( final VariantContext vc : allContexts ) {
- final GenomeLoc vcLoc = parser.createGenomeLoc(vc);
- if ( region.getLocation().overlapsP(vcLoc) ) {
- if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding
- pad = PADDING_AROUND_OTHERS_FOR_CALLING;
- trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc);
- withinActiveRegion.add(vc);
- }
- }
-
- // we don't actually have anything in the region after removing variants that don't overlap the region's full location
- if ( trimLoc == null ) return null;
-
- final GenomeLoc maxSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(region.getLocation(), MAX_GENOTYPING_ACTIVE_REGION_EXTENSION);
- final GenomeLoc idealSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(trimLoc, pad);
- final GenomeLoc finalSpan = maxSpan.intersect(idealSpan);
-
- final ActiveRegion trimmedRegion = region.trim(finalSpan);
- if ( DEBUG ) {
- logger.info("events : " + withinActiveRegion);
- logger.info("trimLoc : " + trimLoc);
- logger.info("pad : " + pad);
- logger.info("idealSpan : " + idealSpan);
- logger.info("maxSpan : " + maxSpan);
- logger.info("finalSpan : " + finalSpan);
- logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size());
- }
- return trimmedRegion;
- }
-
/**
* Trim down the active region to just enough to properly genotype the events among the haplotypes
*
@@ -709,17 +766,24 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
* @param fullReferenceWithPadding the reference bases over the full padded location
* @param paddedReferenceLoc the span of the reference bases
* @return an AssemblyResult containing the trimmed active region with all of the reads we should use
- * trimmed down as well, and a revised set of haplotypes. If trimming failed this function
- * may choose to use the originalActiveRegion without modification
+ * trimmed down as well, and a revised set of haplotypes. If trimming down the active region results
+ * in only the reference haplotype over the non-extended active region, returns a result with no variation present.
*/
private AssemblyResult trimActiveRegion(final ActiveRegion originalActiveRegion,
final List haplotypes,
final byte[] fullReferenceWithPadding,
final GenomeLoc paddedReferenceLoc) {
- final ActiveRegion trimmedActiveRegion = createTrimmedRegion(originalActiveRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc);
+ if ( DEBUG ) logger.info("Trimming active region " + originalActiveRegion + " with " + haplotypes.size() + " haplotypes");
- if ( trimmedActiveRegion == null )
- return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc);
+ EventMap.buildEventMapsForHaplotypes(haplotypes, fullReferenceWithPadding, paddedReferenceLoc, DEBUG);
+ final TreeSet allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypes);
+ final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalActiveRegion, allVariantsWithinFullActiveRegion);
+
+ if ( trimmedActiveRegion == null ) {
+ // no variants were found within the active region itself, so return a result flagged as having no variation present
+ if ( DEBUG ) logger.info("No variation found within the active region, skipping the region :-)");
+ return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, false);
+ }
// trim down the haplotypes
final Set haplotypeSet = new HashSet(haplotypes.size());
@@ -738,8 +802,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() );
+ if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " + trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
if ( DEBUG ) {
- logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
for ( final Haplotype remaining: trimmedHaplotypes ) {
logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar());
}
@@ -757,7 +821,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
trimmedActiveRegion.clearReads();
trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads));
- return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc);
+ return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
}
/**
@@ -787,8 +851,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
}
@Override
- public Integer reduce(Integer cur, Integer sum) {
- return cur + sum;
+ public Integer reduce(List callsInRegion, Integer numCalledRegions) {
+ for( final VariantContext call : callsInRegion ) {
+ // TODO -- uncomment this line once ART-based walkers have a proper RefMetaDataTracker.
+ // annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call);
+ vcfWriter.add( call );
+ }
+ return (callsInRegion.isEmpty() ? 0 : 1) + numCalledRegions;
}
@Override
@@ -804,7 +873,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
- final List finalizedReadList = new ArrayList();
+ final List finalizedReadList = new ArrayList<>();
final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() );
activeRegion.clearReads();
@@ -815,21 +884,23 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
}
// Loop through the reads hard clipping the adaptor and low quality tails
- final List readsToUse = new ArrayList(finalizedReadList.size());
+ final List readsToUse = new ArrayList<>(finalizedReadList.size());
for( final GATKSAMRecord myRead : finalizedReadList ) {
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
GATKSAMRecord clippedRead = useLowQualityBasesForAssembly ? postAdapterRead : ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
- // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
- // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
- // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
- // TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
- // TODO -- reference haplotype start must be removed
- clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
-
- // uncomment to remove hard clips from consideration at all
- //clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
+ if ( dontUseSoftClippedBases ) {
+ // hard clip away the soft clipped bases so they are not considered at all
+ clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
+ } else {
+ // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
+ // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
+ // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
+ // TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
+ // TODO -- reference haplotype start must be removed
+ clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
+ }
clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() );
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
@@ -843,13 +914,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
}
private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
- final List readsToRemove = new ArrayList();
+ final List readsToRemove = new ArrayList<>();
+// logger.info("Filtering non-passing regions: n incoming " + activeRegion.getReads().size());
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
- if( rec.getReadLength() < 10 || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
+ if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
readsToRemove.add(rec);
+// logger.info("\tremoving read " + rec + " len " + rec.getReadLength());
}
}
activeRegion.removeAll( readsToRemove );
+// logger.info("Filtered non-passing regions: n remaining " + activeRegion.getReads().size());
return readsToRemove;
}
@@ -864,7 +938,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
for( final String sample : samplesList) {
List readList = returnMap.get( sample );
if( readList == null ) {
- readList = new ArrayList();
+ readList = new ArrayList<>();
returnMap.put(sample, readList);
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java
index a7194f85f..aad8407dd 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java
@@ -46,9 +46,7 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
+import java.util.*;
/**
* generic utility class that counts kmers
@@ -97,6 +95,20 @@ public class KMerCounter {
return countsByKMer.values();
}
+ /**
+ * Get kmers that have minCount or greater in this counter
+ * @param minCount only return kmers with count >= this value
+ * @return a non-null collection of kmers
+ */
+ public Collection getKmersWithCountsAtLeast(final int minCount) {
+ final List result = new LinkedList();
+ for ( final CountedKmer countedKmer : getCountedKmers() ) {
+ if ( countedKmer.count >= minCount )
+ result.add(countedKmer.kmer);
+ }
+ return result;
+ }
+
/**
* Remove all current counts, resetting the counter to an empty state
*/
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java
index 9b0e1ac0a..745d4de06 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java
@@ -149,6 +149,14 @@ public class Kmer {
return bases;
}
+ /**
+ * Get a string representation of the bases of this kmer
+ * @return a non-null string
+ */
+ public String baseString() {
+ return new String(bases());
+ }
+
/**
* The length of this kmer
* @return an integer >= 0
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
index 8697833a6..d5d5f3c09 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
@@ -69,35 +69,54 @@ public class LikelihoodCalculationEngine {
private static final double LOG_ONE_HALF = -Math.log10(2.0);
private final byte constantGCP;
+ private final double log10globalReadMismappingRate;
private final boolean DEBUG;
- private final PairHMM pairHMM;
- private final int minReadLength = 20;
+ private final PairHMM.HMM_IMPLEMENTATION hmmType;
+
+ private final ThreadLocal pairHMM = new ThreadLocal() {
+ @Override
+ protected PairHMM initialValue() {
+ switch (hmmType) {
+ case EXACT: return new Log10PairHMM(true);
+ case ORIGINAL: return new Log10PairHMM(false);
+ case LOGLESS_CACHING: return new LoglessPairHMM();
+ default:
+ throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
+ }
+ }
+ };
/**
* The expected rate of random sequencing errors for a read originating from its true haplotype.
*
* For example, if this is 0.01, then we'd expect 1 error per 100 bp.
*/
- private final double EXPECTED_ERROR_RATE_PER_BASE = 0.02;
-
- public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
-
- switch (hmmType) {
- case EXACT:
- pairHMM = new Log10PairHMM(true);
- break;
- case ORIGINAL:
- pairHMM = new Log10PairHMM(false);
- break;
- case LOGLESS_CACHING:
- pairHMM = new LoglessPairHMM();
- break;
- default:
- throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
- }
+ private final static double EXPECTED_ERROR_RATE_PER_BASE = 0.02;
+ /**
+ * Create a new LikelihoodCalculationEngine using provided parameters and hmm to do its calculations
+ *
+ * @param constantGCP the gap continuation penalty to use with the PairHMM
+ * @param debug should we emit debugging information during the calculation?
+ * @param hmmType the type of the HMM to use
+ * @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of
+ * -3 means that the chance that a read doesn't actually belong at this
+ * location in the genome is 1 in 1000. The effect of this parameter is
+ * to cap the maximum likelihood difference between the reference haplotype
+ * and the best alternative haplotype by -3 log units. So if the best
+ * haplotype is at -10 and this parameter has a value of -3 then even if the
+ * reference haplotype gets a score of -100 from the pairhmm it will be
+ * assigned a likelihood of -13.
+ */
+ public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate ) {
+ this.hmmType = hmmType;
this.constantGCP = constantGCP;
- DEBUG = debug;
+ this.DEBUG = debug;
+ this.log10globalReadMismappingRate = log10globalReadMismappingRate;
+ }
+
+ public LikelihoodCalculationEngine() {
+ this((byte)10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3);
}
/**
@@ -124,7 +143,7 @@ public class LikelihoodCalculationEngine {
}
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
- pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
+ pairHMM.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
}
public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) {
@@ -132,9 +151,8 @@ public class LikelihoodCalculationEngine {
initializePairHMM(haplotypes, perSampleReadList);
// Add likelihoods for each sample's reads to our stratifiedReadMap
- final Map stratifiedReadMap = new HashMap();
+ final Map stratifiedReadMap = new LinkedHashMap<>();
for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) {
- //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
// evaluate the likelihood of the reads given those haplotypes
final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue());
@@ -152,17 +170,16 @@ public class LikelihoodCalculationEngine {
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) {
// first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time)
final int numHaplotypes = haplotypes.size();
- final Map alleleVersions = new HashMap(numHaplotypes);
+ final Map alleleVersions = new LinkedHashMap<>(numHaplotypes);
+ Allele refAllele = null;
for ( final Haplotype haplotype : haplotypes ) {
- alleleVersions.put(haplotype, Allele.create(haplotype, true));
+ final Allele allele = Allele.create(haplotype, true);
+ alleleVersions.put(haplotype, allele);
+ if ( haplotype.isReference() ) refAllele = allele;
}
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
for( final GATKSAMRecord read : reads ) {
- if ( read.getReadLength() < minReadLength )
- // don't consider any reads that have a read length < the minimum
- continue;
-
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
@@ -177,14 +194,34 @@ public class LikelihoodCalculationEngine {
readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
}
+ // keep track of the reference likelihood and the best non-ref likelihood
+ double refLog10l = Double.NEGATIVE_INFINITY;
+ double bestNonReflog10L = Double.NEGATIVE_INFINITY;
+
+ // iterate over all haplotypes, calculating the likelihood of the read for each haplotype
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj);
final boolean isFirstHaplotype = jjj == 0;
- final double log10l = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
+ final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
+ if ( haplotype.isNonReference() )
+ bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
+ else
+ refLog10l = log10l;
+
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
}
+
+ // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
+ // mismapping rate. This protects us from the case where the assembly has produced haplotypes
+ // that are very divergent from reference, but are supported by only one read. In effect
+ // we are capping how badly the reference can score for any read by the chance that the read
+ // itself just doesn't belong here
+ final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
+ if ( refLog10l < (worstRefLog10Allowed) ) {
+ perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
+ }
}
return perReadAlleleLikelihoodMap;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
index 4c0483ad6..20b005b40 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
@@ -46,28 +46,388 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
+import org.broadinstitute.sting.utils.sam.AlignmentUtils;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
+import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
+import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
+import java.io.File;
import java.io.PrintStream;
-import java.util.List;
+import java.util.*;
/**
- * Created by IntelliJ IDEA.
+ * Abstract base class for all HaplotypeCaller assemblers
+ *
* User: ebanks
* Date: Mar 14, 2011
*/
public abstract class LocalAssemblyEngine {
- public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
+ private final static Logger logger = Logger.getLogger(LocalAssemblyEngine.class);
+
+ public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
+ private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30;
+
+ protected final int numBestHaplotypesPerGraph;
+
+ protected boolean debug = false;
+ protected boolean allowCyclesInKmerGraphToGeneratePaths = false;
+ protected boolean debugGraphTransformations = false;
+ protected boolean recoverDanglingTails = true;
- protected PrintStream graphWriter = null;
protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE;
protected int pruneFactor = 2;
protected boolean errorCorrectKmers = false;
- protected LocalAssemblyEngine() { }
+ private PrintStream graphWriter = null;
+
+ /**
+ * Create a new LocalAssemblyEngine with all default parameters, ready for use
+ * @param numBestHaplotypesPerGraph the number of haplotypes to generate for each assembled graph
+ */
+ protected LocalAssemblyEngine(final int numBestHaplotypesPerGraph) {
+ if ( numBestHaplotypesPerGraph < 1 ) throw new IllegalArgumentException("numBestHaplotypesPerGraph should be >= 1 but got " + numBestHaplotypesPerGraph);
+ this.numBestHaplotypesPerGraph = numBestHaplotypesPerGraph;
+ }
+
+ /**
+ * Main subclass function: given reads and a reference haplotype give us graphs to use for constructing
+ * non-reference haplotypes.
+ *
+ * @param reads the reads we're going to assemble
+ * @param refHaplotype the reference haplotype
+ * @return a non-null list of reads
+ */
+ protected abstract List assemble(List reads, Haplotype refHaplotype);
+
+ /**
+ * Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
+ * @param activeRegion ActiveRegion object holding the reads which are to be used during assembly
+ * @param refHaplotype reference haplotype object
+ * @param fullReferenceWithPadding byte array holding the reference sequence with padding
+ * @param refLoc GenomeLoc object corresponding to the reference sequence with padding
+ * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode
+ * @return a non-empty list of all the haplotypes that are produced during assembly
+ */
+ public List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List activeAllelesToGenotype) {
+ if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); }
+ if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); }
+ if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
+ if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
+
+ // create the graphs by calling our subclass assemble method
+ final List graphs = assemble(activeRegion.getReads(), refHaplotype);
+
+ // do some QC on the graphs
+ for ( final SeqGraph graph : graphs ) { sanityCheckGraph(graph, refHaplotype); }
+
+ // print the graphs if the appropriate debug option has been turned on
+ if ( graphWriter != null ) { printGraphs(graphs); }
+
+ // find the best paths in the graphs and return them as haplotypes
+ return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
+ }
+
+ @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
+ @Ensures({"result.contains(refHaplotype)"})
+ protected List findBestPaths(final List graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow) {
+ // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
+ final Set returnHaplotypes = new LinkedHashSet();
+ refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
+ final Cigar c = new Cigar();
+ c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
+ refHaplotype.setCigar(c);
+ returnHaplotypes.add( refHaplotype );
+
+ final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
+ final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
+
+ // for GGA mode, add the desired allele into the haplotype
+ for( final VariantContext compVC : activeAllelesToGenotype ) {
+ for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
+ final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
+ addHaplotypeForGGA( insertedRefHaplotype, refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true );
+ }
+ }
+
+ for( final SeqGraph graph : graphs ) {
+ final SeqVertex source = graph.getReferenceSourceVertex();
+ final SeqVertex sink = graph.getReferenceSinkVertex();
+ if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);
+
+ final KBestPaths pathFinder = new KBestPaths(allowCyclesInKmerGraphToGeneratePaths);
+ for ( final Path path : pathFinder.getKBestPaths(graph, numBestHaplotypesPerGraph, source, sink) ) {
+// logger.info("Found path " + path);
+ Haplotype h = new Haplotype( path.getBases() );
+ if( !returnHaplotypes.contains(h) ) {
+ final Cigar cigar = path.calculateCigar(refHaplotype.getBases());
+
+ if ( cigar == null ) {
+ // couldn't produce a meaningful alignment of haplotype to reference, fail quitely
+ continue;
+ } else if( cigar.isEmpty() ) {
+ throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() +
+ " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength());
+ } else if ( pathIsTooDivergentFromReference(cigar) || cigar.getReferenceLength() < MIN_HAPLOTYPE_REFERENCE_LENGTH ) {
+ // N cigar elements means that a bubble was too divergent from the reference so skip over this path
+ continue;
+ } else if( cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // SW failure
+ throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length "
+ + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength()
+ + " ref = " + refHaplotype + " path " + new String(path.getBases()));
+ }
+
+ h.setCigar(cigar);
+ h.setAlignmentStartHapwrtRef(activeRegionStart);
+ h.setScore(path.getScore());
+ returnHaplotypes.add(h);
+
+ if ( debug )
+ logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
+
+ // for GGA mode, add the desired allele into the haplotype if it isn't already present
+ if( !activeAllelesToGenotype.isEmpty() ) {
+ final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
+ for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
+ final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
+
+ // This if statement used to additionally have:
+ // "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
+ // but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
+ // a haplotype that already contains a 1bp insertion (so practically it is reference but
+ // falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
+ if( vcOnHaplotype == null ) {
+ for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
+ addHaplotypeForGGA( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false );
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // add genome locs to the haplotypes
+ for ( final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow);
+
+ if ( returnHaplotypes.size() < returnHaplotypes.size() )
+ logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
+
+ if( debug ) {
+ if( returnHaplotypes.size() > 1 ) {
+ logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
+ } else {
+ logger.info("Found only the reference haplotype in the assembly graph.");
+ }
+ for( final Haplotype h : returnHaplotypes ) {
+ logger.info( h.toString() );
+ logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() + " ref " + h.isReference());
+ }
+ }
+
+ return new ArrayList(returnHaplotypes);
+ }
+
+ /**
+ * We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal
+ * @param c the cigar to test
+ * @return true if we should skip over this path
+ */
+ @Requires("c != null")
+ private boolean pathIsTooDivergentFromReference( final Cigar c ) {
+ for( final CigarElement ce : c.getCigarElements() ) {
+ if( ce.getOperator().equals(CigarOperator.N) ) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Take a haplotype which was generated by injecting an allele into a string of bases and run SW against the reference to determine the variants on the haplotype.
+ * Unfortunately since this haplotype didn't come from the assembly graph you can't straightforwardly use the bubble traversal algorithm to get this information.
+ * This is a target for future work as we rewrite the HaplotypeCaller to be more bubble-caller based.
+ * @param haplotype the candidate haplotype
+ * @param ref the reference bases to align against
+ * @param haplotypeList the current list of haplotypes
+ * @param activeRegionStart the start of the active region in the reference byte array
+ * @param activeRegionStop the stop of the active region in the reference byte array
+ * @param FORCE_INCLUSION_FOR_GGA_MODE if true will include in the list even if it already exists
+ * @return true if the candidate haplotype was successfully incorporated into the haplotype list
+ */
+ @Requires({"ref != null", "ref.length >= activeRegionStop - activeRegionStart"})
+ private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final Set haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) {
+ if( haplotype == null ) { return false; }
+
+ final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS );
+ haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
+
+ if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
+ return false;
+ }
+
+ haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) );
+
+ final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate(haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true);
+ int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true );
+ if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) {
+ hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal
+ }
+ byte[] newHaplotypeBases;
+ // extend partial haplotypes to contain the full active region sequence
+ if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
+ newHaplotypeBases = ArrayUtils.addAll(ArrayUtils.addAll(ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()),
+ haplotype.getBases()),
+ ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop));
+ } else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
+ newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) );
+ } else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
+ newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
+ } else {
+ newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop);
+ }
+
+ final Haplotype h = new Haplotype( newHaplotypeBases );
+ final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS );
+
+ h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
+ if ( haplotype.isArtificialHaplotype() ) {
+ h.setArtificialEvent(haplotype.getArtificialEvent());
+ }
+ if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart || swConsensus2.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
+ return false;
+ }
+
+ h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0, true) );
+
+ if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) {
+ haplotypeList.add(h);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) {
+ if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
+
+ // TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm
+ // TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect
+ // TODO -- to anything from one that's actually has good support along the chain but just happens
+ // TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately
+ // TODO -- the pruning algorithm really should be an error correction algorithm that knows more
+ // TODO -- about the structure of the data and can differentiate between an infrequent path but
+ // TODO -- without evidence against it (such as occurs when a region is hard to get any reads through)
+ // TODO -- from a error with lots of weight going along another similar path
+ // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
+ seqGraph.zipLinearChains();
+ if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor);
+
+ // now go through and prune the graph, removing vertices no longer connected to the reference chain
+ // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight
+ // edges to maintain graph connectivity.
+ seqGraph.pruneGraph(pruneFactor);
+ seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
+
+ if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor);
+ seqGraph.simplifyGraph();
+ if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor);
+
+ // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
+ // happen in cases where for example the reference somehow manages to acquire a cycle, or
+ // where the entire assembly collapses back into the reference sequence.
+ if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null )
+ return null;
+
+ seqGraph.removePathsNotConnectedToRef();
+ seqGraph.simplifyGraph();
+ if ( seqGraph.vertexSet().size() == 1 ) {
+ // we've perfectly assembled into a single reference haplotype, add a empty seq vertex to stop
+ // the code from blowing up.
+ // TODO -- ref properties should really be on the vertices, not the graph itself
+ final SeqVertex complete = seqGraph.vertexSet().iterator().next();
+ final SeqVertex dummy = new SeqVertex("");
+ seqGraph.addVertex(dummy);
+ seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0));
+ }
+ if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor);
+
+ return seqGraph;
+ }
+
+ /**
+ * Perform general QC on the graph to make sure something hasn't gone wrong during assembly
+ * @param graph the graph to check
+ * @param refHaplotype the reference haplotype
+ * @param
+ */
+ private void sanityCheckGraph(final BaseGraph graph, final Haplotype refHaplotype) {
+ sanityCheckReferenceGraph(graph, refHaplotype);
+ }
+
+ /**
+ * Make sure the reference sequence is properly represented in the provided graph
+ *
+ * @param graph the graph to check
+ * @param refHaplotype the reference haplotype
+ * @param
+ */
+ private void sanityCheckReferenceGraph(final BaseGraph graph, final Haplotype refHaplotype) {
+ if( graph.getReferenceSourceVertex() == null ) {
+ throw new IllegalStateException("All reference graphs must have a reference source vertex.");
+ }
+ if( graph.getReferenceSinkVertex() == null ) {
+ throw new IllegalStateException("All reference graphs must have a reference sink vertex.");
+ }
+ if( !Arrays.equals(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true), refHaplotype.getBases()) ) {
+ throw new IllegalStateException("Mismatch between the reference haplotype and the reference assembly graph path. for graph " + graph +
+ " graph = " + new String(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true)) +
+ " haplotype = " + new String(refHaplotype.getBases())
+ );
+ }
+ }
+
+ /**
+ * Print the generated graphs to the graphWriter
+ * @param graphs a non-null list of graphs to print out
+ */
+ private void printGraphs(final List graphs) {
+ final int writeFirstGraphWithSizeSmallerThan = 50;
+
+ graphWriter.println("digraph assemblyGraphs {");
+ for( final SeqGraph graph : graphs ) {
+ if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) {
+ logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize());
+ continue;
+ }
+
+ graph.printGraph(graphWriter, false, pruneFactor);
+
+ if ( debugGraphTransformations )
+ break;
+ }
+
+ graphWriter.println("}");
+ }
+
+ // -----------------------------------------------------------------------------------------------
+ //
+ // getter / setter routines for generic assembler properties
+ //
+ // -----------------------------------------------------------------------------------------------
public int getPruneFactor() {
return pruneFactor;
@@ -85,10 +445,6 @@ public abstract class LocalAssemblyEngine {
this.errorCorrectKmers = errorCorrectKmers;
}
- public PrintStream getGraphWriter() {
- return graphWriter;
- }
-
public void setGraphWriter(PrintStream graphWriter) {
this.graphWriter = graphWriter;
}
@@ -101,5 +457,35 @@ public abstract class LocalAssemblyEngine {
this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
}
- public abstract List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List activeAllelesToGenotype);
+ public boolean isDebug() {
+ return debug;
+ }
+
+ public void setDebug(boolean debug) {
+ this.debug = debug;
+ }
+
+ public boolean isAllowCyclesInKmerGraphToGeneratePaths() {
+ return allowCyclesInKmerGraphToGeneratePaths;
+ }
+
+ public void setAllowCyclesInKmerGraphToGeneratePaths(boolean allowCyclesInKmerGraphToGeneratePaths) {
+ this.allowCyclesInKmerGraphToGeneratePaths = allowCyclesInKmerGraphToGeneratePaths;
+ }
+
+ public boolean isDebugGraphTransformations() {
+ return debugGraphTransformations;
+ }
+
+ public void setDebugGraphTransformations(boolean debugGraphTransformations) {
+ this.debugGraphTransformations = debugGraphTransformations;
+ }
+
+ public boolean isRecoverDanglingTails() {
+ return recoverDanglingTails;
+ }
+
+ public void setRecoverDanglingTails(boolean recoverDanglingTails) {
+ this.recoverDanglingTails = recoverDanglingTails;
+ }
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java
index be5a431c4..a6ef0d1c2 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java
@@ -76,12 +76,10 @@ public class BaseEdge {
}
/**
- * Copy constructor
- *
- * @param toCopy
+ * Create a new copy of this BaseEdge
*/
- public BaseEdge(final BaseEdge toCopy) {
- this(toCopy.isRef(), toCopy.getMultiplicity());
+ public BaseEdge copy() {
+ return new BaseEdge(isRef(), getMultiplicity());
}
/**
@@ -92,6 +90,34 @@ public class BaseEdge {
return multiplicity;
}
+ /**
+ * Get the DOT format label for this edge, to be displayed when printing this edge to a DOT file
+ * @return a non-null string
+ */
+ public String getDotLabel() {
+ return Integer.toString(getMultiplicity());
+ }
+
+ /**
+ * Increase the multiplicity of this edge by incr
+ * @param incr the change in this multiplicity, must be >= 0
+ */
+ public void incMultiplicity(final int incr) {
+ if ( incr < 0 ) throw new IllegalArgumentException("incr must be >= 0 but got " + incr);
+ multiplicity += incr;
+ }
+
+ /**
+ * A special assessor that returns the multiplicity that should be used by pruning algorithm
+ *
+ * Can be overloaded by subclasses
+ *
+ * @return the multiplicity value that should be used for pruning
+ */
+ public int getPruningMultiplicity() {
+ return getMultiplicity();
+ }
+
/**
* Set the multiplicity of this edge to value
* @param value an integer >= 0
@@ -117,23 +143,6 @@ public class BaseEdge {
this.isRef = isRef;
}
- /**
- * Does this and edge have the same source and target vertices in graph?
- *
- * @param graph the graph containing both this and edge
- * @param edge our comparator edge
- * @param
- * @return true if we have the same source and target vertices
- */
- public boolean hasSameSourceAndTarget(final BaseGraph graph, final BaseEdge edge) {
- return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge)));
- }
-
- // For use when comparing edges across graphs!
- public boolean seqEquals( final BaseGraph graph, final BaseEdge edge, final BaseGraph graph2 ) {
- return (graph.getEdgeSource(this).seqEquals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).seqEquals(graph2.getEdgeTarget(edge)));
- }
-
/**
* Sorts a collection of BaseEdges in decreasing order of weight, so that the most
* heavily weighted is at the start of the list
@@ -187,4 +196,12 @@ public class BaseEdge {
if ( edge == null ) throw new IllegalArgumentException("edge cannot be null");
return new BaseEdge(isRef() || edge.isRef(), Math.max(getMultiplicity(), edge.getMultiplicity()));
}
+
+ @Override
+ public String toString() {
+ return "BaseEdge{" +
+ "multiplicity=" + multiplicity +
+ ", isRef=" + isRef +
+ '}';
+ }
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
index 7ce57e2e7..8938af7c2 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
@@ -66,34 +66,16 @@ import java.util.*;
* Date: 2/6/13
*/
@Invariant("!this.isAllowingMultipleEdges()")
-public class BaseGraph extends DefaultDirectedGraph {
+public class BaseGraph extends DefaultDirectedGraph {
protected final static Logger logger = Logger.getLogger(BaseGraph.class);
private final int kmerSize;
- /**
- * Construct an empty BaseGraph
- */
- public BaseGraph() {
- this(11);
- }
-
- /**
- * Edge factory that creates non-reference multiplicity 1 edges
- * @param the new of our vertices
- */
- private static class MyEdgeFactory implements EdgeFactory {
- @Override
- public BaseEdge createEdge(T sourceVertex, T targetVertex) {
- return new BaseEdge(false, 1);
- }
- }
-
/**
* Construct a DeBruijnGraph with kmerSize
* @param kmerSize
*/
- public BaseGraph(final int kmerSize) {
- super(new MyEdgeFactory());
+ public BaseGraph(final int kmerSize, final EdgeFactory edgeFactory) {
+ super(edgeFactory);
if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize);
this.kmerSize = kmerSize;
@@ -111,7 +93,7 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph getSources() {
- final Set set = new LinkedHashSet();
- for ( final T v : vertexSet() )
+ public Set getSources() {
+ final Set set = new LinkedHashSet();
+ for ( final V v : vertexSet() )
if ( isSource(v) )
set.add(v);
return set;
@@ -153,9 +135,9 @@ public class BaseGraph extends DefaultDirectedGraph getSinks() {
- final Set set = new LinkedHashSet();
- for ( final T v : vertexSet() )
+ public Set getSinks() {
+ final Set set = new LinkedHashSet();
+ for ( final V v : vertexSet() )
if ( isSink(v) )
set.add(v);
return set;
@@ -167,7 +149,7 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph vertices) {
- for ( final T v : vertices )
+ public void addVertices(final Collection vertices) {
+ for ( final V v : vertices )
addVertex(v);
}
@@ -349,8 +341,12 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph extends DefaultDirectedGraph outgoingVerticesOf(final T v) {
- final Set s = new LinkedHashSet();
- for ( final BaseEdge e : outgoingEdgesOf(v) ) {
+ public Set outgoingVerticesOf(final V v) {
+ final Set s = new LinkedHashSet();
+ for ( final E e : outgoingEdgesOf(v) ) {
s.add(getEdgeTarget(e));
}
return s;
@@ -384,9 +380,9 @@ public class BaseGraph extends DefaultDirectedGraph v
*/
- public Set incomingVerticesOf(final T v) {
- final Set s = new LinkedHashSet();
- for ( final BaseEdge e : incomingEdgesOf(v) ) {
+ public Set incomingVerticesOf(final V v) {
+ final Set s = new LinkedHashSet();
+ for ( final E e : incomingEdgesOf(v) ) {
s.add(getEdgeSource(e));
}
return s;
@@ -413,15 +409,16 @@ public class BaseGraph extends DefaultDirectedGraph " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];");
+ for( final E edge : edgeSet() ) {
+ graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getDotLabel() + "\"];");
if( edge.isRef() ) {
graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];");
}
}
- for( final T v : vertexSet() ) {
- graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + "\",shape=box]");
+ for( final V v : vertexSet() ) {
+// graphWriter.println("\t" + v.toString() + " [label=\"" + v + "\",shape=box]");
+ graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + v.additionalInfo() + "\",shape=box]");
}
if ( writeHeader )
@@ -439,10 +436,10 @@ public class BaseGraph extends DefaultDirectedGraph edgesToCheck = new HashSet();
+ final Set edgesToCheck = new HashSet();
edgesToCheck.addAll(incomingEdgesOf(getReferenceSourceVertex()));
while( !edgesToCheck.isEmpty() ) {
- final BaseEdge e = edgesToCheck.iterator().next();
+ final E e = edgesToCheck.iterator().next();
if( !e.isRef() ) {
edgesToCheck.addAll( incomingEdgesOf(getEdgeSource(e)) );
removeEdge(e);
@@ -452,7 +449,7 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph edgesToRemove = new ArrayList();
- for( final BaseEdge e : edgeSet() ) {
- if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
+ final List edgesToRemove = new ArrayList<>();
+ for( final E e : edgeSet() ) {
+ if( e.getPruningMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
edgesToRemove.add(e);
}
}
@@ -480,13 +477,25 @@ public class BaseGraph extends DefaultDirectedGraph pruner = new LowWeightChainPruner<>(pruneFactor);
+ pruner.pruneLowWeightChains(this);
+ }
+
/**
* Remove all vertices in the graph that have in and out degree of 0
*/
protected void removeSingletonOrphanVertices() {
// Run through the graph and clean up singular orphaned nodes
- final List verticesToRemove = new LinkedList();
- for( final T v : vertexSet() ) {
+ final List verticesToRemove = new LinkedList<>();
+ for( final V v : vertexSet() ) {
if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) {
verticesToRemove.add(v);
}
@@ -499,11 +508,11 @@ public class BaseGraph extends DefaultDirectedGraph toRemove = new HashSet(vertexSet());
+ final HashSet toRemove = new HashSet<>(vertexSet());
- final T refV = getReferenceSourceVertex();
+ final V refV = getReferenceSourceVertex();
if ( refV != null ) {
- for ( final T v : new BaseGraphIterator(this, refV, true, true) ) {
+ for ( final V v : new BaseGraphIterator<>(this, refV, true, true) ) {
toRemove.remove(v);
}
}
@@ -524,22 +533,31 @@ public class BaseGraph extends DefaultDirectedGraph onPathFromRefSource = new HashSet(vertexSet().size());
- for ( final T v : new BaseGraphIterator(this, getReferenceSourceVertex(), false, true) ) {
+ final Set onPathFromRefSource = new HashSet<>(vertexSet().size());
+ for ( final V v : new BaseGraphIterator<>(this, getReferenceSourceVertex(), false, true) ) {
onPathFromRefSource.add(v);
}
// get the set of vertices we can reach by going backward from the ref sink
- final Set onPathFromRefSink = new HashSet(vertexSet().size());
- for ( final T v : new BaseGraphIterator(this, getReferenceSinkVertex(), true, false) ) {
+ final Set onPathFromRefSink = new HashSet<>(vertexSet().size());
+ for ( final V v : new BaseGraphIterator<>(this, getReferenceSinkVertex(), true, false) ) {
onPathFromRefSink.add(v);
}
// we want to remove anything that's not in both the sink and source sets
- final Set verticesToRemove = new HashSet(vertexSet());
+ final Set verticesToRemove = new HashSet<>(vertexSet());
onPathFromRefSource.retainAll(onPathFromRefSink);
verticesToRemove.removeAll(onPathFromRefSource);
removeAllVertices(verticesToRemove);
+
+ // simple sanity checks that this algorithm is working.
+ if ( getSinks().size() > 1 ) {
+ throw new IllegalStateException("Should have eliminated all but the reference sink, but found " + getSinks());
+ }
+
+ if ( getSources().size() > 1 ) {
+ throw new IllegalStateException("Should have eliminated all but the reference source, but found " + getSources());
+ }
}
/**
@@ -555,11 +573,11 @@ public class BaseGraph extends DefaultDirectedGraph the type of the nodes in those graphs
* @return true if g1 and g2 are equals
*/
- public static boolean graphEquals(final BaseGraph g1, BaseGraph g2) {
+ public static boolean graphEquals(final BaseGraph g1, BaseGraph g2) {
final Set vertices1 = g1.vertexSet();
final Set vertices2 = g2.vertexSet();
- final Set edges1 = g1.edgeSet();
- final Set edges2 = g2.edgeSet();
+ final Set edges1 = g1.edgeSet();
+ final Set edges2 = g2.edgeSet();
if ( vertices1.size() != vertices2.size() || edges1.size() != edges2.size() )
return false;
@@ -571,29 +589,35 @@ public class BaseGraph extends DefaultDirectedGraph graph2 ) {
+ return (this.getEdgeSource(edge1).seqEquals(graph2.getEdgeSource(edge2))) && (this.getEdgeTarget(edge1).seqEquals(graph2.getEdgeTarget(edge2)));
+ }
+
+
/**
* Get the incoming edge of v. Requires that there be only one such edge or throws an error
* @param v our vertex
* @return the single incoming edge to v, or null if none exists
*/
- public BaseEdge incomingEdgeOf(final T v) {
+ public E incomingEdgeOf(final V v) {
return getSingletonEdge(incomingEdgesOf(v));
}
@@ -602,7 +626,7 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph edges) {
+ private E getSingletonEdge(final Collection edges) {
if ( edges.size() > 1 ) throw new IllegalArgumentException("Cannot get a single incoming edge for a vertex with multiple incoming edges " + edges);
return edges.isEmpty() ? null : edges.iterator().next();
}
@@ -625,12 +649,19 @@ public class BaseGraph extends DefaultDirectedGraph implements Iterator, Iterable {
+public class BaseGraphIterator implements Iterator, Iterable {
final HashSet