diff --git a/build.xml b/build.xml
index 2e9df4d5e..16db1cec1 100644
--- a/build.xml
+++ b/build.xml
@@ -887,6 +887,27 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -910,7 +931,7 @@
-
+
diff --git a/licensing/GATK1_LICENSE b/licensing/GATK1_LICENSE
deleted file mode 100644
index 648ec8fc3..000000000
--- a/licensing/GATK1_LICENSE
+++ /dev/null
@@ -1,22 +0,0 @@
-Copyright (c) 2012 The Broad Institute
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
-THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/licensing/GATK2_beta_license.doc b/licensing/GATK2_beta_license.doc
deleted file mode 100644
index 6c12bfe30..000000000
Binary files a/licensing/GATK2_beta_license.doc and /dev/null differ
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
index 5016526c0..c324488c9 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
@@ -54,8 +54,6 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
-import java.io.PrintStream;
-import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -85,9 +83,6 @@ public class StandardCallerArgumentCollection {
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
- @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
- public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
-
/**
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
* confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
@@ -150,7 +145,7 @@ public class StandardCallerArgumentCollection {
*/
@Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION;
- public static final double DEFAULT_CONTAMINATION_FRACTION = 0.05;
+ public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0;
/**
* This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples.
@@ -199,7 +194,6 @@ public class StandardCallerArgumentCollection {
this.heterozygosity = SCAC.heterozygosity;
this.INDEL_HETEROZYGOSITY = SCAC.INDEL_HETEROZYGOSITY;
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
- this.OutputMode = SCAC.OutputMode;
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
index e636f8f17..31fe7e380 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
@@ -320,18 +320,17 @@ public class ReduceReads extends ReadWalker, Redu
if (toolkit.getIntervals() != null)
intervalList.addAll(toolkit.getIntervals());
- final boolean preSorted = true;
final boolean indexOnTheFly = true;
final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
if (nwayout) {
SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME);
- writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, preSorted, indexOnTheFly, NO_PG_TAG, programRecord, true);
+ writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, false, indexOnTheFly, NO_PG_TAG, programRecord, true);
}
else {
writerToUse = out;
out.setPresorted(false);
if (!NO_PG_TAG) {
- Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME);
+ Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME);
}
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
index 93df9f091..f3b26f295 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
@@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
+import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.genotyper.DiploidGenotype;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@@ -94,7 +95,7 @@ import static java.lang.Math.pow;
*/
public class DiploidSNPGenotypeLikelihoods implements Cloneable {
- public final static double DEFAULT_PCR_ERROR_RATE = 1e-4;
+ public final static double DEFAULT_PCR_ERROR_RATE = FragmentUtils.DEFAULT_PCR_ERROR_RATE;
protected final static int FIXED_PLOIDY = 2;
protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
index f156468cc..4fae3d6e3 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@@ -214,6 +214,9 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
+ @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
+ public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
+
/**
* Create a new UAC with defaults for all UAC arguments
*/
@@ -262,6 +265,8 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES;
this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO;
this.pairHMM = uac.pairHMM;
+ this.OutputMode = uac.OutputMode;
+
this.annotateAllSitesWithPLs = uac.annotateAllSitesWithPLs;
// todo- arguments to remove
this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java
similarity index 67%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java
index 0f66082c6..f07dbb392 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java
@@ -46,105 +46,44 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
-import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
/**
- * Fast approach to building a DeBruijnGraph
- *
- * Follows the model:
- *
- * for each X that has bases for the final graph:
- * addKmer pair (single kmer with kmer size + 1 spanning the pair)
- *
- * flushKmersToGraph
+ * Result of assembling, with the resulting graph and status
*
* User: depristo
- * Date: 4/7/13
- * Time: 4:14 PM
+ * Date: 7/1/13
+ * Time: 5:35 PM
*/
-public class DeBruijnGraphBuilder {
- /** The size of the kmer graph we want to build */
- private final int kmerSize;
-
- /** The graph we're going to add kmers to */
- private final DeBruijnGraph graph;
-
- /** keeps counts of all kmer pairs added since the last flush */
- private final KMerCounter counter;
+public class AssemblyResult {
+ private final Status status;
+ private final SeqGraph graph;
/**
- * Create a new builder that will write out kmers to graph
- *
- * @param graph a non-null graph that can contain already added kmers
+ * Create a new assembly result
+ * @param status the status, cannot be null
+ * @param graph the resulting graph of the assembly, can only be null if result is failed
*/
- public DeBruijnGraphBuilder(final DeBruijnGraph graph) {
- if ( graph == null ) throw new IllegalArgumentException("Graph cannot be null");
- this.kmerSize = graph.getKmerSize();
+ public AssemblyResult(final Status status, final SeqGraph graph) {
+ if ( status == null ) throw new IllegalArgumentException("status cannot be null");
+ if ( status != Status.FAILED && graph == null ) throw new IllegalArgumentException("graph is null but status is " + status);
+
+ this.status = status;
this.graph = graph;
- this.counter = new KMerCounter(kmerSize + 1);
}
- /**
- * The graph we're building
- * @return a non-null graph
- */
- public DeBruijnGraph getGraph() {
- return graph;
- }
+ public Status getStatus() { return status; }
+ public SeqGraph getGraph() { return graph; }
/**
- * The kmer size of our graph
- * @return positive integer
+ * Status of the assembly result
*/
- public int getKmerSize() {
- return kmerSize;
- }
-
- /**
- * Higher-level interface to #addKmersToGraph that adds a pair of kmers from a larger sequence of bytes to this
- * graph. The kmers start at start (first) and start + 1 (second) have have length getKmerSize(). The
- * edge between them is added with isRef and multiplicity
- *
- * @param sequence a sequence of bases from which we want to extract a pair of kmers
- * @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 2 - getKmerSize()
- * @param multiplicity what's the multiplicity of the edge between these two kmers
- */
- public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final int multiplicity ) {
- if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null");
- if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start);
- if ( start + 1 + getKmerSize() > sequence.length ) throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length);
- final Kmer kmerPair = new Kmer(sequence, start, getKmerSize() + 1);
- addKmerPair(kmerPair, multiplicity);
- }
-
- /**
- * Add a single kmer pair to this builder
- * @param kmerPair a kmer pair is a single kmer that has kmerSize + 1 bp, where 0 -> kmersize and 1 -> kmersize + 1
- * will have an edge added to this
- * @param multiplicity the desired multiplicity of this edge
- */
- public void addKmerPair(final Kmer kmerPair, final int multiplicity) {
- if ( kmerPair.length() != kmerSize + 1 ) throw new IllegalArgumentException("kmer pair must be of length kmerSize + 1 = " + kmerSize + 1 + " but got " + kmerPair.length());
- counter.addKmer(kmerPair, multiplicity);
- }
-
- /**
- * Flushes the currently added kmers to the graph
- *
- * After this function is called the builder is reset to an empty state
- *
- * This flushing is expensive, so many kmers should be added to the builder before flushing. The most
- * efficient workflow is to add all of the kmers of a particular class (all ref bases, or all read bases)
- * then and do one flush when completed
- *
- * @param addRefEdges should the kmers present in the builder be added to the graph with isRef = true for the edges?
- */
- public void flushKmersToGraph(final boolean addRefEdges) {
- for ( final KMerCounter.CountedKmer countedKmer : counter.getCountedKmers() ) {
- final byte[] first = countedKmer.getKmer().subKmer(0, kmerSize).bases();
- final byte[] second = countedKmer.getKmer().subKmer(1, kmerSize).bases();
- graph.addKmersToGraph(first, second, addRefEdges, countedKmer.getCount());
- }
- counter.clear();
+ public enum Status {
+ /** Something went wrong, and we couldn't produce a meaningful graph */
+ FAILED,
+ /** Assembly succeeded, but graph degenerated into just the reference sequence */
+ JUST_ASSEMBLED_REFERENCE,
+ /** Assembly succeeded, and the graph has some meaningful structure */
+ ASSEMBLED_SOME_VARIATION
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
deleted file mode 100644
index d876a403b..000000000
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012 Broad Institute, Inc.
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 4. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 5. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 6. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 7. MISCELLANEOUS
-* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
-*/
-
-package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
-
-import com.google.java.contract.Requires;
-import org.apache.log4j.Logger;
-import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
-import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
-import org.broadinstitute.sting.utils.MathUtils;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.haplotype.Haplotype;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-import org.broadinstitute.sting.utils.sam.ReadUtils;
-
-import java.io.File;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.LinkedList;
-import java.util.List;
-
-/**
- * DeBruijn assembler for the HaplotypeCaller
- *
- * User: ebanks, rpoplin
- * Date: Mar 14, 2011
- */
-public class DeBruijnAssembler extends LocalAssemblyEngine {
- private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class);
-
- // TODO -- this number is very low, and limits our ability to explore low-frequency variants. It should
- // TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where
- // TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases
- private final static int NUM_PATHS_PER_GRAPH = 25;
- private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
- private static final int GRAPH_KMER_STEP = 6;
- private static final int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
-
- private final int minKmer;
- private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
-
- protected DeBruijnAssembler() {
- this(25, -1);
- }
-
- public DeBruijnAssembler(final int minKmer, final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) {
- super(NUM_PATHS_PER_GRAPH);
- this.minKmer = minKmer;
- this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
- }
-
- @Override
- protected List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes ) {
- final List graphs = new LinkedList<>();
-
- final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
- if( maxKmer < minKmer) {
- // Reads are too small for assembly so don't try to create any assembly graphs
- return Collections.emptyList();
- }
- // create the graph for each possible kmer
- for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) {
- if ( debugGraphTransformations && kmer > onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms)
- continue;
-
- if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
- DeBruijnGraph graph = createGraphFromSequences(reads, kmer, refHaplotype, activeAlleleHaplotypes);
- if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
- // do a series of steps to clean up the raw assembly graph to make it analysis-ready
- if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor);
-
- if ( shouldErrorCorrectKmers() ) {
- throw new UserException("Error correction no longer supported because of the " +
- "incredibly naive way this was implemented. The command line argument remains because some" +
- " future subsystem will actually go and error correct the reads");
- }
-
- final SeqGraph seqGraph = cleanupSeqGraph(graph.convertToSequenceGraph());
-
- if ( seqGraph != null ) { // if the graph contains interesting variation from the reference
- graphs.add(seqGraph);
-
- if ( debugGraphTransformations ) // we only want to use one graph size
- break;
- }
- }
-
- }
-
- return graphs;
- }
-
- @Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"})
- protected DeBruijnGraph createGraphFromSequences( final List reads, final int kmerLength, final Haplotype refHaplotype, final List activeAlleleHaplotypes ) {
- final DeBruijnGraph graph = new DeBruijnGraph(kmerLength);
- final DeBruijnGraphBuilder builder = new DeBruijnGraphBuilder(graph);
-
- // First pull kmers from the reference haplotype and add them to the graph
- if ( ! addReferenceKmersToGraph(builder, refHaplotype.getBases()) )
- // something went wrong, so abort right now with a null graph
- return null;
-
- // add the artificial GGA haplotypes to the graph
- if ( ! addGGAKmersToGraph(builder, activeAlleleHaplotypes) )
- // something went wrong, so abort right now with a null graph
- return null;
-
- // now go through the graph already seeded with the reference sequence and add the read kmers to it
- if ( ! addReadKmersToGraph(builder, reads) )
- // some problem was detected adding the reads to the graph, return null to indicate we failed
- return null;
-
- graph.cleanNonRefPaths();
- return graph;
- }
-
- /**
- * Add the high-quality kmers from the artificial GGA haplotypes to the graph
- *
- * @param builder a debruijn graph builder to add the read kmers to
- * @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode
- * @return true if we successfully added the read kmers to the graph without corrupting it in some way
- */
- protected boolean addGGAKmersToGraph(final DeBruijnGraphBuilder builder, final List activeAlleleHaplotypes) {
-
- final int kmerLength = builder.getKmerSize();
-
- for( final Haplotype haplotype : activeAlleleHaplotypes ) {
- final int end = haplotype.length() - kmerLength;
- for( int start = 0; start < end; start++ ) {
- builder.addKmerPairFromSeqToGraph( haplotype.getBases(), start, GGA_MODE_ARTIFICIAL_COUNTS );
- }
- }
-
- // always returns true now, but it's possible that we'd add kmers and decide we don't like the graph in some way
- return true;
- }
-
- /**
- * Add the high-quality kmers from the reads to the graph
- *
- * @param builder a debruijn graph builder to add the read kmers to
- * @param reads a non-null list of reads whose kmers we want to add to the graph
- * @return true if we successfully added the read kmers to the graph without corrupting it in some way
- */
- protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List reads) {
- final int kmerLength = builder.getKmerSize();
-
- // Next pull kmers out of every read and throw them on the graph
- for( final GATKSAMRecord read : reads ) {
- final byte[] sequence = read.getReadBases();
- final byte[] qualities = read.getBaseQualities();
- final int[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced
- if ( sequence.length > kmerLength + KMER_OVERLAP ) {
- int lastGood = -1; // the index of the last good base we've seen
- for( int end = 0; end < sequence.length; end++ ) {
- if ( qualities[end] < minBaseQualityToUseInAssembly ) {
- lastGood = -1; // reset the last good base
- } else if ( lastGood == -1 ) {
- lastGood = end; // we're at a good base, the last good one is us
- } else if ( end - kmerLength >= lastGood ) {
- // end - kmerLength (the start) is after the lastGood base, so that kmer is good
- final int start = end - kmerLength;
- // how many observations of this kmer have we seen? A normal read counts for 1, but
- // a reduced read might imply a higher multiplicity for our the edge
- int countNumber = 1;
- if ( read.isReducedRead() ) {
- // compute mean number of reduced read counts in current kmer span
- // precise rounding can make a difference with low consensus counts
- // TODO -- optimization: should extend arrayMax function to take start stop values
- countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, start, end));
- }
-
- builder.addKmerPairFromSeqToGraph(sequence, start, countNumber);
- }
- }
- }
- }
-
- builder.flushKmersToGraph(false);
-
- // always returns true now, but it's possible that we'd add reads and decide we don't like the graph in some way
- return true;
- }
-
- /**
- * Add the kmers from the reference sequence to the DeBruijnGraph
- *
- * @param builder the graph to add the reference kmers to. Must be empty
- * @param refSequence the reference sequence from which we'll get our kmers
- * @return true if we succeeded in creating a good graph from the reference sequence, false otherwise
- */
- protected boolean addReferenceKmersToGraph(final DeBruijnGraphBuilder builder, final byte[] refSequence) {
- if ( builder == null ) throw new IllegalArgumentException("graph cannot be null");
- if ( builder.getGraph().vertexSet().size() != 0 )
- throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + builder.getGraph().vertexSet().size() + " vertices in it already: " + builder.getGraph());
- if ( refSequence == null ) throw new IllegalArgumentException("refSequence cannot be null");
-
- final int kmerLength = builder.getKmerSize();
- if( refSequence.length < kmerLength + KMER_OVERLAP ) {
- // not enough reference sequence to build a kmer graph of this length, return null
- return false;
- }
-
- final int kmersInSequence = refSequence.length - kmerLength + 1;
- for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
- builder.addKmerPairFromSeqToGraph(refSequence, iii, 1);
- }
- builder.flushKmersToGraph(true);
-
- // we expect that every kmer in the sequence is unique, so that the graph has exactly kmersInSequence vertices
- if ( builder.getGraph().vertexSet().size() != kmersInSequence ) {
- if( debug ) logger.info("Cycle detected in reference graph for kmer = " + kmerLength + " ...skipping");
- return false;
- }
-
- return true;
- }
-
- @Override
- public String toString() {
- return "DeBruijnAssembler{" +
- "minKmer=" + minKmer +
- '}';
- }
-}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
index 04173b64f..82029b872 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
@@ -56,6 +56,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.EventMap;
@@ -166,6 +167,8 @@ public class GenotypingEngine {
// Walk along each position in the key set and create each event to be outputted
final Set calledHaplotypes = new HashSet<>();
final List returnCalls = new ArrayList<>();
+ final Map emptyDownSamplingMap = new DefaultHashMap<>(0.0);
+
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
@@ -197,13 +200,13 @@ public class GenotypingEngine {
logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
}
- final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION );
+ final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().getSampleContamination() );
final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC );
final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL);
if( call != null ) {
final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap :
- convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) );
+ convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, emptyDownSamplingMap ) );
final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call);
@@ -406,7 +409,7 @@ public class GenotypingEngine {
// BUGBUG: ugh, too complicated
protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap,
final Map> alleleMapper,
- final double downsamplingFraction ) {
+ final Map perSampleDownsamplingFraction ) {
final Map alleleReadMap = new LinkedHashMap<>();
for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
@@ -423,7 +426,7 @@ public class GenotypingEngine {
perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
}
}
- perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); // perform contamination downsampling
+ perReadAlleleLikelihoodMap.performPerAlleleDownsampling(perSampleDownsamplingFraction.get(haplotypeReadMapEntry.getKey())); // perform contamination downsampling
alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 87f1ae75c..2d492741b 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -47,9 +47,6 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
-import net.sf.samtools.Cigar;
-import net.sf.samtools.CigarElement;
-import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileWriter;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
@@ -58,9 +55,10 @@ import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils;
-import org.broadinstitute.sting.gatk.filters.*;
+import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -73,21 +71,25 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
-import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileState;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
+import org.broadinstitute.sting.utils.fragments.FragmentCollection;
+import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.gvcf.GVCFWriter;
import org.broadinstitute.sting.utils.haplotype.*;
import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
-import org.broadinstitute.sting.utils.pileup.PileupElement;
-import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
@@ -240,22 +242,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
@ArgumentCollection
private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
- // -----------------------------------------------------------------------------------------------
- // arguments to control internal behavior of the debruijn assembler
- // -----------------------------------------------------------------------------------------------
-
- @Advanced
- @Argument(fullName="useDebruijnAssembler", shortName="useDebruijnAssembler", doc="If specified, we will use the old DeBruijn assembler. Depreciated as of 2.6", required = false)
- protected boolean useDebruijnAssembler = false;
-
- @Advanced
- @Argument(fullName="minKmerForDebruijnAssembler", shortName="minKmerForDebruijnAssembler", doc="Minimum kmer length to use in the debruijn assembly graph", required = false)
- protected int minKmerForDebruijnAssembler = 11;
-
- @Advanced
- @Argument(fullName="onlyUseKmerSizeForDebruijnAssembler", shortName="onlyUseKmerSizeForDebruijnAssembler", doc="If specified, we will only build kmer graphs with this kmer size in the debruijn", required = false)
- protected int onlyUseKmerSizeForDebruijnAssembler = -1;
-
// -----------------------------------------------------------------------------------------------
// arguments to control internal behavior of the read threading assembler
// -----------------------------------------------------------------------------------------------
@@ -298,7 +284,62 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// -----------------------------------------------------------------------------------------------
@Advanced
- @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
+ @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Emit experimental reference confidence scores", required = false)
+ protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE;
+
+ public enum ReferenceConfidenceMode {
+ NONE,
+ BP_RESOLUTION,
+ GVCF
+ }
+
+ /**
+ * The GQ partition intervals
+ *
+ * Should be a non-empty list of boundaries. For example, suppose this variable is
+ *
+ * [A, B, C]
+ *
+ * We would partition our hom-ref sites into the following bands:
+ *
+ * X < A
+ * A <= X < B
+ * B <= X < C
+ * X >= C
+ *
+ * The default bands give the following GQ blocks:
+ *
+ * [0, 0]
+ * (0, 10]
+ * (10, 20]
+ * (20, 30]
+ * (30, 40]
+ * (40, 50]
+ * (50, 99]
+ *
+ * Note that in the GATK GQ values are capped at 99.
+ */
+ @Advanced
+ @Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false)
+ protected List GVCFGQBands = Arrays.asList(1, 10, 20, 30, 40, 50);
+
+ /**
+ * This parameter determines the maximum size of an indel considered as potentially segregating in the
+ * reference model. It is used to eliminate reads from being indel informative at a site, and determines
+ * by that mechanism the certainty in the reference base. Conceptually, setting this parameter to
+ * X means that each informative read is consistent with any indel of size < X being present at a specific
+ * position in the genome, given its alignment to the reference.
+ */
+ @Advanced
+ @Argument(fullName="indelSizeToEliminateInRefModel", shortName="ERCIS", doc="The size of an indel to check for in the reference model", required = false)
+ protected int indelSizeToEliminateInRefModel = 10;
+
+ // -----------------------------------------------------------------------------------------------
+ // general advanced arguments to control haplotype caller behavior
+ // -----------------------------------------------------------------------------------------------
+
+ @Advanced
+ @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with < X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 2;
@Advanced
@@ -419,6 +460,27 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
@Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false)
protected int minObservationsForKmerToBeSolid = 20;
+ /**
+ * the maximum extent into the full active region extension that we're willing to go in genotyping our events
+ */
+ @Hidden
+ @Argument(fullName="maxDiscARExtension", shortName="maxDiscARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for discovery", required=false)
+ protected int MAX_DISCOVERY_ACTIVE_REGION_EXTENSION = 25;
+
+ @Hidden
+ @Argument(fullName="maxGGAARExtension", shortName="maxGGAARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for GGA mode", required=false)
+ protected int MAX_GGA_ACTIVE_REGION_EXTENSION = 300;
+
+ /**
+ * Include at least this many bases around an event for calling it
+ */
+ @Hidden
+ @Argument(fullName="paddingAroundIndels", shortName="paddingAroundIndels", doc = "Include at least this many bases around an event for calling indels", required=false)
+ protected int PADDING_AROUND_OTHERS_FOR_CALLING = 150;
+
+ @Hidden
+ @Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false)
+ protected int PADDING_AROUND_SNPS_FOR_CALLING = 20;
// -----------------------------------------------------------------------------------------------
// done with Haplotype caller parameters
@@ -445,14 +507,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// reference base padding size
private static final int REFERENCE_PADDING = 500;
- // include at least this many bases around an event for calling it
- private final static int PADDING_AROUND_SNPS_FOR_CALLING = 20;
- private final static int PADDING_AROUND_OTHERS_FOR_CALLING = 150;
-
- // the maximum extent into the full active region extension that we're willing to go in genotyping our events
- private final static int MAX_DISCOVERY_ACTIVE_REGION_EXTENSION = 25;
- private final static int MAX_GGA_ACTIVE_REGION_EXTENSION = 100;
-
private ActiveRegionTrimmer trimmer = null;
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
@@ -470,6 +524,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
+ ReferenceConfidenceModel referenceConfidenceModel = null;
+
//---------------------------------------------------------------------------------------------------------------
//
// initialize
@@ -488,6 +544,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
final int nSamples = samples.size();
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
+ // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine // TODO -- why is this?
+ UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
+ ? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
@@ -501,14 +560,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
simpleUAC.exactCallsLog = null;
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
- // Currently, per-sample contamination level is only implemented for UG
if( UAC.CONTAMINATION_FRACTION_FILE !=null) {
- throw new UserException("Per-Sample contamination level not supported in Haplotype Caller at this point");
+ UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger));
}
- // when we do implement per-sample contamination for HC, this will probably be needed.
- // UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, samples, logger));
-
// initialize the output VCF header
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
@@ -532,6 +587,19 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// where the filters are used. For example, in emitting all sites the lowQual field is used
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
+ referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel);
+ if ( emitReferenceConfidence() ) {
+ if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently");
+ headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines());
+ if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) {
+ try {
+ vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands);
+ } catch ( IllegalArgumentException e ) {
+ throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage());
+ }
+ }
+ }
+
vcfWriter.writeHeader(new VCFHeader(headerInfo, samples));
try {
@@ -543,9 +611,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// create and setup the assembler
final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
- assemblyEngine = useDebruijnAssembler
- ? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
- : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples);
+ assemblyEngine = new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples);
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
@@ -602,7 +668,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
@Override
public EnumSet desiredReadStates() {
if ( includeUnmappedReads ) {
- throw new UserException.BadArgumentValue("includeUmappedReads", "is not yet functional");
+ throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional");
// return EnumSet.of(
// ActiveRegionReadState.PRIMARY,
// ActiveRegionReadState.NONPRIMARY,
@@ -636,38 +702,16 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// if we don't have any data, just abort early
return new ActivityProfileState(ref.getLocus(), 0.0);
- final List noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied
- noCall.add(Allele.NO_CALL);
-
+ final List noCall = Collections.singletonList(Allele.NO_CALL); // used to noCall all genotypes until the exact model is applied
final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size());
final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage();
for( final Map.Entry sample : splitContexts.entrySet() ) {
- final double[] genotypeLikelihoods = new double[3]; // ref versus non-ref (any event)
- Arrays.fill(genotypeLikelihoods, 0.0);
-
- for( final PileupElement p : sample.getValue().getBasePileup() ) {
- final byte qual = p.getQual();
- if( p.isDeletion() || qual > (byte) 18) {
- int AA = 0; final int AB = 1; int BB = 2;
- if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
- AA = 2;
- BB = 0;
- if( p.isNextToSoftClip() ) {
- averageHQSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28));
- }
- }
- genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual);
- genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
- genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
- }
- }
+ final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), (byte) 18, averageHQSoftClips).genotypeLikelihoods;
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
}
- final List alleles = new ArrayList<>();
- alleles.add( FAKE_REF_ALLELE );
- alleles.add( FAKE_ALT_ALLELE );
+ final List alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE);
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
@@ -687,7 +731,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
return NO_CALLS;
- if( !originalActiveRegion.isActive() ) { return NO_CALLS; } // Not active so nothing to do!
+ if( !originalActiveRegion.isActive() ) {
+ // Not active so nothing to do!
+ return referenceModelForNoVariation(originalActiveRegion, true);
+ }
final List activeAllelesToGenotype = new ArrayList<>();
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
@@ -697,23 +744,30 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
}
}
// No alleles found in this region so nothing to do!
- if ( activeAllelesToGenotype.isEmpty() ) { return NO_CALLS; }
+ if ( activeAllelesToGenotype.isEmpty() ) { return referenceModelForNoVariation(originalActiveRegion, true); }
} else {
- if( originalActiveRegion.size() == 0 ) { return NO_CALLS; } // No reads here so nothing to do!
+ // No reads here so nothing to do!
+ if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); }
}
// run the local assembler, getting back a collection of information on how we should proceed
final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
// abort early if something is out of the acceptable range
- if( ! assemblyResult.isVariationPresent() ) { return NO_CALLS; } // only the reference haplotype remains so nothing else to do!
+ if( ! assemblyResult.isVariationPresent() ) {
+ return referenceModelForNoVariation(originalActiveRegion, false);
+ } // only the reference haplotype remains so nothing else to do!
+
if (dontGenotype) return NO_CALLS; // user requested we not proceed
// filter out reads from genotyping which fail mapping quality based criteria
final Collection filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping );
final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads );
- if( assemblyResult.regionForGenotyping.size() == 0 ) { return NO_CALLS; } // no reads remain after filtering so nothing else to do!
+ if( assemblyResult.regionForGenotyping.size() == 0 ) {
+ // no reads remain after filtering so nothing else to do!
+ return referenceModelForNoVariation(originalActiveRegion, false);
+ }
// evaluate each sample's reads against all haplotypes
//logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
@@ -738,7 +792,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// TODO -- must disable if we are doing NCT, or set the output type of ! presorted
if ( bamWriter != null ) {
- haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc,
+ haplotypeBAMWriter.writeReadsAlignedToHaplotypes(
+ assemblyResult.haplotypes,
+ assemblyResult.paddedReferenceLoc,
assemblyResult.haplotypes,
calledHaplotypes.getCalledHaplotypes(),
stratifiedReadMap);
@@ -746,7 +802,13 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
- return calledHaplotypes.getCalls();
+ if ( emitReferenceConfidence() ) {
+ return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getRefHaplotype(),
+ calledHaplotypes.getCalledHaplotypes(), assemblyResult.paddedReferenceLoc, assemblyResult.regionForGenotyping,
+ stratifiedReadMap, calledHaplotypes.getCalls());
+ } else {
+ return calledHaplotypes.getCalls();
+ }
}
private final static class AssemblyResult {
@@ -755,6 +817,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
final byte[] fullReferenceWithPadding;
final GenomeLoc paddedReferenceLoc;
final boolean variationPresent;
+ final Haplotype refHaplotype;
private AssemblyResult(List haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc, boolean variationPresent) {
this.haplotypes = haplotypes;
@@ -762,6 +825,21 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
this.fullReferenceWithPadding = fullReferenceWithPadding;
this.paddedReferenceLoc = paddedReferenceLoc;
this.variationPresent = variationPresent;
+
+ Haplotype firstRefHaplotype = null;
+ for ( final Haplotype h : haplotypes ) {
+ if ( h.isReference() ) {
+ if ( firstRefHaplotype != null ) throw new IllegalArgumentException("Found two haplotypes marked as reference " + firstRefHaplotype + " and " + h);
+ firstRefHaplotype = h;
+ }
+ }
+
+ if ( firstRefHaplotype == null ) throw new IllegalArgumentException("Couldn't find a reference haplotype in " + haplotypes);
+ this.refHaplotype = firstRefHaplotype;
+ }
+
+ public Haplotype getRefHaplotype() {
+ return refHaplotype;
}
public boolean isVariationPresent() {
@@ -780,7 +858,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
*/
protected AssemblyResult assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) {
// Create the reference haplotype which is the bases from the reference that make up the active region
- finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails
+ finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails
final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING);
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
@@ -793,7 +871,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
try {
final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector );
- if ( ! dontTrimActiveRegions ) {
+ if ( ! emitReferenceConfidence() && ! dontTrimActiveRegions ) {
return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc);
} else {
// we don't want to trim active regions, so go ahead and use the old one
@@ -819,12 +897,54 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
* @return a non-null haplotype
*/
private Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final GenomeLoc paddedReferenceLoc) {
- final Haplotype refHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true);
- refHaplotype.setAlignmentStartHapwrtRef(activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart());
- final Cigar c = new Cigar();
- c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
- refHaplotype.setCigar(c);
- return refHaplotype;
+ return ReferenceConfidenceModel.createReferenceHaplotype(activeRegion, activeRegion.getActiveRegionReference(referenceReader), paddedReferenceLoc);
+ }
+
+ /**
+ * Create an ref model result (ref model or no calls depending on mode) for an active region without any variation
+ * (not is active, or assembled to just ref)
+ *
+ * @param region the region to return a no-variation result
+ * @param needsToBeFinalized should the region be finalized before computing the ref model (should be false if already done)
+ * @return a list of variant contexts (can be empty) to emit for this ref region
+ */
+ private List referenceModelForNoVariation(final ActiveRegion region, final boolean needsToBeFinalized) {
+ if ( emitReferenceConfidence() ) {
+ if ( needsToBeFinalized ) finalizeActiveRegion(region);
+ filterNonPassingReads(region); // TODO -- remove when filtering is done in finalizeActiveRegion
+ final GenomeLoc paddedLoc = region.getExtendedLoc();
+ final Haplotype refHaplotype = createReferenceHaplotype(region, paddedLoc);
+ final List haplotypes = Collections.singletonList(refHaplotype);
+ return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes,
+ paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region),
+ Collections.emptyList());
+ } else {
+ return NO_CALLS;
+ }
+ }
+
+ /**
+ * Create a context that maps each read to the reference haplotype with log10 L of 0
+ * @param refHaplotype a non-null reference haplotype
+ * @param samples a list of all samples
+ * @param region the active region containing reads
+ * @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref
+ */
+ public static Map createDummyStratifiedReadMap(final Haplotype refHaplotype,
+ final List samples,
+ final ActiveRegion region) {
+ final Allele refAllele = Allele.create(refHaplotype, true);
+
+ final Map map = new LinkedHashMap<>(1);
+ for ( final Map.Entry> entry : splitReadsBySample(samples, region.getReads()).entrySet() ) {
+ final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
+ for ( final GATKSAMRecord read : entry.getValue() ) {
+ likelihoodMap.add(read, refAllele, 0.0);
+ }
+ map.put(entry.getKey(), likelihoodMap);
+ }
+
+ return map;
}
/**
@@ -917,6 +1037,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
@Override
public void onTraversalDone(Integer result) {
+ if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it
+ referenceConfidenceModel.close();
likelihoodCalculationEngine.close();
logger.info("Ran local assembly on " + result + " active regions");
}
@@ -933,28 +1055,28 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// Loop through the reads hard clipping the adaptor and low quality tails
final List readsToUse = new ArrayList<>(activeRegion.getReads().size());
for( final GATKSAMRecord myRead : activeRegion.getReads() ) {
- final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
- if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
- GATKSAMRecord clippedRead;
- if (errorCorrectReads)
- clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION );
- else if (useLowQualityBasesForAssembly)
- clippedRead = postAdapterRead;
- else // default case: clip low qual ends of reads
- clippedRead= ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
+ GATKSAMRecord clippedRead;
+ if (errorCorrectReads)
+ clippedRead = ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION );
+ else if (useLowQualityBasesForAssembly)
+ clippedRead = myRead;
+ else // default case: clip low qual ends of reads
+ clippedRead= ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY );
- if ( dontUseSoftClippedBases ) {
- // uncomment to remove hard clips from consideration at all
- clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
- } else {
- // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
- // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
- // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
- // TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
- // TODO -- reference haplotype start must be removed
- clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
- }
+ if ( dontUseSoftClippedBases || ! ReadUtils.hasWellDefinedFragmentSize(clippedRead) ) {
+ // remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all
+ clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
+ } else {
+ // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
+ // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
+ // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
+ // TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
+ // TODO -- reference haplotype start must be removed
+ clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
+ }
+ clippedRead = ( clippedRead.getReadUnmappedFlag() ? clippedRead : ReadClipper.hardClipAdaptorSequence( clippedRead ) );
+ if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) {
clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() );
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
//logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd());
@@ -963,8 +1085,15 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
}
}
+ // TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code.
+
+ final List downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart);
+
+ // handle overlapping read pairs from the same fragment
+ cleanOverlappingReadPairs(downsampledReads);
+
activeRegion.clearReads();
- activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart));
+ activeRegion.addAll(downsampledReads);
}
private Set filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
@@ -985,6 +1114,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
}
private Map> splitReadsBySample( final Collection reads ) {
+ return splitReadsBySample(samplesList, reads);
+ }
+
+ public static Map> splitReadsBySample( final List samplesList, final Collection reads ) {
final Map> returnMap = new HashMap<>();
for( final String sample : samplesList) {
List readList = returnMap.get( sample );
@@ -1000,5 +1133,24 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
return returnMap;
}
+ /**
+ * Are we emitting a reference confidence in some form, or not?
+ * @return true if we are
+ */
+ private boolean emitReferenceConfidence(){
+ return emitReferenceConfidence != ReferenceConfidenceMode.NONE;
+ }
+ /**
+ * Clean up reads/bases that overlap within read pairs
+ *
+ * @param reads the list of reads to consider
+ */
+ private void cleanOverlappingReadPairs(final List reads) {
+ for ( final List perSampleReadList : splitReadsBySample(reads).values() ) {
+ final FragmentCollection fragmentCollection = FragmentUtils.create(perSampleReadList);
+ for ( final List overlappingPair : fragmentCollection.getOverlappingPairs() )
+ FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair);
+ }
+ }
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
index c889d7995..165fb71db 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
@@ -51,17 +51,12 @@ import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
-import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
-import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-import org.broadinstitute.sting.utils.sam.ReadUtils;
-import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
-import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
@@ -82,7 +77,7 @@ public abstract class LocalAssemblyEngine {
* If false, we will only write out a region around the reference source
*/
private final static boolean PRINT_FULL_GRAPH_FOR_DEBUGGING = true;
- public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
+ public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 10;
private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30;
protected final int numBestHaplotypesPerGraph;
@@ -115,9 +110,9 @@ public abstract class LocalAssemblyEngine {
* @param refHaplotype the reference haplotype
* @return a non-null list of reads
*/
- protected abstract List assemble(List reads, Haplotype refHaplotype, List activeAlleleHaplotypes);
+ protected abstract List assemble(List reads, Haplotype refHaplotype, List activeAlleleHaplotypes);
- protected List assemble(List reads, Haplotype refHaplotype) {
+ protected List assemble(List reads, Haplotype refHaplotype) {
return assemble(reads, refHaplotype, Collections.emptyList());
}
@@ -145,7 +140,6 @@ public abstract class LocalAssemblyEngine {
// create the list of artificial haplotypes that should be added to the graph for GGA mode
final List activeAlleleHaplotypes = createActiveAlleleHaplotypes(refHaplotype, activeAllelesToGenotype, activeRegion.getExtendedLoc());
-
// error-correct reads before clipping low-quality tails: some low quality bases might be good and we want to recover them
final List correctedReads;
if (readErrorCorrector != null) {
@@ -154,20 +148,31 @@ public abstract class LocalAssemblyEngine {
// and we only want the read-error corrected reads for graph building.
readErrorCorrector.addReadsToKmers(activeRegion.getReads());
correctedReads = new ArrayList<>(readErrorCorrector.correctReads(activeRegion.getReads()));
+ } else {
+ correctedReads = activeRegion.getReads();
}
- else correctedReads = activeRegion.getReads();
+ final List nonRefGraphs = new LinkedList<>();
// create the graphs by calling our subclass assemble method
- final List graphs = assemble(correctedReads, refHaplotype, activeAlleleHaplotypes);
-
- // do some QC on the graphs
- for ( final SeqGraph graph : graphs ) { sanityCheckGraph(graph, refHaplotype); }
+ for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, activeAlleleHaplotypes) ) {
+ if ( result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION ) {
+ // do some QC on the graph
+ sanityCheckGraph(result.getGraph(), refHaplotype);
+ // add it to graphs with meaningful non-reference features
+ nonRefGraphs.add(result.getGraph());
+ }
+ }
// print the graphs if the appropriate debug option has been turned on
- if ( graphWriter != null ) { printGraphs(graphs); }
+ if ( graphWriter != null ) { printGraphs(nonRefGraphs); }
- // find the best paths in the graphs and return them as haplotypes
- return findBestPaths( graphs, refHaplotype, refLoc, activeRegion.getExtendedLoc() );
+ if ( nonRefGraphs.isEmpty() ) {
+ // we couldn't assemble any meaningful graphs, so return just the reference haplotype
+ return Collections.singletonList(refHaplotype);
+ } else {
+ // find the best paths in the graphs and return them as haplotypes
+ return findBestPaths( nonRefGraphs, refHaplotype, refLoc, activeRegion.getExtendedLoc() );
+ }
}
/**
@@ -288,7 +293,7 @@ public abstract class LocalAssemblyEngine {
}
}
- protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) {
+ protected AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) {
printDebugGraphTransform(seqGraph, new File("sequenceGraph.1.dot"));
// the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
@@ -296,9 +301,7 @@ public abstract class LocalAssemblyEngine {
printDebugGraphTransform(seqGraph, new File("sequenceGraph.2.zipped.dot"));
// now go through and prune the graph, removing vertices no longer connected to the reference chain
- // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight
- // edges to maintain graph connectivity.
- seqGraph.pruneGraph(pruneFactor);
+ seqGraph.removeSingletonOrphanVertices();
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
printDebugGraphTransform(seqGraph, new File("sequenceGraph.3.pruned.dot"));
@@ -309,7 +312,7 @@ public abstract class LocalAssemblyEngine {
// happen in cases where for example the reference somehow manages to acquire a cycle, or
// where the entire assembly collapses back into the reference sequence.
if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null )
- return null;
+ return new AssemblyResult(AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE, seqGraph);
seqGraph.removePathsNotConnectedToRef();
seqGraph.simplifyGraph();
@@ -324,7 +327,7 @@ public abstract class LocalAssemblyEngine {
}
printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot"));
- return seqGraph;
+ return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph);
}
/**
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java
new file mode 100644
index 000000000..ee7565282
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java
@@ -0,0 +1,72 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
+
+/**
+ * Holds information about a genotype call of a single sample reference vs. any non-ref event
+ *
+ * User: depristo
+ * Date: 6/21/13
+ * Time: 12:58 PM
+ * To change this template use File | Settings | File Templates.
+ */
+final class RefVsAnyResult {
+ /**
+ * The genotype likelihoods for ref/ref ref/non-ref non-ref/non-ref
+ */
+ final double[] genotypeLikelihoods = new double[3];
+
+ /**
+ * AD field value for ref / non-ref
+ */
+ final int[] AD_Ref_Any = new int[2];
+
+ /**
+ * @return Get the DP (sum of AD values)
+ */
+ public int getDP() { return AD_Ref_Any[0] + AD_Ref_Any[1]; }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java
new file mode 100644
index 000000000..98264d4c2
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java
@@ -0,0 +1,476 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
+
+import net.sf.samtools.*;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
+import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
+import org.broadinstitute.sting.utils.haplotypeBAMWriter.ReadDestination;
+import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
+import org.broadinstitute.sting.utils.sam.AlignmentUtils;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+import org.broadinstitute.variant.variantcontext.*;
+import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
+import org.broadinstitute.variant.vcf.VCFHeaderLine;
+import org.broadinstitute.variant.vcf.VCFHeaderLineType;
+import org.broadinstitute.variant.vcf.VCFSimpleHeaderLine;
+
+import java.io.File;
+import java.util.*;
+
+/**
+ * Code for estimating the reference confidence
+ *
+ * This code can estimate the probability that the data for a single sample is consistent with a
+ * well-determined REF/REF diploid genotype.
+ *
+ * User: depristo
+ * Date: 6/21/13
+ * Time: 12:52 PM
+ */
+public class ReferenceConfidenceModel {
+ public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF";
+ public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site
+
+ public final static String INDEL_INFORMATIVE_DEPTH = "CD";
+
+ private final GenomeLocParser genomeLocParser;
+ private final Set samples;
+ private final SAMFileHeader header; // TODO -- really shouldn't depend on this
+ private final int indelInformativeDepthIndelSize;
+
+ private final static boolean WRITE_DEBUGGING_BAM = false;
+ private final SAMFileWriter debuggingWriter;
+
+ /**
+ * Create a new ReferenceConfidenceModel
+ *
+ * @param genomeLocParser how we create genome locs
+ * @param samples the list of all samples we'll be considering with this model
+ * @param header the SAMFileHeader describing the read information (used for debugging)
+ * @param indelInformativeDepthIndelSize the max size of indels to consider when calculating indel informative depths
+ */
+ public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser,
+ final Set samples,
+ final SAMFileHeader header,
+ final int indelInformativeDepthIndelSize) {
+ if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null");
+ if ( samples == null ) throw new IllegalArgumentException("samples cannot be null");
+ if ( samples.isEmpty() ) throw new IllegalArgumentException("samples cannot be empty");
+ if ( header == null ) throw new IllegalArgumentException("header cannot be empty");
+ if ( indelInformativeDepthIndelSize < 0) throw new IllegalArgumentException("indelInformativeDepthIndelSize must be >= 1 but got " + indelInformativeDepthIndelSize);
+
+ this.genomeLocParser = genomeLocParser;
+ this.samples = samples;
+ this.header = header;
+ this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize;
+
+ if ( WRITE_DEBUGGING_BAM ) {
+ final SAMFileWriterFactory factory = new SAMFileWriterFactory();
+ factory.setCreateIndex(true);
+ debuggingWriter = factory.makeBAMWriter(header, false, new File("refCalc.bam"));
+ } else {
+ debuggingWriter = null;
+ }
+ }
+
+ /**
+ * Get the VCF header lines to include when emitting reference confidence values via calculateRefConfidence
+ * @return a non-null set of VCFHeaderLines
+ */
+ public Set getVCFHeaderLines() {
+ final Set headerLines = new LinkedHashSet<>();
+ headerLines.add(new VCFSimpleHeaderLine("ALT", NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
+ headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize));
+ return headerLines;
+ }
+
+ /**
+ * Close down this reference model, closing down any debugging information opened during execution
+ */
+ public void close() {
+ if ( debuggingWriter != null ) debuggingWriter.close();
+ }
+
+
+ /**
+ * Calculate the reference confidence for a single sample given the its read data
+ *
+ * Returns a list of variant contexts, one for each position in the activeregion.getLoc(), each containing
+ * detailed information about the certainty that the sample is hom-ref for each base in the region.
+ *
+ *
+ *
+ * @param refHaplotype the reference haplotype, used to get the reference bases across activeRegion.getLoc()
+ * @param calledHaplotypes a list of haplotypes that segregate in this region, for realignment of the reads in the
+ * stratifiedReadMap, corresponding to each reads best haplotype. Must contain the refHaplotype.
+ * @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc())
+ * @param activeRegion the active region we want to get the reference confidence over
+ * @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes
+ * @param variantCalls calls made in this region. The return result will contain any variant call in this list in the
+ * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence
+ * under any position is covers (for snps that 1 bp, but for deletion its the entire ref span)
+ * @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence
+ * contexts as well as calls from variantCalls if any were provided
+ */
+ public List calculateRefConfidence(final Haplotype refHaplotype,
+ final Collection calledHaplotypes,
+ final GenomeLoc paddedReferenceLoc,
+ final ActiveRegion activeRegion,
+ final Map stratifiedReadMap,
+ final List variantCalls) {
+ if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null");
+ if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null");
+ if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype");
+ if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null");
+ if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null");
+ if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null");
+ if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size());
+ if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different");
+
+ final GenomeLoc refSpan = activeRegion.getLocation();
+ final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, refSpan, stratifiedReadMap);
+ final byte[] ref = refHaplotype.getBases();
+ final List results = new ArrayList<>(refSpan.size());
+ final String sampleName = stratifiedReadMap.keySet().iterator().next();
+
+ final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart();
+ for ( final ReadBackedPileup pileup : refPileups ) {
+ final GenomeLoc curPos = pileup.getLocation();
+ final int offset = curPos.getStart() - refSpan.getStart();
+
+ final VariantContext overlappingSite = getOverlappingVariantContext(curPos, variantCalls);
+ if ( overlappingSite != null ) {
+ // we have some overlapping site, add it to the list of positions
+ if ( overlappingSite.getStart() == curPos.getStart() )
+ results.add(overlappingSite);
+ } else {
+ // otherwise emit a reference confidence variant context
+ final int refOffset = offset + globalRefOffset;
+ final byte refBase = ref[refOffset];
+ final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null);
+
+ final Allele refAllele = Allele.create(refBase, true);
+ final List refSiteAlleles = Arrays.asList(refAllele, NON_REF_SYMBOLIC_ALLELE);
+ final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), curPos.getStart(), curPos.getStart(), refSiteAlleles);
+ final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele));
+ gb.AD(homRefCalc.AD_Ref_Any);
+ gb.DP(homRefCalc.getDP());
+
+ // genotype likelihood calculation
+ final GenotypeLikelihoods snpGLs = GenotypeLikelihoods.fromLog10Likelihoods(homRefCalc.genotypeLikelihoods);
+ final int nIndelInformativeReads = calcNIndelInformativeReads(pileup, refOffset, ref, indelInformativeDepthIndelSize);
+ final GenotypeLikelihoods indelGLs = getIndelPLs(nIndelInformativeReads);
+
+ // now that we have the SNP and indel GLs, we take the one with the least confidence,
+ // as this is the most conservative estimate of our certainty that we are hom-ref.
+ // For example, if the SNP PLs are 0,10,100 and the indel PLs are 0,100,1000
+ // we are very certain that there's no indel here, but the SNP confidence imply that we are
+ // far less confident that the ref base is actually the only thing here. So we take 0,10,100
+ // as our GLs for the site.
+ final GenotypeLikelihoods leastConfidenceGLs = getGLwithWorstGQ(indelGLs, snpGLs);
+
+ gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF)));
+ gb.PL(leastConfidenceGLs.getAsPLs());
+ gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads);
+
+ vcb.genotypes(gb.make());
+ results.add(vcb.make());
+// logger.info(" => VariantContext " + vcb.make());
+ }
+ }
+
+ return results;
+ }
+
+ /**
+ * Get the GenotypeLikelihoods with the least strong corresponding GQ value
+ * @param gl1 first to consider (cannot be null)
+ * @param gl2 second to consider (cannot be null)
+ * @return gl1 or gl2, whichever has the worst GQ
+ */
+ protected final GenotypeLikelihoods getGLwithWorstGQ(final GenotypeLikelihoods gl1, final GenotypeLikelihoods gl2) {
+ return gl1.getLog10GQ(GenotypeType.HOM_REF) > gl2.getLog10GQ(GenotypeType.HOM_REF) ? gl1 : gl2;
+ }
+
+ /**
+ * Get indel PLs corresponding to seeing N nIndelInformativeReads at this site
+ *
+ * @param nInformativeReads the number of reads that inform us about being ref without an indel at this site
+ * @return non-null GenotypeLikelihoods given N
+ */
+ protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) {
+ // TODO -- optimization -- this could easily be optimized with some caching
+ final double homRef = 0.0;
+ final double het = - LOG10_2 * nInformativeReads;
+ final double homVar = INDEL_ERROR_RATE * nInformativeReads;
+ return GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar});
+ }
+ private final static double LOG10_2 = Math.log10(2);
+ private final static double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp
+
+ /**
+ * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
+ *
+ * @param pileup the read backed pileup containing the data we want to evaluate
+ * @param refBase the reference base at this pileup position
+ * @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation
+ * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips
+ * @return a RefVsAnyResult genotype call
+ */
+ public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) {
+ final RefVsAnyResult result = new RefVsAnyResult();
+
+ for( final PileupElement p : pileup ) {
+ final byte qual = p.getQual();
+ if( p.isDeletion() || qual > minBaseQual) {
+ int AA = 0; final int AB = 1; int BB = 2;
+ if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
+ AA = 2;
+ BB = 0;
+ if( hqSoftClips != null && p.isNextToSoftClip() ) {
+ hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28));
+ }
+ result.AD_Ref_Any[1]++;
+ } else {
+ result.AD_Ref_Any[0]++;
+ }
+ result.genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual);
+ result.genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
+ result.genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
+ }
+ }
+
+ return result;
+ }
+
+ /**
+ * Get a list of pileups that span the entire active region span, in order, one for each position
+ */
+ private List getPileupsOverReference(final Haplotype refHaplotype,
+ final Collection calledHaplotypes,
+ final GenomeLoc paddedReferenceLoc,
+ final GenomeLoc activeRegionSpan,
+ final Map stratifiedReadMap) {
+ final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO");
+ final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest);
+ writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves
+ writer.writeReadsAlignedToHaplotypes(calledHaplotypes.isEmpty() ? Collections.singleton(refHaplotype) : calledHaplotypes, paddedReferenceLoc, stratifiedReadMap);
+ final List realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads());
+
+ if ( debuggingWriter != null )
+ for ( final GATKSAMRecord read : realignedReads )
+ debuggingWriter.addAlignment(read);
+
+ final LocusIteratorByState libs = new LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING,
+ false, genomeLocParser, samples, false);
+
+ final List pileups = new LinkedList<>();
+ final int startPos = activeRegionSpan.getStart();
+ AlignmentContext next = libs.advanceToLocus(startPos, true);
+ for ( int curPos = startPos; curPos <= activeRegionSpan.getStop(); curPos++ ) {
+ if ( next != null && next.getLocation().getStart() == curPos ) {
+ pileups.add(next.getBasePileup());
+ next = libs.hasNext() ? libs.next() : null;
+ } else {
+ // no data, so we create empty pileups
+ pileups.add(new ReadBackedPileupImpl(genomeLocParser.createGenomeLoc(activeRegionSpan.getContig(), curPos)));
+ }
+ }
+
+ return pileups;
+ }
+
+ /**
+ * Return the rightmost variant context in maybeOverlapping that overlaps curPos
+ *
+ * @param curPos non-null genome loc
+ * @param maybeOverlapping a collection of variant contexts that might overlap curPos
+ * @return a VariantContext, or null if none overlaps
+ */
+ protected final VariantContext getOverlappingVariantContext(final GenomeLoc curPos, final Collection maybeOverlapping) {
+ VariantContext overlaps = null;
+ for ( final VariantContext vc : maybeOverlapping ) {
+ if ( genomeLocParser.createGenomeLoc(vc).overlapsP(curPos) ) {
+ if ( overlaps == null || vc.getStart() > overlaps.getStart() ) {
+ overlaps = vc;
+ }
+ }
+ }
+ return overlaps;
+ }
+
+ /**
+ * Compute the sum of mismatching base qualities for readBases aligned to refBases at readStart / refStart
+ * assuming no insertions or deletions in the read w.r.t. the reference
+ *
+ * @param readBases non-null bases of the read
+ * @param readQuals non-null quals of the read
+ * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference)
+ * @param refBases the reference bases
+ * @param refStart the offset into refBases that aligns to the readStart position in readBases
+ * @param maxSum if the sum goes over this value, return immediately
+ * @return the sum of quality scores for readBases that mismatch their corresponding ref bases
+ */
+ protected final int sumMismatchingQualities(final byte[] readBases,
+ final byte[] readQuals,
+ final int readStart,
+ final byte[] refBases,
+ final int refStart,
+ final int maxSum) {
+ final int n = Math.min(readBases.length - readStart, refBases.length - refStart);
+ int sum = 0;
+
+ for ( int i = 0; i < n; i++ ) {
+ final byte readBase = readBases[readStart + i];
+ final byte refBase = refBases[refStart + i];
+ if ( readBase != refBase ) {
+ sum += readQuals[readStart + i];
+ if ( sum > maxSum )
+ return sum;
+ }
+ }
+
+ return sum;
+ }
+
+ /**
+ * Compute whether a read is informative to eliminate an indel of size <= maxIndelSize segregating at readStart/refStart
+ *
+ * @param readBases non-null bases of the read
+ * @param readQuals non-null quals of the read
+ * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference)
+ * @param refBases the reference bases
+ * @param refStart the offset into refBases that aligns to the readStart position in readBases
+ * @param maxIndelSize the max indel size to consider for the read to be informative
+ * @return true if read can eliminate the possibility that there's an indel of size <= maxIndelSize segregating at refStart
+ */
+ protected boolean isReadInformativeAboutIndelsOfSize(final byte[] readBases,
+ final byte[] readQuals,
+ final int readStart,
+ final byte[] refBases,
+ final int refStart,
+ final int maxIndelSize) {
+ // todo -- fast exit when n bases left < maxIndelSize
+
+ final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE);
+
+ // consider each indel size up to max in term, checking if an indel that deletes either the ref bases (deletion
+ // or read bases (insertion) would fit as well as the origin baseline sum of mismatching quality scores
+ for ( int indelSize = 1; indelSize <= maxIndelSize; indelSize++ ) {
+ for ( final boolean checkInsertion : Arrays.asList(true, false) ) {
+ final int readI, refI;
+ if ( checkInsertion ) {
+ readI = readStart + indelSize;
+ refI = refStart;
+ } else {
+ readI = readStart;
+ refI = refStart + indelSize;
+ }
+
+ final int score = sumMismatchingQualities(readBases, readQuals, readI, refBases, refI, baselineMMSum);
+ if ( score <= baselineMMSum )
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * Calculate the number of indel informative reads at pileup
+ *
+ * @param pileup a pileup
+ * @param pileupOffsetIntoRef the position of the pileup in the reference
+ * @param ref the ref bases
+ * @param maxIndelSize maximum indel size to consider in the informativeness calculation
+ * @return an integer >= 0
+ */
+ protected final int calcNIndelInformativeReads(final ReadBackedPileup pileup, final int pileupOffsetIntoRef, final byte[] ref, final int maxIndelSize) {
+ int nInformative = 0;
+ for ( final PileupElement p : pileup ) {
+ final GATKSAMRecord read = p.getRead();
+ final int offset = p.getOffset();
+
+ // doesn't count as evidence
+ if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() )
+ continue;
+
+ // todo -- this code really should handle CIGARs directly instead of relying on the above tests
+ if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize))
+ nInformative++;
+ }
+ return nInformative;
+ }
+
+ /**
+ * Create a reference haplotype for an active region
+ *
+ * @param activeRegion the active region
+ * @param refBases the ref bases
+ * @param paddedReferenceLoc the location spanning of the refBases -- can be longer than activeRegion.getLocation()
+ * @return a reference haplotype
+ */
+ public static Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final byte[] refBases, final GenomeLoc paddedReferenceLoc) {
+ final Haplotype refHaplotype = new Haplotype(refBases, true);
+ final int alignmentStart = activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart();
+ if ( alignmentStart < 0 ) throw new IllegalStateException("Bad alignment start in createReferenceHaplotype " + alignmentStart);
+ refHaplotype.setAlignmentStartHapwrtRef(alignmentStart);
+ final Cigar c = new Cigar();
+ c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
+ refHaplotype.setCigar(c);
+ return refHaplotype;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
index 2b37d90c2..bd179ef41 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
@@ -71,7 +71,7 @@ public class BaseGraph extends Default
private final int kmerSize;
/**
- * Construct a DeBruijnGraph with kmerSize
+ * Construct a TestGraph with kmerSize
* @param kmerSize
*/
public BaseGraph(final int kmerSize, final EdgeFactory edgeFactory) {
@@ -472,28 +472,11 @@ public class BaseGraph extends Default
}
/**
- * Prune all edges from this graph that have multiplicity <= pruneFactor and remove all orphaned singleton vertices as well
- *
- * @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed
- */
- public void pruneGraph( final int pruneFactor ) {
- final List edgesToRemove = new ArrayList<>();
- for( final E e : edgeSet() ) {
- if( e.getPruningMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
- edgesToRemove.add(e);
- }
- }
- removeAllEdges(edgesToRemove);
-
- removeSingletonOrphanVertices();
- }
-
- /**
- * Prune all chains from this graph where all edges in the path have multiplicity <= pruneFactor
+ * Prune all chains from this graph where any edge in the path has multiplicity < pruneFactor
*
* @see LowWeightChainPruner for more information
*
- * @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed
+ * @param pruneFactor all edges with multiplicity < this factor that aren't ref edges will be removed
*/
public void pruneLowWeightChains( final int pruneFactor ) {
final LowWeightChainPruner pruner = new LowWeightChainPruner<>(pruneFactor);
@@ -503,7 +486,7 @@ public class BaseGraph extends Default
/**
* Remove all vertices in the graph that have in and out degree of 0
*/
- protected void removeSingletonOrphanVertices() {
+ public void removeSingletonOrphanVertices() {
// Run through the graph and clean up singular orphaned nodes
final List verticesToRemove = new LinkedList<>();
for( final V v : vertexSet() ) {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java
index 27b6bd902..520267dee 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java
@@ -96,7 +96,7 @@ public class LowWeightChainPruner {
}
/**
- * Traverse the edges in the path and determine if any are either ref edges or have weight above
+ * Traverse the edges in the path and determine if any are either ref edges or have weight above or equal to
* the pruning factor and should therefore not be pruned away.
*
* @param path the path in question
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java
similarity index 89%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java
index 0200ce4a2..8c79a5efe 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java
@@ -49,17 +49,16 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import com.google.java.contract.Ensures;
import org.jgrapht.EdgeFactory;
-import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
- * A DeBruijn kmer graph
+ * A Test kmer graph
*
* User: rpoplin
* Date: 2/6/13
*/
-public final class DeBruijnGraph extends BaseGraph {
+public final class TestGraph extends BaseGraph {
/**
* Edge factory that creates non-reference multiplicity 1 edges
*/
@@ -71,33 +70,20 @@ public final class DeBruijnGraph extends BaseGraph {
}
/**
- * Create an empty DeBruijnGraph with default kmer size
+ * Create an empty TestGraph with default kmer size
*/
- public DeBruijnGraph() {
+ public TestGraph() {
this(11);
}
/**
- * Create an empty DeBruijnGraph with kmer size
+ * Create an empty TestGraph with kmer size
* @param kmerSize kmer size, must be >= 1
*/
- public DeBruijnGraph(int kmerSize) {
+ public TestGraph(int kmerSize) {
super(kmerSize, new MyEdgeFactory());
}
- /**
- * Pull kmers out of the given long sequence and throw them on in the graph
- * @param sequence byte array holding the sequence with which to build the assembly graph
- * @param KMER_LENGTH the desired kmer length to use
- * @param isRef if true the kmers added to the graph will have reference edges linking them
- */
- public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) {
- if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); }
- final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
- for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
- addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef, 1);
- }
- }
/**
* Add edge to assembly graph connecting the two kmers
@@ -129,7 +115,7 @@ public final class DeBruijnGraph extends BaseGraph {
@Ensures({"result != null"})
public SeqGraph convertToSequenceGraph() {
final SeqGraph seqGraph = new SeqGraph(getKmerSize());
- final Map vertexMap = new HashMap();
+ final Map vertexMap = new HashMap<>();
// create all of the equivalent seq graph vertices
for ( final DeBruijnVertex dv : vertexSet() ) {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
index 672c61c0f..6500196d3 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
@@ -47,6 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResult;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.MathUtils;
@@ -98,32 +99,33 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
this.justReturnRawGraph = justReturnRawGraph;
}
+ private void addResult(final List results, final AssemblyResult maybeNullResult) {
+ if ( maybeNullResult != null )
+ results.add(maybeNullResult);
+ }
+
@Override
- public List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes) {
- final List graphs = new LinkedList<>();
+ public List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes) {
+ final List results = new LinkedList<>();
// first, try using the requested kmer sizes
for ( final int kmerSize : kmerSizes ) {
- final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles);
- if ( graph != null )
- graphs.add(graph);
+ addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles));
}
// if none of those worked, iterate over larger sizes if allowed to do so
- if ( graphs.isEmpty() && !dontIncreaseKmerSizesForCycles ) {
+ if ( results.isEmpty() && !dontIncreaseKmerSizesForCycles ) {
int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE;
int numIterations = 1;
- while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
+ while ( results.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
// on the last attempt we will allow low complexity graphs
- final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT);
- if ( graph != null )
- graphs.add(graph);
+ addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT));
kmerSize += KMER_SIZE_ITERATION_INCREASE;
numIterations++;
}
}
- return graphs;
+ return results;
}
/**
@@ -136,11 +138,16 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
* @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
* @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
*/
- protected SeqGraph createGraph(final List reads,
- final Haplotype refHaplotype,
- final int kmerSize,
- final List activeAlleleHaplotypes,
- final boolean allowLowComplexityGraphs) {
+ protected AssemblyResult createGraph(final List reads,
+ final Haplotype refHaplotype,
+ final int kmerSize,
+ final List activeAlleleHaplotypes,
+ final boolean allowLowComplexityGraphs) {
+ if ( refHaplotype.length() < kmerSize ) {
+ // happens in cases where the assembled region is just too small
+ return new AssemblyResult(AssemblyResult.Status.FAILED, null);
+ }
+
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples);
// add the reference sequence to the graph
@@ -183,7 +190,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
// look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
// we can recover them by merging some N bases from the chain back into the reference
- if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
+ if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(pruneFactor);
// remove all heading and trailing paths
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
@@ -193,14 +200,15 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
- if ( justReturnRawGraph ) return initialSeqGraph;
+ if ( justReturnRawGraph ) return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph);
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
- final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
- return ( seqGraph != null && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(seqGraph) ) ? null : seqGraph;
+ final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph);
+ final AssemblyResult.Status status = cleaned.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(cleaned.getGraph()) ? AssemblyResult.Status.FAILED : cleaned.getStatus();
+ return new AssemblyResult(status, cleaned.getGraph());
}
/**
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
index 7d7df2c06..47d14e185 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java
@@ -58,6 +58,7 @@ import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
+import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman;
import org.jgrapht.EdgeFactory;
import org.jgrapht.alg.CycleDetector;
@@ -93,6 +94,9 @@ public class ReadThreadingGraph extends BaseGraph 0");
// generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths
- final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex);
+ final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex, pruneFactor);
// if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path
if ( danglingTailMergeResult == null || ! cigarIsOkayToMerge(danglingTailMergeResult.cigar) )
@@ -301,13 +306,14 @@ public class ReadThreadingGraph extends BaseGraph elements = cigar.getCigarElements();
+ final int numElements = elements.size();
// don't allow more than a couple of different ops
- if ( elements.size() > 3 )
+ if ( numElements > MAX_CIGAR_COMPLEXITY )
return false;
// the last element must be an M
- if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.M )
+ if ( elements.get(numElements - 1).getOperator() != CigarOperator.M )
return false;
// TODO -- do we want to check whether the Ms mismatch too much also?
@@ -334,7 +340,8 @@ public class ReadThreadingGraph extends BaseGraph altPath = findPathToLowestCommonAncestorOfReference(vertex);
- if ( altPath == null || isRefSource(altPath.get(0)) )
+ final List altPath = findPathToLowestCommonAncestorOfReference(vertex, pruneFactor);
+ if ( altPath == null || isRefSource(altPath.get(0)) || altPath.size() < MIN_DANGLING_TAIL_LENGTH )
return null;
// now get the reference path from the LCA
@@ -361,24 +369,32 @@ public class ReadThreadingGraph extends BaseGraph findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex) {
+ protected List findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor) {
final LinkedList path = new LinkedList<>();
MultiDeBruijnVertex v = vertex;
while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) {
- path.addFirst(v);
- v = getEdgeSource(incomingEdgeOf(v));
+ final MultiSampleEdge edge = incomingEdgeOf(v);
+ // if it has too low a weight, don't use it (or previous vertexes) for the path
+ if ( edge.getPruningMultiplicity() < pruneFactor )
+ path.clear();
+ // otherwise it is safe to use
+ else
+ path.addFirst(v);
+ v = getEdgeSource(edge);
}
path.addFirst(v);
@@ -453,7 +469,12 @@ public class ReadThreadingGraph extends BaseGraph uniqueKmers.size();
}
- public void recoverDanglingTails() {
+ /**
+ * Try to recover dangling tails
+ *
+ * @param pruneFactor the prune factor to use in ignoring chain pieces
+ */
+ public void recoverDanglingTails(final int pruneFactor) {
if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
int attempted = 0;
@@ -461,7 +482,7 @@ public class ReadThreadingGraph extends BaseGraph= kmerSize ) {
// if the sequence is long enough to get some value out of, add it to the graph
final String name = read.getReadName() + "_" + start + "_" + end;
- addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, stop, reducedReadCounts, false);
+ addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, end, reducedReadCounts, false);
}
lastGood = -1; // reset the last good base
diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java
new file mode 100644
index 000000000..8f509b36b
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java
@@ -0,0 +1,298 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.utils.gvcf;
+
+import org.broadinstitute.variant.variantcontext.Genotype;
+import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
+import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
+import org.broadinstitute.variant.vcf.*;
+
+import java.util.*;
+
+/**
+ * Genome-wide VCF writer
+ *
+ * User: depristo
+ * Date: 6/24/13
+ * Time: 2:51 PM
+ */
+public class GVCFWriter implements VariantContextWriter {
+ //
+ // static VCF field names
+ //
+ protected final static String BLOCK_SIZE_INFO_FIELD = "BLOCK_SIZE";
+ protected final static String MIN_DP_FORMAT_FIELD = "MIN_DP";
+ protected final static String MIN_GQ_FORMAT_FIELD = "MIN_GQ";
+
+ //
+ // Final fields initialized in constructor
+ //
+ /** Where we'll ultimately write our VCF records */
+ final private VariantContextWriter underlyingWriter;
+
+ final private List GQPartitions;
+
+ /** fields updated on the fly during GVCFWriter operation */
+ int nextAvailableStart = -1;
+ private String sampleName = null;
+ private HomRefBlock currentBlock = null;
+
+ /**
+ * Is the proposed GQ partitions well-formed?
+ *
+ * @param GQPartitions proposed GQ partitions
+ * @return a non-null string if something is wrong (string explains issue)
+ */
+ protected static List parsePartitions(final List GQPartitions) {
+ if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null");
+ if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty");
+
+ final List result = new LinkedList<>();
+ int lastThreshold = 0;
+ for ( final Integer value : GQPartitions ) {
+ if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer");
+ if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value);
+ if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value);
+ result.add(new HomRefBlock(lastThreshold, value));
+ lastThreshold = value;
+ }
+ result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE));
+
+ return result;
+ }
+
+ /**
+ * Create a new GVCF writer
+ *
+ * Should be a non-empty list of boundaries. For example, suppose this variable is
+ *
+ * [A, B, C]
+ *
+ * We would partition our hom-ref sites into the following bands:
+ *
+ * X < A
+ * A <= X < B
+ * B <= X < C
+ * X >= C
+ *
+ * @param underlyingWriter the ultimate destination of the GVCF records
+ * @param GQPartitions a well-formed list of GQ partitions
+ */
+ public GVCFWriter(final VariantContextWriter underlyingWriter, final List GQPartitions) {
+ if ( underlyingWriter == null ) throw new IllegalArgumentException("underlyingWriter cannot be null");
+ this.underlyingWriter = underlyingWriter;
+ this.GQPartitions = parsePartitions(GQPartitions);
+ }
+
+ /**
+ * Write the VCF header
+ *
+ * Adds standard GVCF fields to the header
+ *
+ * @param header a non-null header
+ */
+ @Override
+ public void writeHeader(VCFHeader header) {
+ if ( header == null ) throw new IllegalArgumentException("header cannot be null");
+ header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
+ header.addMetaDataLine(new VCFInfoHeaderLine(BLOCK_SIZE_INFO_FIELD, 1, VCFHeaderLineType.Integer, "Size of the homozygous reference GVCF block"));
+ header.addMetaDataLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block"));
+ header.addMetaDataLine(new VCFFormatHeaderLine(MIN_GQ_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum GQ observed within the GVCF block"));
+
+ for ( final HomRefBlock partition : GQPartitions ) {
+ header.addMetaDataLine(partition.toVCFHeaderLine());
+ }
+
+ underlyingWriter.writeHeader(header);
+ }
+
+ /**
+ * Close this GVCF writer. Finalizes any pending hom-ref blocks and emits those to the underlyingWriter as well
+ */
+ @Override
+ public void close() {
+ close(true);
+ }
+
+ /**
+ * Horrible work around because there's no clean way to get our VCFWriter closed by the GATK
+ *
+ * If closeUnderlyingWriter is true, then we'll close the underlying writer, otherwise we'll leave it open
+ * so the GATK closes it later
+ *
+ * @param closeUnderlyingWriter should we leave the underlying writer open or closed?
+ */
+ public void close(final boolean closeUnderlyingWriter) {
+ emitCurrentBlock();
+ if ( closeUnderlyingWriter ) underlyingWriter.close();
+ }
+
+ /**
+ * Add hom-ref site from vc to this gVCF hom-ref state tracking, emitting any pending states if appropriate
+ *
+ * @param vc a non-null VariantContext
+ * @param g a non-null genotype from VariantContext
+ * @return a VariantContext to be emitted, or null if non is appropriate
+ */
+ protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) {
+ if ( nextAvailableStart != -1 && vc.getStart() <= nextAvailableStart ) {
+ // don't create blocks while the hom-ref site falls before nextAvailableStart (for deletions)
+ return null;
+ } else if ( currentBlock == null ) {
+ currentBlock = createNewBlock(vc, g);
+ return null;
+ } else if ( currentBlock.withinBounds(g.getGQ()) ) {
+ currentBlock.add(vc.getStart(), g);
+ return null;
+ } else {
+ final VariantContext result = blockToVCF(currentBlock);
+ currentBlock = createNewBlock(vc, g);
+ return result;
+ }
+ }
+
+ /**
+ * Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null
+ */
+ private void emitCurrentBlock() {
+ if ( currentBlock != null ) {
+ // there's actually some work to do
+ underlyingWriter.add(blockToVCF(currentBlock));
+ currentBlock = null;
+ }
+ }
+
+ /**
+ * Convert a HomRefBlock into a VariantContext
+ *
+ * @param block the block to convert
+ * @return a VariantContext representing the gVCF encoding for this block
+ */
+ private VariantContext blockToVCF(final HomRefBlock block) {
+ if ( block == null ) throw new IllegalArgumentException("block cannot be null");
+
+ final VariantContextBuilder vcb = new VariantContextBuilder(block.getStartingVC());
+ vcb.attributes(new HashMap(2)); // clear the attributes
+ vcb.stop(block.getStop());
+ vcb.attribute(VCFConstants.END_KEY, block.getStop());
+ vcb.attribute(BLOCK_SIZE_INFO_FIELD, block.getSize());
+
+ // create the single Genotype with GQ and DP annotations
+ final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Collections.nCopies(2, block.getRef()));
+ gb.noAD().noPL().noAttributes(); // clear all attributes
+ gb.GQ(block.getMedianGQ());
+ gb.DP(block.getMedianDP());
+ gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP());
+ gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ());
+
+ return vcb.genotypes(gb.make()).make();
+ }
+
+ /**
+ * Helper function to create a new HomRefBlock from a variant context and current genotype
+ *
+ * @param vc the VariantContext at the site where want to start the band
+ * @param g the genotype of the sample from vc that should be used to initialize the block
+ * @return a newly allocated and initialized block containing g already
+ */
+ private HomRefBlock createNewBlock(final VariantContext vc, final Genotype g) {
+ // figure out the GQ limits to use based on the GQ of g
+ HomRefBlock partition = null;
+ for ( final HomRefBlock maybePartition : GQPartitions ) {
+ if ( maybePartition.withinBounds(g.getGQ()) ) {
+ partition = maybePartition;
+ break;
+ }
+ }
+ if ( partition == null ) throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition " + partition);
+
+ // create the block, add g to it, and return it for use
+ final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound());
+ block.add(vc.getStart(), g);
+ return block;
+ }
+
+ /**
+ * Add a VariantContext to this writer for emission
+ *
+ * Requires that the VC have exactly one genotype
+ *
+ * @param vc a non-null VariantContext
+ */
+ @Override
+ public void add(VariantContext vc) {
+ if ( vc == null ) throw new IllegalArgumentException("vc cannot be null");
+
+ if ( sampleName == null )
+ sampleName = vc.getGenotype(0).getSampleName();
+
+ if ( ! vc.hasGenotypes() ) {
+ throw new IllegalArgumentException("GVCF assumes that the VariantContext has genotypes");
+ } else if ( vc.getGenotypes().size() != 1 ) {
+ throw new IllegalArgumentException("GVCF assumes that the VariantContext has exactly one genotype but saw " + vc.getGenotypes().size());
+ } else {
+ if ( currentBlock != null && ! currentBlock.isContiguous(vc) ) {
+ // we've made a non-contiguous step (across interval, onto another chr), so finalize
+ emitCurrentBlock();
+ }
+
+ final Genotype g = vc.getGenotype(0);
+ if ( g.isHomRef() ) {
+ // create bands
+ final VariantContext maybeCompletedBand = addHomRefSite(vc, g);
+ if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand);
+ } else {
+ // g is variant, so flush the bands and emit vc
+ emitCurrentBlock();
+ nextAvailableStart = vc.getEnd();
+ underlyingWriter.add(vc);
+ }
+ }
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java
new file mode 100644
index 000000000..282e49217
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java
@@ -0,0 +1,169 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.utils.gvcf;
+
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.variant.variantcontext.Allele;
+import org.broadinstitute.variant.variantcontext.Genotype;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+import org.broadinstitute.variant.vcf.VCFHeaderLine;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Helper class for calculating a GQ band in the GVCF writer
+ *
+ * A band contains GQ and DP values for a contiguous stretch of hom-ref genotypes,
+ * and provides summary information about the entire block of genotypes.
+ *
+ * Genotypes within the HomRefBlock are restricted to hom-ref genotypes within a band of GQ scores
+ *
+ * User: depristo
+ * Date: 6/25/13
+ * Time: 9:41 AM
+ */
+final class HomRefBlock {
+ private final VariantContext startingVC;
+ int stop;
+ private final int minGQ, maxGQ;
+ private List GQs = new ArrayList<>(100);
+ private List DPs = new ArrayList<>(100);
+ private final Allele ref;
+
+ /**
+ * Create a new HomRefBlock
+ *
+ * @param startingVC the VariantContext that starts this band (for starting position information)
+ * @param minGQ the minGQ (inclusive) to use in this band
+ * @param maxGQ the maxGQ (exclusive) to use in this band
+ */
+ public HomRefBlock(final VariantContext startingVC, int minGQ, int maxGQ) {
+ if ( startingVC == null ) throw new IllegalArgumentException("startingVC cannot be null");
+ if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ);
+
+ this.startingVC = startingVC;
+ this.stop = getStart() - 1;
+ this.ref = startingVC.getReference();
+ this.minGQ = minGQ;
+ this.maxGQ = maxGQ;
+ }
+
+ /**
+ * Create a new HomRefBlock only for doing bounds checking
+ *
+ * @param minGQ the minGQ (inclusive) to use in this band
+ * @param maxGQ the maxGQ (exclusive) to use in this band
+ */
+ public HomRefBlock(int minGQ, int maxGQ) {
+ if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ);
+
+ this.startingVC = null;
+ this.stop = -1;
+ this.ref = null;
+ this.minGQ = minGQ;
+ this.maxGQ = maxGQ;
+ }
+
+ /**
+ * Add information from this Genotype to this band
+ * @param g a non-null Genotype with GQ and DP attributes
+ */
+ public void add(final int pos, final Genotype g) {
+ if ( g == null ) throw new IllegalArgumentException("g cannot be null");
+ if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field");
+ if ( ! g.hasDP() ) throw new IllegalArgumentException("g must have DP field");
+ if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop);
+
+ stop = pos;
+ GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission
+ DPs.add(g.getDP());
+ }
+
+ /**
+ * Is the GQ value within the bounds of this GQ (GQ >= minGQ && GQ < maxGQ)
+ * @param GQ the GQ value to test
+ * @return true if within bounds, false otherwise
+ */
+ public boolean withinBounds(final int GQ) {
+ return GQ >= minGQ && GQ < maxGQ;
+ }
+
+ /** Get the min GQ observed within this band */
+ public int getMinGQ() { return MathUtils.arrayMin(GQs); }
+ /** Get the median GQ observed within this band */
+ public int getMedianGQ() { return MathUtils.median(GQs); }
+ /** Get the min DP observed within this band */
+ public int getMinDP() { return MathUtils.arrayMin(DPs); }
+ /** Get the median DP observed within this band */
+ public int getMedianDP() { return MathUtils.median(DPs); }
+
+ protected int getGQUpperBound() { return maxGQ; }
+ protected int getGQLowerBound() { return minGQ; }
+
+ public boolean isContiguous(final VariantContext vc) {
+ return vc.getEnd() == getStop() + 1 && startingVC.getChr().equals(vc.getChr());
+ }
+
+ public VariantContext getStartingVC() { return startingVC; }
+ public int getStart() { return startingVC.getStart(); }
+ public int getStop() { return stop; }
+ public Allele getRef() { return ref; }
+ public int getSize() { return getStop() - getStart() + 1; }
+
+ @Override
+ public String toString() {
+ return "HomRefBlock{" +
+ "minGQ=" + minGQ +
+ ", maxGQ=" + maxGQ +
+ '}';
+ }
+
+ public VCFHeaderLine toVCFHeaderLine() {
+ return new VCFHeaderLine("GVCFBlock", "minGQ=" + getGQLowerBound() + "(inclusive),maxGQ=" + getGQUpperBound() + "(exclusive)");
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java
index e7e5cf0e1..2a435f10d 100644
--- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java
+++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java
@@ -46,11 +46,10 @@
package org.broadinstitute.sting.utils.haplotypeBAMWriter;
-import net.sf.samtools.*;
import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
@@ -67,17 +66,17 @@ import java.util.*;
* Time: 1:50 PM
*/
class AllHaplotypeBAMWriter extends HaplotypeBAMWriter {
- public AllHaplotypeBAMWriter(final SAMFileWriter bamWriter) {
- super(bamWriter);
+ public AllHaplotypeBAMWriter(final ReadDestination destination) {
+ super(destination);
}
/**
* {@inheritDoc}
*/
@Override
- public void writeReadsAlignedToHaplotypes(final List haplotypes,
+ public void writeReadsAlignedToHaplotypes(final Collection haplotypes,
final GenomeLoc paddedReferenceLoc,
- final List bestHaplotypes,
+ final Collection bestHaplotypes,
final Set calledHaplotypes,
final Map stratifiedReadMap) {
writeHaplotypesAsReads(haplotypes, new HashSet<>(bestHaplotypes), paddedReferenceLoc);
diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java
index 7206dd674..c298485f6 100644
--- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java
+++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java
@@ -68,17 +68,17 @@ import java.util.*;
* Time: 1:50 PM
*/
class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter {
- public CalledHaplotypeBAMWriter(final SAMFileWriter bamWriter) {
- super(bamWriter);
+ public CalledHaplotypeBAMWriter(final ReadDestination destination) {
+ super(destination);
}
/**
* {@inheritDoc}
*/
@Override
- public void writeReadsAlignedToHaplotypes(final List haplotypes,
+ public void writeReadsAlignedToHaplotypes(final Collection haplotypes,
final GenomeLoc paddedReferenceLoc,
- final List bestHaplotypes,
+ final Collection bestHaplotypes,
final Set calledHaplotypes,
final Map stratifiedReadMap) {
if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes
@@ -98,10 +98,8 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter {
// next, output the interesting reads for each sample aligned against one of the called haplotypes
for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) {
for ( final Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
- if ( entry.getKey().getMappingQuality() > 0 ) {
- final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes);
- writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative());
- }
+ final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes);
+ writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative());
}
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java
index 1afbeed63..509399fd9 100644
--- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java
+++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java
@@ -46,16 +46,18 @@
package org.broadinstitute.sting.utils.haplotypeBAMWriter;
-import net.sf.samtools.*;
+import net.sf.samtools.Cigar;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMTag;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path;
import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.haplotype.Haplotype;
-import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
import java.util.*;
@@ -75,8 +77,8 @@ public abstract class HaplotypeBAMWriter {
protected final static String READ_GROUP_ID = "ArtificialHaplotype";
protected final static String HAPLOTYPE_TAG = "HC";
- final SAMFileWriter bamWriter;
- final SAMFileHeader bamHeader;
+ final ReadDestination output;
+ boolean writeHaplotypesAsWell = true;
/**
* Possible modes for writing haplotypes to BAMs
@@ -104,27 +106,10 @@ public abstract class HaplotypeBAMWriter {
* @return a new HaplotypeBAMWriter
*/
public static HaplotypeBAMWriter create(final Type type, final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header) {
- if ( header == null ) throw new IllegalArgumentException("header cannot be null");
- if ( stingSAMWriter == null ) throw new IllegalArgumentException("writer cannot be null");
if ( type == null ) throw new IllegalArgumentException("type cannot be null");
- // prepare the bam header
- final SAMFileHeader bamHeader = new SAMFileHeader();
- bamHeader.setSequenceDictionary(header.getSequenceDictionary());
- bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
-
- // include the original read groups plus a new artificial one for the haplotypes
- final List readGroups = new ArrayList(header.getReadGroups());
- final SAMReadGroupRecord rg = new SAMReadGroupRecord(READ_GROUP_ID);
- rg.setSample("HC");
- rg.setSequencingCenter("BI");
- readGroups.add(rg);
- bamHeader.setReadGroups(readGroups);
-
- // TODO -- this will be a performance problem at high-scale
- stingSAMWriter.setPresorted(false);
- stingSAMWriter.writeHeader(bamHeader);
- return create(type, stingSAMWriter);
+ final ReadDestination toBam = new ReadDestination.ToBAM(stingSAMWriter, header, READ_GROUP_ID);
+ return create(type, toBam);
}
/**
@@ -134,16 +119,16 @@ public abstract class HaplotypeBAMWriter {
* may come in out of order during writing
*
* @param type the type of the writer we want to create
- * @param writer the destination, must not be null
+ * @param destination the destination, must not be null
* @return a new HaplotypeBAMWriter
*/
- public static HaplotypeBAMWriter create(final Type type, final SAMFileWriter writer) {
- if ( writer == null ) throw new IllegalArgumentException("writer cannot be null");
+ public static HaplotypeBAMWriter create(final Type type, final ReadDestination destination) {
+ if ( destination == null ) throw new IllegalArgumentException("writer cannot be null");
if ( type == null ) throw new IllegalArgumentException("type cannot be null");
switch ( type ) {
- case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(writer);
- case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(writer);
+ case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(destination);
+ case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(destination);
default: throw new IllegalArgumentException("Unknown type " + type);
}
}
@@ -154,11 +139,10 @@ public abstract class HaplotypeBAMWriter {
* Assumes that the header has been fully initialized with a single
* read group READ_GROUP_ID
*
- * @param bamWriter our output destination
+ * @param output our output destination
*/
- protected HaplotypeBAMWriter(SAMFileWriter bamWriter) {
- this.bamWriter = bamWriter;
- this.bamHeader = bamWriter.getFileHeader();
+ protected HaplotypeBAMWriter(final ReadDestination output) {
+ this.output = output;
}
/**
@@ -170,12 +154,18 @@ public abstract class HaplotypeBAMWriter {
* @param calledHaplotypes a list of the haplotypes at where actually called as non-reference
* @param stratifiedReadMap a map from sample -> likelihoods for each read for each of the best haplotypes
*/
- public abstract void writeReadsAlignedToHaplotypes(final List haplotypes,
+ public abstract void writeReadsAlignedToHaplotypes(final Collection haplotypes,
final GenomeLoc paddedReferenceLoc,
- final List bestHaplotypes,
+ final Collection bestHaplotypes,
final Set calledHaplotypes,
final Map stratifiedReadMap);
+ public void writeReadsAlignedToHaplotypes(final Collection haplotypes,
+ final GenomeLoc paddedReferenceLoc,
+ final Map stratifiedReadMap) {
+ writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, haplotypes, new HashSet<>(haplotypes), stratifiedReadMap);
+ }
+
/**
* Write out read aligned to haplotype to the BAM file
*
@@ -193,7 +183,7 @@ public abstract class HaplotypeBAMWriter {
final boolean isInformative) {
final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative);
if ( alignedToRef != null )
- bamWriter.addAlignment(alignedToRef);
+ output.add(alignedToRef);
}
/**
@@ -281,8 +271,9 @@ public abstract class HaplotypeBAMWriter {
protected void writeHaplotypesAsReads(final Collection haplotypes,
final Set bestHaplotypes,
final GenomeLoc paddedReferenceLoc) {
- for ( final Haplotype haplotype : haplotypes )
- writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype));
+ if ( isWriteHaplotypesAsWell() )
+ for ( final Haplotype haplotype : haplotypes )
+ writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype));
}
/**
@@ -295,7 +286,7 @@ public abstract class HaplotypeBAMWriter {
private void writeHaplotype(final Haplotype haplotype,
final GenomeLoc paddedRefLoc,
final boolean isAmongBestHaplotypes) {
- final GATKSAMRecord record = new GATKSAMRecord(bamHeader);
+ final GATKSAMRecord record = new GATKSAMRecord(output.getHeader());
record.setReadBases(haplotype.getBases());
record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef());
record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length));
@@ -307,6 +298,14 @@ public abstract class HaplotypeBAMWriter {
record.setReferenceIndex(paddedRefLoc.getContigIndex());
record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID);
record.setFlags(16);
- bamWriter.addAlignment(record);
+ output.add(record);
+ }
+
+ public boolean isWriteHaplotypesAsWell() {
+ return writeHaplotypesAsWell;
+ }
+
+ public void setWriteHaplotypesAsWell(boolean writeHaplotypesAsWell) {
+ this.writeHaplotypesAsWell = writeHaplotypesAsWell;
}
}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java
new file mode 100644
index 000000000..007b5402e
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java
@@ -0,0 +1,135 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.utils.haplotypeBAMWriter;
+
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMFileWriter;
+import net.sf.samtools.SAMReadGroupRecord;
+import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Utility class that allows us to easily create destinations for the HaplotypeBAMWriters
+ *
+ * User: depristo
+ * Date: 6/19/13
+ * Time: 10:19 AM
+ */
+public abstract class ReadDestination {
+ public abstract void add(final GATKSAMRecord read);
+
+ private final SAMFileHeader bamHeader;
+
+ public SAMFileHeader getHeader() {
+ return bamHeader;
+ }
+
+ protected ReadDestination(final SAMFileHeader header, final String readGroupID) {
+ // prepare the bam header
+ if ( header == null ) throw new IllegalArgumentException("header cannot be null");
+ bamHeader = new SAMFileHeader();
+ bamHeader.setSequenceDictionary(header.getSequenceDictionary());
+ bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
+
+ // include the original read groups plus a new artificial one for the haplotypes
+ final List readGroups = new ArrayList(header.getReadGroups());
+ final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupID);
+ rg.setSample("HC");
+ rg.setSequencingCenter("BI");
+ readGroups.add(rg);
+ bamHeader.setReadGroups(readGroups);
+ }
+
+ public static class ToBAM extends ReadDestination {
+ final SAMFileWriter bamWriter;
+
+ /**
+ * Create a ReadDestination that writes to a BAM file
+ */
+ public ToBAM(final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header, final String readGroupID) {
+ super(header, readGroupID);
+ if ( stingSAMWriter == null ) throw new IllegalArgumentException("writer cannot be null");
+
+ bamWriter = stingSAMWriter;
+ stingSAMWriter.setPresorted(false);
+ stingSAMWriter.writeHeader(getHeader());
+ }
+
+ @Override
+ public void add(GATKSAMRecord read) {
+ bamWriter.addAlignment(read);
+ }
+ }
+
+ public static class ToList extends ReadDestination {
+ final List reads = new LinkedList<>();
+
+ /**
+ * Create a ReadDestination that captures the output reads in a list of reads
+ */
+ public ToList(SAMFileHeader header, String readGroupID) {
+ super(header, readGroupID);
+ }
+
+ @Override
+ public void add(GATKSAMRecord read) {
+ reads.add(read);
+ }
+
+ /**
+ * Get the reads that have been written to this destination
+ * @return a non-null list of reads
+ */
+ public List getReads() {
+ return reads;
+ }
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java
index 184a2689d..49148c152 100644
--- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java
+++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java
@@ -59,6 +59,9 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
protected static final double INITIAL_CONDITION = Math.pow(2, 1020);
protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION);
+ // we divide e by 3 because the observed base could have come from any of the non-observed alleles
+ protected static final double TRISTATE_CORRECTION = 3.0;
+
private static final int matchToMatch = 0;
private static final int indelToMatch = 1;
private static final int matchToInsertion = 2;
@@ -146,7 +149,7 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
- QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
+ QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) );
}
}
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
index e7d7300ae..37dc7adba 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -172,6 +172,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("getting DB tag with HM3", spec);
}
+ @Test
+ public void testDBTagWithTwoComps() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf --comp:foo " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1,
+ Arrays.asList("6afbf05090ae139f53467cf6e0e71cf4"));
+ executeTest("getting DB tag with 2 comps", spec);
+ }
+
@Test
public void testNoQuals() {
WalkerTestSpec spec = new WalkerTestSpec(
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java
index 77c9f96c9..fd1f0de8a 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java
@@ -48,49 +48,24 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
-import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.junit.Assert;
+import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
+import java.io.File;
import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
import java.util.Random;
public class BiasedDownsamplingIntegrationTest extends WalkerTest {
- private final static String baseCommand1 = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
- private final static String baseCommand2 = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:1,000,000-5,000,000";
- private final static String baseCommand3 = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:4,000,000-5,000,000";
+ private final static String baseCommandUG = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:4,000,000-5,000,000";
+ private final static String baseCommandHC = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header -L 20:4,000,000-5,000,000" + " --useFilteredReadsForAnnotations";
+
private final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/";
- // --------------------------------------------------------------------------------------------------------------
- //
- // testing UnifiedGenotyper contamination down-sampling
- //
- // --------------------------------------------------------------------------------------------------------------
-
- @Test(enabled = false)
- public void testContaminationDownsamplingFlat() {
- WalkerTestSpec spec = new WalkerTestSpec(
- baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contamination 0.20", 1,
- Arrays.asList("1f9071466fc40f4c6a0f58ac8e9135fb"));
- executeTest("test contamination_percentage_to_filter 0.20", spec);
- }
-
- @Test(enabled = false)
- public void testContaminationDownsamplingFlatAndPerSample() {
- WalkerTestSpec spec = new WalkerTestSpec(
- baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_per_sample_file " + ArtificalBAMLocation + "NA12878.NA19240.contam.txt --contamination_fraction_to_filter 0.10", 1,
- Arrays.asList("53395814dd6990448a01a294ccd69bd2"));
- executeTest("test contamination_percentage_to_filter per-sample and .20 overall", spec);
- }
-
- @Test(enabled = false)
- public void testContaminationDownsamplingPerSampleOnly() {
- WalkerTestSpec spec = new WalkerTestSpec(
- baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contaminationFile " + ArtificalBAMLocation + "NA19240.contam.txt", 1,
- Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb"));
- executeTest("test contamination_percentage_to_filter per-sample", spec);
- }
-
// --------------------------------------------------------------------------------------------------------------
//
@@ -98,150 +73,49 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
- @Test(enabled = false)
+ @Test
private void testDefaultContamination() {
final String bam1 = "NA11918.with.1.NA12842.reduced.bam";
final String bam2 = "NA12842.with.1.NA11918.reduced.bam";
WalkerTestSpec spec = new WalkerTestSpec(
- baseCommand2 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s ", 1,
- Arrays.asList("e2e5a8dd313f8d7e382e7d49dfac59a2"));
- executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " with default downsampling.", spec);
+ baseCommandUG + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contamination .05 ", 1,
+ Arrays.asList("b13612312ff991cf40ddc44255e76ecd"));
+ executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " with .05 downsampling.", spec);
}
- private void testFlatContamination(final String bam1, final String bam2, final Double downsampling, final String md5) {
- WalkerTestSpec spec = new WalkerTestSpec(
- baseCommand2 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contamination " + downsampling.toString(), 1,
- Arrays.asList(md5));
- executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " downsampling " + downsampling.toString(), spec);
- }
-
- @Test(enabled = false)
- public void testFlatContaminationCase1() {
- testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "e2e5a8dd313f8d7e382e7d49dfac59a2");
- }
-
- @Test(enabled = false)
- public void testFlatContaminationCase2() {
- testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "549737002f98775fea8f46e7ea174dde");
- }
-
- @Test(enabled = false)
- public void testFlatContaminationCase3() {
- testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "529d82c2a33fcc303a5dc55de2d56979");
- }
-
- @Test(enabled = false)
- public void testFlatContaminationCase4() {
- testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.1, "b5689972fbb7d230a372ee5f0da1c6d7");
- }
-
- @Test(enabled = false)
- public void testFlatContaminationCase5() {
- testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.2, "9dceee2e921b53fbc1ce137a7e0b7b74");
- }
-
- @Test(enabled = false)
- public void testFlatContaminationCase6() {
- testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.3, "d6a74061033503af80dcaea065bfa075");
- }
-
- @Test(enabled = false)
- public void testFlatContaminationCase7() {
- testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "7d1b5efab58a1b8f9d99fcf5af82f15a");
- }
-
- @Test(enabled = false)
- public void testFlatContaminationCase8() {
- testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "a7f8d5c79626aff59d7f426f79d8816e");
- }
-
- @Test(enabled = false)
- public void testFlatContaminationCase9() {
- testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.3, "fcf482398b7c908e3e2d1e4d5da6377b");
- }
-
- private void testPerSampleContamination(String bam1, String bam2, String persampleFile, final String md5) {
- WalkerTestSpec spec = new WalkerTestSpec(
- baseCommand2 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contaminationFile " + persampleFile, 1,
- Arrays.asList(md5));
- executeTest("test contamination on Artificial Contamination (per-sample) on " + bam1 + " and " + bam2 + " with " + persampleFile, spec);
- }
-
- @Test(enabled = false)
- public void testPerSampleContaminationCase1() {
- testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.1.txt", "e00278527a294833259e9e411728e395");
- }
-
- @Test(enabled = false)
- public void testPerSampleContaminationCase2() {
- testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.2.txt", "a443e793f0b0e2ffce1b751634d706e2");
- }
-
- @Test(enabled = false)
- public void testPerSampleContaminationCase3() {
- testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.3.txt", "e11d83a7815ce757afbcf7689568cb25");
- }
-
- @Test(enabled = false)
- public void testPerSampleContaminationCase4() {
- testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.4.txt", "615042eeeffe042bd1c86279d34f80b6");
- }
-
- @Test(enabled = false)
- public void testPerSampleContaminationCase5() {
- testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.1.txt", "9bc99fc79ca34744bf26cb19ee4ef44d");
- }
-
- @Test(enabled = false)
- public void testPerSampleContaminationCase6() {
- testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.2.txt", "143626fe5fce765d6c997a64f058a813");
- }
-
- @Test(enabled = false)
- public void testPerSampleContaminationCase7() {
- testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.3.txt", "f2593674cef894eda4e0be9cf3158f57");
- }
-
- @Test(enabled = false)
- public void testPerSampleContaminationCase8() {
- testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.4.txt", "fb7ce0740767ae3896b3e552026da1e4");
- }
-
-
- private void testPerSampleEqualsFlat(final String bam1, final String bam2, final String persampleFile, final Double downsampling, final String md5) {
- final String command = baseCommand3 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s ";
-
- WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList(md5));
- final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
-
- rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
- executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec);
-
- spec = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList(md5));
-
- rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
- executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec);
-
- }
// verify that inputing a file with an effectively flat contamination level is equivalent to handing in a flat contamination level
- @Test(enabled = false)
- public void testPerSampleEqualsFlatContaminationCase1() {
- testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0, "");
+
+ @DataProvider(name="PerSampleEqualFlatContamBams")
+ public Object[][] makePerSampleEqualFlatContamBams() {
+ final List