Merge branch 'master' into st_fpga_hmm

Conflicts:
	protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
This commit is contained in:
sathibault 2013-07-15 08:17:32 -05:00
commit 0a8f75b953
77 changed files with 3309 additions and 1362 deletions

View File

@ -887,6 +887,27 @@
<fail message="No executable defined. Call a more specific packaging/release target, or define an executable manually" if="no.executable.defined" />
</target>
<!-- Guard target: verifies that both a bcel-*.jar and an ant-apache-bcel-*.jar are present in
     ${user.home}/.ant/lib. Each pathconvert leaves its property unset when the matching fileset
     is empty (setonempty="false"); bcel.installed is set only when both properties exist, and the
     build fails with installation instructions otherwise.
     NOTE(review): if ${user.home}/.ant/lib does not exist at all, the fileset itself may raise an
     error before the friendly message below is shown (confirm against the Ant version in use;
     erroronmissingdir="false" on the filesets would presumably avoid that). -->
<target name="require.bcel">
<fileset id="bcel.jar" dir="${user.home}/.ant/lib">
<include name="bcel-*.jar" />
</fileset>
<pathconvert refid="bcel.jar" property="bcel.jar.installed" setonempty="false" />
<fileset id="ant.bcel.jar" dir="${user.home}/.ant/lib">
<include name="ant-apache-bcel-*.jar" />
</fileset>
<pathconvert refid="ant.bcel.jar" property="ant.bcel.jar.installed" setonempty="false" />
<condition property="bcel.installed">
<and>
<isset property="bcel.jar.installed" />
<isset property="ant.bcel.jar.installed" />
</and>
</condition>
<fail unless="bcel.installed"
message="Required bcel libraries for GATK packaging not installed in ${user.home}/.ant/lib/${line.separator}The bcel jar can be found in the lib directory of a GATK clone after compiling, and the ant-apache-bcel jar can be downloaded from here: http://repo1.maven.org/maven2/ant/ant-apache-bcel/1.6.5/ant-apache-bcel-1.6.5.jar${line.separator}Please copy these two jar files to ${user.home}/.ant/lib/" />
</target>
<!-- Unzip all classes from their current locations and assemble them in a staging directory -->
<target name="stage" description="stage files for distribution">
<mkdir dir="${staging.dir}"/>
@ -910,7 +931,7 @@
<!-- Build a package consisting of all supporting files. Don't call this target directly. Call one of the specific packaging targets below -->
<target name="package" depends="require.clean,dist,stage,require.executable" description="bundle up an executable for distribution">
<target name="package" depends="require.clean,require.bcel,dist,stage,require.executable" description="bundle up an executable for distribution">
<mkdir dir="${package.output.dir}" />
<xslt destdir="${package.output.dir}" style="${package.xml.dir}/CreatePackager.xsl" useImplicitFileset="false">
<flattenmapper/>

View File

@ -1,22 +0,0 @@
Copyright (c) 2012 The Broad Institute
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Binary file not shown.

View File

@ -54,8 +54,6 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@ -85,9 +83,6 @@ public class StandardCallerArgumentCollection {
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
/**
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
* confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
@ -150,7 +145,7 @@ public class StandardCallerArgumentCollection {
*/
@Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION;
public static final double DEFAULT_CONTAMINATION_FRACTION = 0.05;
public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0;
/**
* This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples.
@ -199,7 +194,6 @@ public class StandardCallerArgumentCollection {
this.heterozygosity = SCAC.heterozygosity;
this.INDEL_HETEROZYGOSITY = SCAC.INDEL_HETEROZYGOSITY;
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
this.OutputMode = SCAC.OutputMode;
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;

View File

@ -320,18 +320,17 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
if (toolkit.getIntervals() != null)
intervalList.addAll(toolkit.getIntervals());
final boolean preSorted = true;
final boolean indexOnTheFly = true;
final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
if (nwayout) {
SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME);
writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, preSorted, indexOnTheFly, NO_PG_TAG, programRecord, true);
writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, false, indexOnTheFly, NO_PG_TAG, programRecord, true);
}
else {
writerToUse = out;
out.setPresorted(false);
if (!NO_PG_TAG) {
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME);
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME);
}
}
}

View File

@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.genotyper.DiploidGenotype;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -94,7 +95,7 @@ import static java.lang.Math.pow;
*/
public class DiploidSNPGenotypeLikelihoods implements Cloneable {
public final static double DEFAULT_PCR_ERROR_RATE = 1e-4;
public final static double DEFAULT_PCR_ERROR_RATE = FragmentUtils.DEFAULT_PCR_ERROR_RATE;
protected final static int FIXED_PLOIDY = 2;
protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;

View File

@ -214,6 +214,9 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
/**
* Create a new UAC with defaults for all UAC arguments
*/
@ -262,6 +265,8 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES;
this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO;
this.pairHMM = uac.pairHMM;
this.OutputMode = uac.OutputMode;
this.annotateAllSitesWithPLs = uac.annotateAllSitesWithPLs;
// todo- arguments to remove
this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES;

View File

@ -46,105 +46,44 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
/**
* Fast approach to building a DeBruijnGraph
*
* Follows the model:
*
* for each X that has bases for the final graph:
* addKmer pair (single kmer with kmer size + 1 spanning the pair)
*
* flushKmersToGraph
* Result of assembling, with the resulting graph and status
*
* User: depristo
* Date: 4/7/13
* Time: 4:14 PM
* Date: 7/1/13
* Time: 5:35 PM
*/
public class DeBruijnGraphBuilder {
/** The size of the kmer graph we want to build */
private final int kmerSize;
/** The graph we're going to add kmers to */
private final DeBruijnGraph graph;
/** keeps counts of all kmer pairs added since the last flush */
private final KMerCounter counter;
public class AssemblyResult {
private final Status status;
private final SeqGraph graph;
/**
* Create a new builder that will write out kmers to graph
*
* @param graph a non-null graph that can contain already added kmers
* Create a new assembly result
* @param status the status, cannot be null
* @param graph the resulting graph of the assembly, can only be null if result is failed
*/
public DeBruijnGraphBuilder(final DeBruijnGraph graph) {
if ( graph == null ) throw new IllegalArgumentException("Graph cannot be null");
this.kmerSize = graph.getKmerSize();
public AssemblyResult(final Status status, final SeqGraph graph) {
if ( status == null ) throw new IllegalArgumentException("status cannot be null");
if ( status != Status.FAILED && graph == null ) throw new IllegalArgumentException("graph is null but status is " + status);
this.status = status;
this.graph = graph;
this.counter = new KMerCounter(kmerSize + 1);
}
/**
* The graph we're building
* @return a non-null graph
*/
public DeBruijnGraph getGraph() {
return graph;
}
public Status getStatus() { return status; }
public SeqGraph getGraph() { return graph; }
/**
* The kmer size of our graph
* @return positive integer
* Status of the assembly result
*/
public int getKmerSize() {
return kmerSize;
}
/**
 * Convenience wrapper around {@link #addKmerPair(Kmer, int)} that cuts a kmer pair out of a larger
 * sequence of bases. The pair is the single stretch of getKmerSize() + 1 bases beginning at start,
 * i.e. it spans the kmer starting at start and the kmer starting at start + 1.
 *
 * @param sequence a non-null sequence of bases from which we want to extract a pair of kmers
 * @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 1 - getKmerSize()
 * @param multiplicity what's the multiplicity of the edge between these two kmers
 * @throws IllegalArgumentException if sequence is null, start is negative, or the pair would run past the end of sequence
 */
public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final int multiplicity ) {
    if ( sequence == null )
        throw new IllegalArgumentException("Sequence cannot be null");
    if ( start < 0 )
        throw new IllegalArgumentException("start must be >= 0 but got " + start);

    // a kmer pair is one base longer than a single kmer
    final int pairLength = getKmerSize() + 1;
    if ( start + pairLength > sequence.length )
        throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length);

    addKmerPair(new Kmer(sequence, start, pairLength), multiplicity);
}
/**
 * Add a single kmer pair to this builder
 *
 * The pair is only counted here; nothing is written to the graph by this call.
 *
 * @param kmerPair a kmer pair is a single kmer that has kmerSize + 1 bp, where 0 -> kmersize and 1 -> kmersize + 1
 *                 will have an edge added to this
 * @param multiplicity the desired multiplicity of this edge
 * @throws IllegalArgumentException if kmerPair is not exactly kmerSize + 1 bases long
 */
public void addKmerPair(final Kmer kmerPair, final int multiplicity) {
    // compute the expected length as an int first: writing "..." + kmerSize + 1 + "..." concatenates
    // left to right and would print e.g. "251" instead of 26 for kmerSize == 25
    final int expectedLength = kmerSize + 1;
    if ( kmerPair.length() != expectedLength ) throw new IllegalArgumentException("kmer pair must be of length kmerSize + 1 = " + expectedLength + " but got " + kmerPair.length());
    counter.addKmer(kmerPair, multiplicity);
}
/**
* Flushes the currently added kmers to the graph
*
* After this function is called the builder is reset to an empty state
*
* This flushing is expensive, so many kmers should be added to the builder before flushing. The most
* efficient workflow is to add all of the kmers of a particular class (all ref bases, or all read bases)
* then and do one flush when completed
*
* @param addRefEdges should the kmers present in the builder be added to the graph with isRef = true for the edges?
*/
public void flushKmersToGraph(final boolean addRefEdges) {
for ( final KMerCounter.CountedKmer countedKmer : counter.getCountedKmers() ) {
final byte[] first = countedKmer.getKmer().subKmer(0, kmerSize).bases();
final byte[] second = countedKmer.getKmer().subKmer(1, kmerSize).bases();
graph.addKmersToGraph(first, second, addRefEdges, countedKmer.getCount());
}
counter.clear();
public enum Status {
/** Something went wrong, and we couldn't produce a meaningful graph */
FAILED,
/** Assembly succeeded, but graph degenerated into just the reference sequence */
JUST_ASSEMBLED_REFERENCE,
/** Assembly succeeded, and the graph has some meaningful structure */
ASSEMBLED_SOME_VARIATION
}
}

View File

@ -1,269 +0,0 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
/**
* DeBruijn assembler for the HaplotypeCaller
*
* User: ebanks, rpoplin
* Date: Mar 14, 2011
*/
public class DeBruijnAssembler extends LocalAssemblyEngine {
private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class);
// TODO -- this number is very low, and limits our ability to explore low-frequency variants. It should
// TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where
// TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases
private final static int NUM_PATHS_PER_GRAPH = 25;
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
private static final int GRAPH_KMER_STEP = 6;
private static final int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
private final int minKmer;
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
protected DeBruijnAssembler() {
this(25, -1);
}
public DeBruijnAssembler(final int minKmer, final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) {
super(NUM_PATHS_PER_GRAPH);
this.minKmer = minKmer;
this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
}
@Override
protected List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
final List<SeqGraph> graphs = new LinkedList<>();
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
if( maxKmer < minKmer) {
// Reads are too small for assembly so don't try to create any assembly graphs
return Collections.emptyList();
}
// create the graph for each possible kmer
for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) {
if ( debugGraphTransformations && kmer > onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms)
continue;
if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
DeBruijnGraph graph = createGraphFromSequences(reads, kmer, refHaplotype, activeAlleleHaplotypes);
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
// do a series of steps to clean up the raw assembly graph to make it analysis-ready
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor);
if ( shouldErrorCorrectKmers() ) {
throw new UserException("Error correction no longer supported because of the " +
"incredibly naive way this was implemented. The command line argument remains because some" +
" future subsystem will actually go and error correct the reads");
}
final SeqGraph seqGraph = cleanupSeqGraph(graph.convertToSequenceGraph());
if ( seqGraph != null ) { // if the graph contains interesting variation from the reference
graphs.add(seqGraph);
if ( debugGraphTransformations ) // we only want to use one graph size
break;
}
}
}
return graphs;
}
@Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"})
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int kmerLength, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
final DeBruijnGraph graph = new DeBruijnGraph(kmerLength);
final DeBruijnGraphBuilder builder = new DeBruijnGraphBuilder(graph);
// First pull kmers from the reference haplotype and add them to the graph
if ( ! addReferenceKmersToGraph(builder, refHaplotype.getBases()) )
// something went wrong, so abort right now with a null graph
return null;
// add the artificial GGA haplotypes to the graph
if ( ! addGGAKmersToGraph(builder, activeAlleleHaplotypes) )
// something went wrong, so abort right now with a null graph
return null;
// now go through the graph already seeded with the reference sequence and add the read kmers to it
if ( ! addReadKmersToGraph(builder, reads) )
// some problem was detected adding the reads to the graph, return null to indicate we failed
return null;
graph.cleanNonRefPaths();
return graph;
}
/**
* Add the high-quality kmers from the artificial GGA haplotypes to the graph
*
* @param builder a debruijn graph builder to add the read kmers to
* @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
*/
protected boolean addGGAKmersToGraph(final DeBruijnGraphBuilder builder, final List<Haplotype> activeAlleleHaplotypes) {
final int kmerLength = builder.getKmerSize();
for( final Haplotype haplotype : activeAlleleHaplotypes ) {
final int end = haplotype.length() - kmerLength;
for( int start = 0; start < end; start++ ) {
builder.addKmerPairFromSeqToGraph( haplotype.getBases(), start, GGA_MODE_ARTIFICIAL_COUNTS );
}
}
// always returns true now, but it's possible that we'd add kmers and decide we don't like the graph in some way
return true;
}
/**
* Add the high-quality kmers from the reads to the graph
*
* @param builder a debruijn graph builder to add the read kmers to
* @param reads a non-null list of reads whose kmers we want to add to the graph
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
*/
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads) {
final int kmerLength = builder.getKmerSize();
// Next pull kmers out of every read and throw them on the graph
for( final GATKSAMRecord read : reads ) {
final byte[] sequence = read.getReadBases();
final byte[] qualities = read.getBaseQualities();
final int[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced
if ( sequence.length > kmerLength + KMER_OVERLAP ) {
int lastGood = -1; // the index of the last good base we've seen
for( int end = 0; end < sequence.length; end++ ) {
if ( qualities[end] < minBaseQualityToUseInAssembly ) {
lastGood = -1; // reset the last good base
} else if ( lastGood == -1 ) {
lastGood = end; // we're at a good base, the last good one is us
} else if ( end - kmerLength >= lastGood ) {
// end - kmerLength (the start) is after the lastGood base, so that kmer is good
final int start = end - kmerLength;
// how many observations of this kmer have we seen? A normal read counts for 1, but
// a reduced read might imply a higher multiplicity for our the edge
int countNumber = 1;
if ( read.isReducedRead() ) {
// compute mean number of reduced read counts in current kmer span
// precise rounding can make a difference with low consensus counts
// TODO -- optimization: should extend arrayMax function to take start stop values
countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, start, end));
}
builder.addKmerPairFromSeqToGraph(sequence, start, countNumber);
}
}
}
}
builder.flushKmersToGraph(false);
// always returns true now, but it's possible that we'd add reads and decide we don't like the graph in some way
return true;
}
/**
* Add the kmers from the reference sequence to the DeBruijnGraph
*
* @param builder the graph to add the reference kmers to. Must be empty
* @param refSequence the reference sequence from which we'll get our kmers
* @return true if we succeeded in creating a good graph from the reference sequence, false otherwise
*/
protected boolean addReferenceKmersToGraph(final DeBruijnGraphBuilder builder, final byte[] refSequence) {
if ( builder == null ) throw new IllegalArgumentException("graph cannot be null");
if ( builder.getGraph().vertexSet().size() != 0 )
throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + builder.getGraph().vertexSet().size() + " vertices in it already: " + builder.getGraph());
if ( refSequence == null ) throw new IllegalArgumentException("refSequence cannot be null");
final int kmerLength = builder.getKmerSize();
if( refSequence.length < kmerLength + KMER_OVERLAP ) {
// not enough reference sequence to build a kmer graph of this length, return null
return false;
}
final int kmersInSequence = refSequence.length - kmerLength + 1;
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
builder.addKmerPairFromSeqToGraph(refSequence, iii, 1);
}
builder.flushKmersToGraph(true);
// we expect that every kmer in the sequence is unique, so that the graph has exactly kmersInSequence vertices
if ( builder.getGraph().vertexSet().size() != kmersInSequence ) {
if( debug ) logger.info("Cycle detected in reference graph for kmer = " + kmerLength + " ...skipping");
return false;
}
return true;
}
@Override
public String toString() {
return "DeBruijnAssembler{" +
"minKmer=" + minKmer +
'}';
}
}

View File

@ -56,6 +56,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.EventMap;
@ -166,6 +167,8 @@ public class GenotypingEngine {
// Walk along each position in the key set and create each event to be outputted
final Set<Haplotype> calledHaplotypes = new HashSet<>();
final List<VariantContext> returnCalls = new ArrayList<>();
final Map<String, Double> emptyDownSamplingMap = new DefaultHashMap<>(0.0);
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
@ -197,13 +200,13 @@ public class GenotypingEngine {
logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
}
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION );
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().getSampleContamination() );
final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC );
final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL);
if( call != null ) {
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap :
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) );
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, emptyDownSamplingMap ) );
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call);
@ -406,7 +409,7 @@ public class GenotypingEngine {
// BUGBUG: ugh, too complicated
protected Map<String, PerReadAlleleLikelihoodMap> convertHaplotypeReadMapToAlleleReadMap( final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
final Map<Allele, List<Haplotype>> alleleMapper,
final double downsamplingFraction ) {
final Map<String,Double> perSampleDownsamplingFraction ) {
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<>();
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
@ -423,7 +426,7 @@ public class GenotypingEngine {
perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
}
}
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); // perform contamination downsampling
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(perSampleDownsamplingFraction.get(haplotypeReadMapEntry.getKey())); // perform contamination downsampling
alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
}

View File

@ -47,9 +47,6 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileWriter;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
@ -58,9 +55,10 @@ import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -73,21 +71,25 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileState;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.gvcf.GVCFWriter;
import org.broadinstitute.sting.utils.haplotype.*;
import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
@ -240,22 +242,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@ArgumentCollection
private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
// -----------------------------------------------------------------------------------------------
// arguments to control internal behavior of the debruijn assembler
// -----------------------------------------------------------------------------------------------
@Advanced
@Argument(fullName="useDebruijnAssembler", shortName="useDebruijnAssembler", doc="If specified, we will use the old DeBruijn assembler. Depreciated as of 2.6", required = false)
protected boolean useDebruijnAssembler = false;
@Advanced
@Argument(fullName="minKmerForDebruijnAssembler", shortName="minKmerForDebruijnAssembler", doc="Minimum kmer length to use in the debruijn assembly graph", required = false)
protected int minKmerForDebruijnAssembler = 11;
@Advanced
@Argument(fullName="onlyUseKmerSizeForDebruijnAssembler", shortName="onlyUseKmerSizeForDebruijnAssembler", doc="If specified, we will only build kmer graphs with this kmer size in the debruijn", required = false)
protected int onlyUseKmerSizeForDebruijnAssembler = -1;
// -----------------------------------------------------------------------------------------------
// arguments to control internal behavior of the read threading assembler
// -----------------------------------------------------------------------------------------------
@ -298,7 +284,62 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// -----------------------------------------------------------------------------------------------
@Advanced
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
@Argument(fullName="emitRefConfidence", shortName="ERC", doc="Emit experimental reference confidence scores", required = false)
protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE;
/**
 * The value type of the emitRefConfidence (ERC) argument: selects whether, and in
 * what form, reference confidence is emitted.
 */
public enum ReferenceConfidenceMode {
// the declared default of the emitRefConfidence argument
NONE,
// NOTE(review): presumably emits one reference confidence record per base pair -- confirm
BP_RESOLUTION,
// in this mode the VCF writer is wrapped in a GVCFWriter configured with GVCFGQBands
GVCF
}
/**
 * The GQ partition intervals
 *
 * Should be a non-empty list of boundaries. For example, suppose this variable is
 *
 * [A, B, C]
 *
 * We would partition our hom-ref sites into the following bands:
 *
 * X < A
 * A <= X < B
 * B <= X < C
 * X >= C
 *
 * Following that rule, the default boundaries (1, 10, 20, 30, 40, 50) give the
 * following GQ blocks (lower bound inclusive, upper bound exclusive):
 *
 * [0, 0]
 * [1, 10)
 * [10, 20)
 * [20, 30)
 * [30, 40)
 * [40, 50)
 * [50, 99]
 *
 * Note that in the GATK GQ values are capped at 99.
 */
@Advanced
@Argument(fullName="GVCFGQBands", shortName="GQB", doc="GQ thresholds for reference confidence bands", required = false)
protected List<Integer> GVCFGQBands = Arrays.asList(1, 10, 20, 30, 40, 50);
/**
* This parameter determines the maximum size of an indel considered as potentially segregating in the
* reference model. It is used to eliminate reads from being indel informative at a site, and determines
* by that mechanism the certainty in the reference base. Conceptually, setting this parameter to
* X means that each informative read is consistent with any indel of size < X being present at a specific
* position in the genome, given its alignment to the reference.
*/
@Advanced
@Argument(fullName="indelSizeToEliminateInRefModel", shortName="ERCIS", doc="The size of an indel to check for in the reference model", required = false)
protected int indelSizeToEliminateInRefModel = 10;
// -----------------------------------------------------------------------------------------------
// general advanced arguments to control haplotype caller behavior
// -----------------------------------------------------------------------------------------------
@Advanced
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with < X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 2;
@Advanced
@ -419,6 +460,27 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false)
protected int minObservationsForKmerToBeSolid = 20;
/**
* the maximum extent into the full active region extension that we're willing to go in genotyping our events
*/
@Hidden
@Argument(fullName="maxDiscARExtension", shortName="maxDiscARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for discovery", required=false)
protected int MAX_DISCOVERY_ACTIVE_REGION_EXTENSION = 25;
@Hidden
@Argument(fullName="maxGGAARExtension", shortName="maxGGAARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for GGA mode", required=false)
protected int MAX_GGA_ACTIVE_REGION_EXTENSION = 300;
/**
* Include at least this many bases around an event for calling it
*/
@Hidden
@Argument(fullName="paddingAroundIndels", shortName="paddingAroundIndels", doc = "Include at least this many bases around an event for calling indels", required=false)
protected int PADDING_AROUND_OTHERS_FOR_CALLING = 150;
@Hidden
@Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false)
protected int PADDING_AROUND_SNPS_FOR_CALLING = 20;
// -----------------------------------------------------------------------------------------------
// done with Haplotype caller parameters
@ -445,14 +507,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// reference base padding size
private static final int REFERENCE_PADDING = 500;
// include at least this many bases around an event for calling it
private final static int PADDING_AROUND_SNPS_FOR_CALLING = 20;
private final static int PADDING_AROUND_OTHERS_FOR_CALLING = 150;
// the maximum extent into the full active region extension that we're willing to go in genotyping our events
private final static int MAX_DISCOVERY_ACTIVE_REGION_EXTENSION = 25;
private final static int MAX_GGA_ACTIVE_REGION_EXTENSION = 100;
private ActiveRegionTrimmer trimmer = null;
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
@ -470,6 +524,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
private final static Allele FAKE_ALT_ALLELE = Allele.create("<FAKE_ALT>", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
ReferenceConfidenceModel referenceConfidenceModel = null;
//---------------------------------------------------------------------------------------------------------------
//
// initialize
@ -488,6 +544,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
final int nSamples = samples.size();
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
// HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine // TODO -- why is this?
UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
@ -501,14 +560,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
simpleUAC.exactCallsLog = null;
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
// Currently, per-sample contamination level is only implemented for UG
if( UAC.CONTAMINATION_FRACTION_FILE !=null) {
throw new UserException("Per-Sample contamination level not supported in Haplotype Caller at this point");
UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger));
}
// when we do implement per-sample contamination for HC, this will probably be needed.
// UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, samples, logger));
// initialize the output VCF header
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
@ -532,6 +587,19 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// where the filters are used. For example, in emitting all sites the lowQual field is used
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel);
if ( emitReferenceConfidence() ) {
if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently");
headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines());
if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) {
try {
vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands);
} catch ( IllegalArgumentException e ) {
throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage());
}
}
}
vcfWriter.writeHeader(new VCFHeader(headerInfo, samples));
try {
@ -543,9 +611,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// create and setup the assembler
final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
assemblyEngine = useDebruijnAssembler
? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
: new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples);
assemblyEngine = new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples);
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
@ -602,7 +668,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Override
public EnumSet<ActiveRegionReadState> desiredReadStates() {
if ( includeUnmappedReads ) {
throw new UserException.BadArgumentValue("includeUmappedReads", "is not yet functional");
throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional");
// return EnumSet.of(
// ActiveRegionReadState.PRIMARY,
// ActiveRegionReadState.NONPRIMARY,
@ -636,38 +702,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// if we don't have any data, just abort early
return new ActivityProfileState(ref.getLocus(), 0.0);
final List<Allele> noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied
noCall.add(Allele.NO_CALL);
final List<Allele> noCall = Collections.singletonList(Allele.NO_CALL); // used to noCall all genotypes until the exact model is applied
final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size());
final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage();
for( final Map.Entry<String, AlignmentContext> sample : splitContexts.entrySet() ) {
final double[] genotypeLikelihoods = new double[3]; // ref versus non-ref (any event)
Arrays.fill(genotypeLikelihoods, 0.0);
for( final PileupElement p : sample.getValue().getBasePileup() ) {
final byte qual = p.getQual();
if( p.isDeletion() || qual > (byte) 18) {
int AA = 0; final int AB = 1; int BB = 2;
if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
AA = 2;
BB = 0;
if( p.isNextToSoftClip() ) {
averageHQSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28));
}
}
genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual);
genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
}
}
final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), (byte) 18, averageHQSoftClips).genotypeLikelihoods;
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
}
final List<Allele> alleles = new ArrayList<>();
alleles.add( FAKE_REF_ALLELE );
alleles.add( FAKE_ALT_ALLELE );
final List<Allele> alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE);
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
@ -687,7 +731,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
return NO_CALLS;
if( !originalActiveRegion.isActive() ) { return NO_CALLS; } // Not active so nothing to do!
if( !originalActiveRegion.isActive() ) {
// Not active so nothing to do!
return referenceModelForNoVariation(originalActiveRegion, true);
}
final List<VariantContext> activeAllelesToGenotype = new ArrayList<>();
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
@ -697,23 +744,30 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
}
// No alleles found in this region so nothing to do!
if ( activeAllelesToGenotype.isEmpty() ) { return NO_CALLS; }
if ( activeAllelesToGenotype.isEmpty() ) { return referenceModelForNoVariation(originalActiveRegion, true); }
} else {
if( originalActiveRegion.size() == 0 ) { return NO_CALLS; } // No reads here so nothing to do!
// No reads here so nothing to do!
if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); }
}
// run the local assembler, getting back a collection of information on how we should proceed
final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
// abort early if something is out of the acceptable range
if( ! assemblyResult.isVariationPresent() ) { return NO_CALLS; } // only the reference haplotype remains so nothing else to do!
if( ! assemblyResult.isVariationPresent() ) {
return referenceModelForNoVariation(originalActiveRegion, false);
} // only the reference haplotype remains so nothing else to do!
if (dontGenotype) return NO_CALLS; // user requested we not proceed
// filter out reads from genotyping which fail mapping quality based criteria
final Collection<GATKSAMRecord> filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping );
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
if( assemblyResult.regionForGenotyping.size() == 0 ) { return NO_CALLS; } // no reads remain after filtering so nothing else to do!
if( assemblyResult.regionForGenotyping.size() == 0 ) {
// no reads remain after filtering so nothing else to do!
return referenceModelForNoVariation(originalActiveRegion, false);
}
// evaluate each sample's reads against all haplotypes
//logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
@ -738,7 +792,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// TODO -- must disable if we are doing NCT, or set the output type of ! presorted
if ( bamWriter != null ) {
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc,
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(
assemblyResult.haplotypes,
assemblyResult.paddedReferenceLoc,
assemblyResult.haplotypes,
calledHaplotypes.getCalledHaplotypes(),
stratifiedReadMap);
@ -746,7 +802,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
return calledHaplotypes.getCalls();
if ( emitReferenceConfidence() ) {
return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getRefHaplotype(),
calledHaplotypes.getCalledHaplotypes(), assemblyResult.paddedReferenceLoc, assemblyResult.regionForGenotyping,
stratifiedReadMap, calledHaplotypes.getCalls());
} else {
return calledHaplotypes.getCalls();
}
}
private final static class AssemblyResult {
@ -755,6 +817,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
final byte[] fullReferenceWithPadding;
final GenomeLoc paddedReferenceLoc;
final boolean variationPresent;
final Haplotype refHaplotype;
private AssemblyResult(List<Haplotype> haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc, boolean variationPresent) {
this.haplotypes = haplotypes;
@ -762,6 +825,21 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
this.fullReferenceWithPadding = fullReferenceWithPadding;
this.paddedReferenceLoc = paddedReferenceLoc;
this.variationPresent = variationPresent;
Haplotype firstRefHaplotype = null;
for ( final Haplotype h : haplotypes ) {
if ( h.isReference() ) {
if ( firstRefHaplotype != null ) throw new IllegalArgumentException("Found two haplotypes marked as reference " + firstRefHaplotype + " and " + h);
firstRefHaplotype = h;
}
}
if ( firstRefHaplotype == null ) throw new IllegalArgumentException("Couldn't find a reference haplotype in " + haplotypes);
this.refHaplotype = firstRefHaplotype;
}
/**
 * Get the reference haplotype of this assembly result.
 *
 * @return the one haplotype flagged as reference among those handed to the constructor;
 *         never null, since the constructor throws when no reference haplotype is found
 */
public Haplotype getRefHaplotype() {
    return this.refHaplotype;
}
public boolean isVariationPresent() {
@ -780,7 +858,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
*/
protected AssemblyResult assembleReads(final ActiveRegion activeRegion, final List<VariantContext> activeAllelesToGenotype) {
// Create the reference haplotype which is the bases from the reference that make up the active region
finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails
finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails
final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING);
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
@ -793,7 +871,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
try {
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector );
if ( ! dontTrimActiveRegions ) {
if ( ! emitReferenceConfidence() && ! dontTrimActiveRegions ) {
return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc);
} else {
// we don't want to trim active regions, so go ahead and use the old one
@ -819,12 +897,54 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
* @return a non-null haplotype
*/
private Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final GenomeLoc paddedReferenceLoc) {
final Haplotype refHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true);
refHaplotype.setAlignmentStartHapwrtRef(activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart());
final Cigar c = new Cigar();
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
refHaplotype.setCigar(c);
return refHaplotype;
return ReferenceConfidenceModel.createReferenceHaplotype(activeRegion, activeRegion.getActiveRegionReference(referenceReader), paddedReferenceLoc);
}
/**
 * Build the result to emit for an active region that carries no variation
 * (the region is not active, or it assembled to just the reference).
 *
 * When reference confidence is not being emitted the result is NO_CALLS. Otherwise the
 * region is scored against a lone reference haplotype spanning its extended location,
 * with every read mapped to that reference.
 *
 * @param region the region to return a no-variation result for
 * @param needsToBeFinalized whether the region must be finalized before computing the ref model
 *                           (should be false if that has already been done)
 * @return a list of variant contexts (can be empty) to emit for this ref region
 */
private List<VariantContext> referenceModelForNoVariation(final ActiveRegion region, final boolean needsToBeFinalized) {
    // nothing to model unless the user asked for reference confidence
    if ( ! emitReferenceConfidence() ) {
        return NO_CALLS;
    }

    if ( needsToBeFinalized ) {
        finalizeActiveRegion(region);
    }
    filterNonPassingReads(region); // TODO -- remove when filtering is done in finalizeActiveRegion

    // the extended location of the region doubles as the padded reference location here
    final GenomeLoc extendedLoc = region.getExtendedLoc();
    final Haplotype reference = createReferenceHaplotype(region, extendedLoc);
    final List<Haplotype> referenceOnly = Collections.singletonList(reference);
    final Map<String, PerReadAlleleLikelihoodMap> readsOnReference =
            createDummyStratifiedReadMap(reference, samplesList, region);

    return referenceConfidenceModel.calculateRefConfidence(
            reference,
            referenceOnly,
            extendedLoc,
            region,
            readsOnReference,
            Collections.<VariantContext>emptyList());
}
/**
 * Build a per-sample likelihood map in which every read of the region supports only the
 * reference allele, with a log10 likelihood of 0.
 *
 * @param refHaplotype a non-null reference haplotype
 * @param samples a list of all samples
 * @param region the active region containing reads
 * @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref
 */
public static Map<String, PerReadAlleleLikelihoodMap> createDummyStratifiedReadMap(final Haplotype refHaplotype,
                                                                                  final List<String> samples,
                                                                                  final ActiveRegion region) {
    final Allele reference = Allele.create(refHaplotype, true);
    final Map<String, PerReadAlleleLikelihoodMap> bySample = new LinkedHashMap<>(1);
    final Map<String, List<GATKSAMRecord>> readsBySample = splitReadsBySample(samples, region.getReads());

    for ( final Map.Entry<String, List<GATKSAMRecord>> sampleReads : readsBySample.entrySet() ) {
        final PerReadAlleleLikelihoodMap refOnlyLikelihoods = new PerReadAlleleLikelihoodMap();
        final List<GATKSAMRecord> reads = sampleReads.getValue();
        for ( final GATKSAMRecord read : reads ) {
            // log10 likelihood of 0.0 for the reference allele
            refOnlyLikelihoods.add(read, reference, 0.0);
        }
        bySample.put(sampleReads.getKey(), refOnlyLikelihoods);
    }

    return bySample;
}
/**
@ -917,6 +1037,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
/**
 * Release resources at the end of the traversal and log how many active regions were processed.
 *
 * @param result the value reported in the log message as the number of active regions assembled
 */
@Override
public void onTraversalDone(Integer result) {
    final boolean wrappedInGVCFWriter = emitReferenceConfidence == ReferenceConfidenceMode.GVCF;
    if ( wrappedInGVCFWriter ) {
        // GROSS -- engine forces us to close our own VCF writer since we wrapped it
        final GVCFWriter gvcfWriter = (GVCFWriter) vcfWriter;
        gvcfWriter.close(false);
    }

    referenceConfidenceModel.close();
    likelihoodCalculationEngine.close();

    logger.info("Ran local assembly on " + result + " active regions");
}
@ -933,28 +1055,28 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// Loop through the reads hard clipping the adaptor and low quality tails
final List<GATKSAMRecord> readsToUse = new ArrayList<>(activeRegion.getReads().size());
for( final GATKSAMRecord myRead : activeRegion.getReads() ) {
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
GATKSAMRecord clippedRead;
if (errorCorrectReads)
clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION );
else if (useLowQualityBasesForAssembly)
clippedRead = postAdapterRead;
else // default case: clip low qual ends of reads
clippedRead= ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
GATKSAMRecord clippedRead;
if (errorCorrectReads)
clippedRead = ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION );
else if (useLowQualityBasesForAssembly)
clippedRead = myRead;
else // default case: clip low qual ends of reads
clippedRead= ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY );
if ( dontUseSoftClippedBases ) {
// uncomment to remove hard clips from consideration at all
clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
} else {
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
// TODO -- reference haplotype start must be removed
clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
}
if ( dontUseSoftClippedBases || ! ReadUtils.hasWellDefinedFragmentSize(clippedRead) ) {
// remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all
clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
} else {
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
// TODO -- reference haplotype start must be removed
clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
}
clippedRead = ( clippedRead.getReadUnmappedFlag() ? clippedRead : ReadClipper.hardClipAdaptorSequence( clippedRead ) );
if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) {
clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() );
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
//logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd());
@ -963,8 +1085,15 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
}
// TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code.
final List<GATKSAMRecord> downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart);
// handle overlapping read pairs from the same fragment
cleanOverlappingReadPairs(downsampledReads);
activeRegion.clearReads();
activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart));
activeRegion.addAll(downsampledReads);
}
private Set<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
@ -985,6 +1114,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
/**
 * Partition the given reads by sample using the enclosing class's own sample list.
 *
 * This is a convenience overload: it simply forwards to the static two-argument
 * splitReadsBySample, passing {@code samplesList} as the list of samples.
 *
 * @param reads the reads to partition
 * @return whatever the static overload returns for {@code samplesList} and {@code reads}
 *         (presumably a map from sample name to that sample's reads -- confirm against that overload)
 */
private Map<String, List<GATKSAMRecord>> splitReadsBySample( final Collection<GATKSAMRecord> reads ) {
return splitReadsBySample(samplesList, reads);
}
public static Map<String, List<GATKSAMRecord>> splitReadsBySample( final List<String> samplesList, final Collection<GATKSAMRecord> reads ) {
final Map<String, List<GATKSAMRecord>> returnMap = new HashMap<>();
for( final String sample : samplesList) {
List<GATKSAMRecord> readList = returnMap.get( sample );
@ -1000,5 +1133,24 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
return returnMap;
}
/**
 * Tells whether reference confidence output is enabled in any form.
 *
 * @return true unless the configured reference confidence mode is NONE
 */
private boolean emitReferenceConfidence() {
    final boolean modeIsNone = ( emitReferenceConfidence == ReferenceConfidenceMode.NONE );
    return ! modeIsNone;
}
/**
 * Adjust the base qualities of reads that overlap their mate within a read pair.
 *
 * The reads are first partitioned by sample; within each sample the overlapping
 * pairs are found and handed to FragmentUtils for quality adjustment.
 *
 * @param reads the list of reads to consider
 */
private void cleanOverlappingReadPairs(final List<GATKSAMRecord> reads) {
    final Map<String, List<GATKSAMRecord>> readsBySample = splitReadsBySample(reads);
    for ( final List<GATKSAMRecord> sampleReads : readsBySample.values() ) {
        final FragmentCollection<GATKSAMRecord> fragments = FragmentUtils.create(sampleReads);
        for ( final List<GATKSAMRecord> pair : fragments.getOverlappingPairs() ) {
            FragmentUtils.adjustQualsOfOverlappingPairedFragments(pair);
        }
    }
}
}

View File

@ -51,17 +51,12 @@ import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
@ -82,7 +77,7 @@ public abstract class LocalAssemblyEngine {
* If false, we will only write out a region around the reference source
*/
private final static boolean PRINT_FULL_GRAPH_FOR_DEBUGGING = true;
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 10;
private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30;
protected final int numBestHaplotypesPerGraph;
@ -115,9 +110,9 @@ public abstract class LocalAssemblyEngine {
* @param refHaplotype the reference haplotype
* @return a non-null list of reads
*/
protected abstract List<SeqGraph> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype, List<Haplotype> activeAlleleHaplotypes);
protected abstract List<AssemblyResult> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype, List<Haplotype> activeAlleleHaplotypes);
protected List<SeqGraph> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype) {
protected List<AssemblyResult> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype) {
return assemble(reads, refHaplotype, Collections.<Haplotype>emptyList());
}
@ -145,7 +140,6 @@ public abstract class LocalAssemblyEngine {
// create the list of artificial haplotypes that should be added to the graph for GGA mode
final List<Haplotype> activeAlleleHaplotypes = createActiveAlleleHaplotypes(refHaplotype, activeAllelesToGenotype, activeRegion.getExtendedLoc());
// error-correct reads before clipping low-quality tails: some low quality bases might be good and we want to recover them
final List<GATKSAMRecord> correctedReads;
if (readErrorCorrector != null) {
@ -154,20 +148,31 @@ public abstract class LocalAssemblyEngine {
// and we only want the read-error corrected reads for graph building.
readErrorCorrector.addReadsToKmers(activeRegion.getReads());
correctedReads = new ArrayList<>(readErrorCorrector.correctReads(activeRegion.getReads()));
} else {
correctedReads = activeRegion.getReads();
}
else correctedReads = activeRegion.getReads();
final List<SeqGraph> nonRefGraphs = new LinkedList<>();
// create the graphs by calling our subclass assemble method
final List<SeqGraph> graphs = assemble(correctedReads, refHaplotype, activeAlleleHaplotypes);
// do some QC on the graphs
for ( final SeqGraph graph : graphs ) { sanityCheckGraph(graph, refHaplotype); }
for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, activeAlleleHaplotypes) ) {
if ( result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION ) {
// do some QC on the graph
sanityCheckGraph(result.getGraph(), refHaplotype);
// add it to graphs with meaningful non-reference features
nonRefGraphs.add(result.getGraph());
}
}
// print the graphs if the appropriate debug option has been turned on
if ( graphWriter != null ) { printGraphs(graphs); }
if ( graphWriter != null ) { printGraphs(nonRefGraphs); }
// find the best paths in the graphs and return them as haplotypes
return findBestPaths( graphs, refHaplotype, refLoc, activeRegion.getExtendedLoc() );
if ( nonRefGraphs.isEmpty() ) {
// we couldn't assemble any meaningful graphs, so return just the reference haplotype
return Collections.singletonList(refHaplotype);
} else {
// find the best paths in the graphs and return them as haplotypes
return findBestPaths( nonRefGraphs, refHaplotype, refLoc, activeRegion.getExtendedLoc() );
}
}
/**
@ -288,7 +293,7 @@ public abstract class LocalAssemblyEngine {
}
}
protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) {
protected AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) {
printDebugGraphTransform(seqGraph, new File("sequenceGraph.1.dot"));
// the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
@ -296,9 +301,7 @@ public abstract class LocalAssemblyEngine {
printDebugGraphTransform(seqGraph, new File("sequenceGraph.2.zipped.dot"));
// now go through and prune the graph, removing vertices no longer connected to the reference chain
// IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight
// edges to maintain graph connectivity.
seqGraph.pruneGraph(pruneFactor);
seqGraph.removeSingletonOrphanVertices();
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
printDebugGraphTransform(seqGraph, new File("sequenceGraph.3.pruned.dot"));
@ -309,7 +312,7 @@ public abstract class LocalAssemblyEngine {
// happen in cases where for example the reference somehow manages to acquire a cycle, or
// where the entire assembly collapses back into the reference sequence.
if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null )
return null;
return new AssemblyResult(AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE, seqGraph);
seqGraph.removePathsNotConnectedToRef();
seqGraph.simplifyGraph();
@ -324,7 +327,7 @@ public abstract class LocalAssemblyEngine {
}
printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot"));
return seqGraph;
return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph);
}
/**

View File

@ -0,0 +1,72 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
/**
 * Holds information about a genotype call of a single sample: reference vs. any non-ref event
 *
 * User: depristo
 * Date: 6/21/13
 * Time: 12:58 PM
 */
final class RefVsAnyResult {
    /**
     * Genotype likelihoods, in the order ref/ref, ref/non-ref, non-ref/non-ref
     */
    final double[] genotypeLikelihoods = new double[3];

    /**
     * Values for the AD field: index 0 is ref, index 1 is non-ref
     */
    final int[] AD_Ref_Any = new int[2];

    /**
     * @return the DP, computed as the sum of the two AD values
     */
    public int getDP() {
        final int refDepth = AD_Ref_Any[0];
        final int nonRefDepth = AD_Ref_Any[1];
        return refDepth + nonRefDepth;
    }
}

View File

@ -0,0 +1,476 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
import org.broadinstitute.sting.utils.haplotypeBAMWriter.ReadDestination;
import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.variant.variantcontext.*;
import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
import org.broadinstitute.variant.vcf.VCFHeaderLine;
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
import org.broadinstitute.variant.vcf.VCFSimpleHeaderLine;
import java.io.File;
import java.util.*;
/**
* Code for estimating the reference confidence
*
* This code can estimate the probability that the data for a single sample is consistent with a
* well-determined REF/REF diploid genotype.
*
* User: depristo
* Date: 6/21/13
* Time: 12:52 PM
*/
public class ReferenceConfidenceModel {
public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF";
public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site
public final static String INDEL_INFORMATIVE_DEPTH = "CD";
private final GenomeLocParser genomeLocParser;
private final Set<String> samples;
private final SAMFileHeader header; // TODO -- really shouldn't depend on this
private final int indelInformativeDepthIndelSize;
private final static boolean WRITE_DEBUGGING_BAM = false;
private final SAMFileWriter debuggingWriter;
/**
 * Create a new ReferenceConfidenceModel
 *
 * @param genomeLocParser how we create genome locs
 * @param samples the list of all samples we'll be considering with this model
 * @param header the SAMFileHeader describing the read information (used for debugging)
 * @param indelInformativeDepthIndelSize the max size of indels to consider when calculating indel informative depths; must be >= 0
 * @throws IllegalArgumentException if genomeLocParser, samples or header is null, if samples is empty,
 *                                  or if indelInformativeDepthIndelSize is negative
 */
public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser,
                                final Set<String> samples,
                                final SAMFileHeader header,
                                final int indelInformativeDepthIndelSize) {
    if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null");
    if ( samples == null ) throw new IllegalArgumentException("samples cannot be null");
    if ( samples.isEmpty() ) throw new IllegalArgumentException("samples cannot be empty");
    // the check is for null, so say so (the message previously claimed "empty")
    if ( header == null ) throw new IllegalArgumentException("header cannot be null");
    // zero is accepted by this check, so the message must state >= 0 (it previously claimed >= 1)
    if ( indelInformativeDepthIndelSize < 0 ) throw new IllegalArgumentException("indelInformativeDepthIndelSize must be >= 0 but got " + indelInformativeDepthIndelSize);

    this.genomeLocParser = genomeLocParser;
    this.samples = samples;
    this.header = header;
    this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize;

    // the debugging BAM is only opened when the compile-time WRITE_DEBUGGING_BAM switch is on;
    // otherwise the writer stays null and close() has nothing to release
    if ( WRITE_DEBUGGING_BAM ) {
        final SAMFileWriterFactory factory = new SAMFileWriterFactory();
        factory.setCreateIndex(true);
        debuggingWriter = factory.makeBAMWriter(header, false, new File("refCalc.bam"));
    } else {
        debuggingWriter = null;
    }
}
/**
 * Build the VCF header lines needed when reference confidence values are emitted
 * via calculateRefConfidence: the symbolic non-ref ALT line and the FORMAT line
 * for the indel informative depth.
 *
 * @return a non-null set of VCFHeaderLines, in insertion order
 */
public Set<VCFHeaderLine> getVCFHeaderLines() {
    final VCFHeaderLine nonRefAltLine =
            new VCFSimpleHeaderLine("ALT", NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location");

    final String indelDepthDescription =
            "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize;
    final VCFHeaderLine indelDepthLine =
            new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, indelDepthDescription);

    final Set<VCFHeaderLine> lines = new LinkedHashSet<>();
    lines.add(nonRefAltLine);
    lines.add(indelDepthLine);
    return lines;
}
/**
 * Shut down this reference model, closing the debugging writer if one was opened
 */
public void close() {
    if ( debuggingWriter == null ) {
        return; // no debugging output was opened, nothing to release
    }
    debuggingWriter.close();
}
/**
* Calculate the reference confidence for a single sample given its read data
*
* Returns a list of variant contexts, one for each position in the activeregion.getLoc(), each containing
* detailed information about the certainty that the sample is hom-ref for each base in the region.
*
*
*
* @param refHaplotype the reference haplotype, used to get the reference bases across activeRegion.getLoc()
* @param calledHaplotypes a list of haplotypes that segregate in this region, for realignment of the reads in the
* stratifiedReadMap, corresponding to each reads best haplotype. Must contain the refHaplotype.
* @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc())
* @param activeRegion the active region we want to get the reference confidence over
* @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes
* @param variantCalls calls made in this region. The return result will contain any variant call in this list in the
* correct order by genomic position, and any variant in this list will stop us emitting a ref confidence
* under any position it covers (for SNPs that is 1 bp, but for deletions it is the entire ref span)
* @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence
* contexts as well as calls from variantCalls if any were provided
*/
public List<VariantContext> calculateRefConfidence(final Haplotype refHaplotype,
final Collection<Haplotype> calledHaplotypes,
final GenomeLoc paddedReferenceLoc,
final ActiveRegion activeRegion,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
final List<VariantContext> variantCalls) {
if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null");
if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null");
if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype");
if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null");
if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null");
if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null");
if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size());
if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different");
final GenomeLoc refSpan = activeRegion.getLocation();
final List<ReadBackedPileup> refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, refSpan, stratifiedReadMap);
final byte[] ref = refHaplotype.getBases();
final List<VariantContext> results = new ArrayList<>(refSpan.size());
final String sampleName = stratifiedReadMap.keySet().iterator().next();
final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart();
for ( final ReadBackedPileup pileup : refPileups ) {
final GenomeLoc curPos = pileup.getLocation();
final int offset = curPos.getStart() - refSpan.getStart();
final VariantContext overlappingSite = getOverlappingVariantContext(curPos, variantCalls);
if ( overlappingSite != null ) {
// we have some overlapping site, add it to the list of positions
if ( overlappingSite.getStart() == curPos.getStart() )
results.add(overlappingSite);
} else {
// otherwise emit a reference confidence variant context
final int refOffset = offset + globalRefOffset;
final byte refBase = ref[refOffset];
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null);
final Allele refAllele = Allele.create(refBase, true);
final List<Allele> refSiteAlleles = Arrays.asList(refAllele, NON_REF_SYMBOLIC_ALLELE);
final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), curPos.getStart(), curPos.getStart(), refSiteAlleles);
final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele));
gb.AD(homRefCalc.AD_Ref_Any);
gb.DP(homRefCalc.getDP());
// genotype likelihood calculation
final GenotypeLikelihoods snpGLs = GenotypeLikelihoods.fromLog10Likelihoods(homRefCalc.genotypeLikelihoods);
final int nIndelInformativeReads = calcNIndelInformativeReads(pileup, refOffset, ref, indelInformativeDepthIndelSize);
final GenotypeLikelihoods indelGLs = getIndelPLs(nIndelInformativeReads);
// now that we have the SNP and indel GLs, we take the one with the least confidence,
// as this is the most conservative estimate of our certainty that we are hom-ref.
// For example, if the SNP PLs are 0,10,100 and the indel PLs are 0,100,1000
// we are very certain that there's no indel here, but the SNP confidence imply that we are
// far less confident that the ref base is actually the only thing here. So we take 0,10,100
// as our GLs for the site.
final GenotypeLikelihoods leastConfidenceGLs = getGLwithWorstGQ(indelGLs, snpGLs);
gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF)));
gb.PL(leastConfidenceGLs.getAsPLs());
gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads);
vcb.genotypes(gb.make());
results.add(vcb.make());
// logger.info(" => VariantContext " + vcb.make());
}
}
return results;
}
/**
 * Pick, out of two GenotypeLikelihoods, the one whose hom-ref genotype quality is weakest
 *
 * The comparison is done on getLog10GQ(HOM_REF): the argument with the strictly larger value is
 * returned, and gl2 is returned when the two values are equal.
 *
 * @param gl1 first to consider (cannot be null)
 * @param gl2 second to consider (cannot be null)
 * @return gl1 or gl2, whichever has the worst GQ
 */
protected final GenotypeLikelihoods getGLwithWorstGQ(final GenotypeLikelihoods gl1, final GenotypeLikelihoods gl2) {
    final double firstLog10GQ = gl1.getLog10GQ(GenotypeType.HOM_REF);
    final double secondLog10GQ = gl2.getLog10GQ(GenotypeType.HOM_REF);
    if ( firstLog10GQ > secondLog10GQ ) {
        return gl1;
    }
    return gl2;
}
/**
 * Build the indel genotype likelihoods implied by observing N indel-informative reads at a site
 *
 * The hom-ref likelihood is fixed at 0, the het likelihood drops by LOG10_2 per informative read and
 * the hom-var likelihood drops by INDEL_ERROR_RATE per informative read.
 *
 * @param nInformativeReads the number of reads that inform us about being ref without an indel at this site
 * @return non-null GenotypeLikelihoods given N
 */
protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) {
    // TODO -- optimization -- this could easily be optimized with some caching
    final double[] log10Likelihoods = new double[3];
    log10Likelihoods[0] = 0.0;                                   // hom-ref
    log10Likelihoods[1] = - LOG10_2 * nInformativeReads;         // het
    log10Likelihoods[2] = INDEL_ERROR_RATE * nInformativeReads;  // hom-var
    return GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods);
}
private final static double LOG10_2 = Math.log10(2);
private final static double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp
/**
 * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
 *
 * Every pileup element that is a deletion, or whose base quality is strictly greater than minBaseQual,
 * contributes to three running likelihoods (hom-ref, het, hom-any-alt), each weighted by the element's
 * representative count.  An element counts as non-ref evidence if its base differs from refBase, or if it
 * is a deletion, sits next to an insertion/deletion boundary, or sits next to a soft clip.  The allele
 * depth counters are incremented by one per element (not by the representative count).
 *
 * @param pileup the read backed pileup containing the data we want to evaluate
 * @param refBase the reference base at this pileup position
 * @param minBaseQual base quality threshold: non-deletion elements are used only when their quality is strictly above this value
 * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips
 * @return a RefVsAnyResult genotype call
 */
public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) {
    final RefVsAnyResult result = new RefVsAnyResult();

    for( final PileupElement p : pileup ) {
        final byte qual = p.getQual();
        if( p.isDeletion() || qual > minBaseQual) {
            // indices into genotypeLikelihoods; AA/BB are swapped below when the element is non-ref evidence
            int AA = 0; final int AB = 1; int BB = 2;
            if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
                AA = 2;
                BB = 0;
                if( hqSoftClips != null && p.isNextToSoftClip() ) {
                    hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28));
                }
                result.AD_Ref_Any[1]++;
            } else {
                result.AD_Ref_Any[0]++;
            }
            result.genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual);
            result.genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
            // the whole per-read term (error probability and the one-third factor) must be scaled by the
            // representative count, exactly like the AA and AB terms above
            result.genotypeLikelihoods[BB] += p.getRepresentativeCount() * ( QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD );
        }
    }

    return result;
}
/**
 * Build the per-position pileups covering activeRegionSpan, in genomic order, one for each position
 *
 * The reads in stratifiedReadMap are first realigned against the called haplotypes (or just the
 * reference haplotype when none were called) and sorted by coordinate; the pileups are then read
 * off those realigned reads.  Positions without any pileup data get an empty pileup.
 *
 * @param refHaplotype the reference haplotype, used for realignment when calledHaplotypes is empty
 * @param calledHaplotypes the haplotypes to realign the reads against
 * @param paddedReferenceLoc the location passed through to the haplotype BAM writer for the realignment
 * @param activeRegionSpan the span we want pileups for
 * @param stratifiedReadMap the per-sample read likelihoods used to choose each read's haplotype
 * @return a list with exactly one pileup per position of activeRegionSpan
 */
private List<ReadBackedPileup> getPileupsOverReference(final Haplotype refHaplotype,
                                                       final Collection<Haplotype> calledHaplotypes,
                                                       final GenomeLoc paddedReferenceLoc,
                                                       final GenomeLoc activeRegionSpan,
                                                       final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
    // collect the realigned reads in memory instead of writing them to a file
    final ReadDestination.ToList inMemoryDestination = new ReadDestination.ToList(header, "FOO");
    final HaplotypeBAMWriter realigner = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, inMemoryDestination);
    realigner.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves

    final Collection<Haplotype> haplotypesForRealignment;
    if ( calledHaplotypes.isEmpty() ) {
        haplotypesForRealignment = Collections.singleton(refHaplotype);
    } else {
        haplotypesForRealignment = calledHaplotypes;
    }
    realigner.writeReadsAlignedToHaplotypes(haplotypesForRealignment, paddedReferenceLoc, stratifiedReadMap);
    final List<GATKSAMRecord> sortedRealignedReads = ReadUtils.sortReadsByCoordinate(inMemoryDestination.getReads());

    if ( debuggingWriter != null ) {
        for ( final GATKSAMRecord realigned : sortedRealignedReads ) {
            debuggingWriter.addAlignment(realigned);
        }
    }

    final LocusIteratorByState locusIterator = new LocusIteratorByState(sortedRealignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING,
            false, genomeLocParser, samples, false);

    final List<ReadBackedPileup> perPositionPileups = new LinkedList<>();
    final int firstPos = activeRegionSpan.getStart();
    AlignmentContext pending = locusIterator.advanceToLocus(firstPos, true);
    for ( int pos = firstPos; pos <= activeRegionSpan.getStop(); pos++ ) {
        final boolean haveDataHere = pending != null && pending.getLocation().getStart() == pos;
        if ( ! haveDataHere ) {
            // no data, so we create empty pileups
            perPositionPileups.add(new ReadBackedPileupImpl(genomeLocParser.createGenomeLoc(activeRegionSpan.getContig(), pos)));
            continue;
        }

        perPositionPileups.add(pending.getBasePileup());
        pending = locusIterator.hasNext() ? locusIterator.next() : null;
    }

    return perPositionPileups;
}
/**
 * Find, among the variant contexts that overlap curPos, the one with the greatest start position
 *
 * When several overlapping contexts share the greatest start, the first one encountered in
 * iteration order is kept.
 *
 * @param curPos non-null genome loc
 * @param maybeOverlapping a collection of variant contexts that might overlap curPos
 * @return a VariantContext, or null if none overlaps
 */
protected final VariantContext getOverlappingVariantContext(final GenomeLoc curPos, final Collection<VariantContext> maybeOverlapping) {
    VariantContext rightmost = null;
    for ( final VariantContext candidate : maybeOverlapping ) {
        final boolean overlapsCurPos = genomeLocParser.createGenomeLoc(candidate).overlapsP(curPos);
        if ( ! overlapsCurPos ) {
            continue;
        }

        // only a strictly later start displaces the current best
        if ( rightmost != null && candidate.getStart() <= rightmost.getStart() ) {
            continue;
        }
        rightmost = candidate;
    }

    return rightmost;
}
/**
 * Add up the base qualities of every read base that disagrees with the reference, comparing
 * readBases from readStart against refBases from refStart position by position (i.e. assuming
 * the read has no insertions or deletions w.r.t. the reference).
 *
 * Only the stretch covered by both arrays is compared; if either start lies at or beyond the end
 * of its array nothing is compared and the result is 0.
 *
 * @param readBases non-null bases of the read
 * @param readQuals non-null quals of the read
 * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference)
 * @param refBases the reference bases
 * @param refStart the offset into refBases that aligns to the readStart position in readBases
 * @param maxSum if the sum goes over this value, return immediately
 * @return the sum of quality scores for readBases that mismatch their corresponding ref bases
 */
protected final int sumMismatchingQualities(final byte[] readBases,
                                            final byte[] readQuals,
                                            final int readStart,
                                            final byte[] refBases,
                                            final int refStart,
                                            final int maxSum) {
    final int overlap = Math.min(readBases.length - readStart, refBases.length - refStart);

    int mismatchQualSum = 0;
    int readPos = readStart;
    int refPos = refStart;
    for ( int compared = 0; compared < overlap; compared++, readPos++, refPos++ ) {
        if ( readBases[readPos] == refBases[refPos] ) {
            continue;
        }

        mismatchQualSum += readQuals[readPos];
        if ( mismatchQualSum > maxSum ) {
            // already over budget, the caller doesn't care about the exact total
            break;
        }
    }

    return mismatchQualSum;
}
/**
 * Decide whether a read rules out an indel of size <= maxIndelSize segregating at readStart/refStart
 *
 * The read is scored against the reference as aligned (the baseline sum of mismatching qualities) and
 * then re-scored with the read shifted (an insertion) and with the reference shifted (a deletion) for
 * every indel size from 1 to maxIndelSize.  If any shifted alignment scores no worse than the baseline
 * the read cannot exclude that indel and is not informative.
 *
 * @param readBases non-null bases of the read
 * @param readQuals non-null quals of the read
 * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference)
 * @param refBases the reference bases
 * @param refStart the offset into refBases that aligns to the readStart position in readBases
 * @param maxIndelSize the max indel size to consider for the read to be informative
 * @return true if read can eliminate the possibility that there's an indel of size <= maxIndelSize segregating at refStart
 */
protected boolean isReadInformativeAboutIndelsOfSize(final byte[] readBases,
                                                     final byte[] readQuals,
                                                     final int readStart,
                                                     final byte[] refBases,
                                                     final int refStart,
                                                     final int maxIndelSize) {
    // todo -- fast exit when n bases left < maxIndelSize

    final int baseline = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE);

    int indelSize = 1;
    while ( indelSize <= maxIndelSize ) {
        // insertion: the first indelSize read bases are skipped
        final int insertionScore = sumMismatchingQualities(readBases, readQuals, readStart + indelSize, refBases, refStart, baseline);
        if ( insertionScore <= baseline ) {
            return false;
        }

        // deletion: the first indelSize reference bases are skipped
        final int deletionScore = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart + indelSize, baseline);
        if ( deletionScore <= baseline ) {
            return false;
        }

        indelSize++;
    }

    return true;
}
/**
 * Count the reads in a pileup that are informative about the absence of an indel at the pileup position
 *
 * Elements sitting immediately before a deletion start or an insertion are never counted; every other
 * element is counted when isReadInformativeAboutIndelsOfSize accepts its read at the element's offset.
 *
 * @param pileup a pileup
 * @param pileupOffsetIntoRef the position of the pileup in the reference
 * @param ref the ref bases
 * @param maxIndelSize maximum indel size to consider in the informativeness calculation
 * @return an integer >= 0
 */
protected final int calcNIndelInformativeReads(final ReadBackedPileup pileup, final int pileupOffsetIntoRef, final byte[] ref, final int maxIndelSize) {
    int informativeCount = 0;

    for ( final PileupElement element : pileup ) {
        final GATKSAMRecord read = element.getRead();
        final int readOffset = element.getOffset();

        // a read that itself carries an indel right here doesn't count as evidence
        final boolean adjacentToIndel = element.isBeforeDeletionStart() || element.isBeforeInsertion();

        // todo -- this code really should handle CIGARs directly instead of relying on the above tests
        if ( ! adjacentToIndel
                && isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), readOffset, ref, pileupOffsetIntoRef, maxIndelSize) ) {
            informativeCount++;
        }
    }

    return informativeCount;
}
/**
 * Build the reference haplotype for an active region
 *
 * The haplotype carries refBases, is flagged as the reference, is given an alignment start equal to
 * the distance from the start of paddedReferenceLoc to the start of the region's extended location,
 * and gets a single-element all-match (M) cigar spanning its bases.
 *
 * @param activeRegion the active region
 * @param refBases the ref bases
 * @param paddedReferenceLoc the location spanning of the refBases -- can be longer than activeRegion.getLocation()
 * @return a reference haplotype
 * @throws IllegalStateException if the extended location starts before paddedReferenceLoc
 */
public static Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final byte[] refBases, final GenomeLoc paddedReferenceLoc) {
    final Haplotype reference = new Haplotype(refBases, true);

    final int startWrtPaddedRef = activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart();
    if ( startWrtPaddedRef < 0 ) {
        throw new IllegalStateException("Bad alignment start in createReferenceHaplotype " + startWrtPaddedRef);
    }
    reference.setAlignmentStartHapwrtRef(startWrtPaddedRef);

    // the reference trivially matches itself over its full length
    final Cigar allMatchCigar = new Cigar();
    final int nBases = reference.getBases().length;
    allMatchCigar.add(new CigarElement(nBases, CigarOperator.M));
    reference.setCigar(allMatchCigar);

    return reference;
}
}

View File

@ -71,7 +71,7 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
private final int kmerSize;
/**
* Construct a DeBruijnGraph with kmerSize
* Construct a TestGraph with kmerSize
* @param kmerSize
*/
public BaseGraph(final int kmerSize, final EdgeFactory<V,E> edgeFactory) {
@ -472,28 +472,11 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
}
/**
* Prune all edges from this graph that have multiplicity <= pruneFactor and remove all orphaned singleton vertices as well
*
* @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed
*/
public void pruneGraph( final int pruneFactor ) {
final List<E> edgesToRemove = new ArrayList<>();
for( final E e : edgeSet() ) {
if( e.getPruningMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
edgesToRemove.add(e);
}
}
removeAllEdges(edgesToRemove);
removeSingletonOrphanVertices();
}
/**
* Prune all chains from this graph where all edges in the path have multiplicity <= pruneFactor
* Prune all chains from this graph where any edge in the path has multiplicity < pruneFactor
*
* @see LowWeightChainPruner for more information
*
* @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed
* @param pruneFactor all edges with multiplicity < this factor that aren't ref edges will be removed
*/
public void pruneLowWeightChains( final int pruneFactor ) {
final LowWeightChainPruner<V,E> pruner = new LowWeightChainPruner<>(pruneFactor);
@ -503,7 +486,7 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
/**
* Remove all vertices in the graph that have in and out degree of 0
*/
protected void removeSingletonOrphanVertices() {
public void removeSingletonOrphanVertices() {
// Run through the graph and clean up singular orphaned nodes
final List<V> verticesToRemove = new LinkedList<>();
for( final V v : vertexSet() ) {

View File

@ -96,7 +96,7 @@ public class LowWeightChainPruner<V extends BaseVertex, E extends BaseEdge> {
}
/**
* Traverse the edges in the path and determine if any are either ref edges or have weight above
* Traverse the edges in the path and determine if any are either ref edges or have weight above or equal to
* the pruning factor and should therefore not be pruned away.
*
* @param path the path in question

View File

@ -49,17 +49,16 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import com.google.java.contract.Ensures;
import org.jgrapht.EdgeFactory;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
* A DeBruijn kmer graph
* A Test kmer graph
*
* User: rpoplin
* Date: 2/6/13
*/
public final class DeBruijnGraph extends BaseGraph<DeBruijnVertex, BaseEdge> {
public final class TestGraph extends BaseGraph<DeBruijnVertex, BaseEdge> {
/**
* Edge factory that creates non-reference multiplicity 1 edges
*/
@ -71,33 +70,20 @@ public final class DeBruijnGraph extends BaseGraph<DeBruijnVertex, BaseEdge> {
}
/**
* Create an empty DeBruijnGraph with default kmer size
* Create an empty TestGraph with default kmer size
*/
public DeBruijnGraph() {
public TestGraph() {
this(11);
}
/**
* Create an empty DeBruijnGraph with kmer size
* Create an empty TestGraph with kmer size
* @param kmerSize kmer size, must be >= 1
*/
public DeBruijnGraph(int kmerSize) {
public TestGraph(int kmerSize) {
super(kmerSize, new MyEdgeFactory());
}
/**
* Pull kmers out of the given long sequence and throw them on in the graph
* @param sequence byte array holding the sequence with which to build the assembly graph
* @param KMER_LENGTH the desired kmer length to use
* @param isRef if true the kmers added to the graph will have reference edges linking them
*/
public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) {
if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); }
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef, 1);
}
}
/**
* Add edge to assembly graph connecting the two kmers
@ -129,7 +115,7 @@ public final class DeBruijnGraph extends BaseGraph<DeBruijnVertex, BaseEdge> {
@Ensures({"result != null"})
public SeqGraph convertToSequenceGraph() {
final SeqGraph seqGraph = new SeqGraph(getKmerSize());
final Map<DeBruijnVertex, SeqVertex> vertexMap = new HashMap<DeBruijnVertex, SeqVertex>();
final Map<DeBruijnVertex, SeqVertex> vertexMap = new HashMap<>();
// create all of the equivalent seq graph vertices
for ( final DeBruijnVertex dv : vertexSet() ) {

View File

@ -47,6 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResult;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.MathUtils;
@ -98,32 +99,33 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
this.justReturnRawGraph = justReturnRawGraph;
}
private void addResult(final List<AssemblyResult> results, final AssemblyResult maybeNullResult) {
if ( maybeNullResult != null )
results.add(maybeNullResult);
}
@Override
public List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes) {
final List<SeqGraph> graphs = new LinkedList<>();
public List<AssemblyResult> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes) {
final List<AssemblyResult> results = new LinkedList<>();
// first, try using the requested kmer sizes
for ( final int kmerSize : kmerSizes ) {
final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles);
if ( graph != null )
graphs.add(graph);
addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles));
}
// if none of those worked, iterate over larger sizes if allowed to do so
if ( graphs.isEmpty() && !dontIncreaseKmerSizesForCycles ) {
if ( results.isEmpty() && !dontIncreaseKmerSizesForCycles ) {
int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE;
int numIterations = 1;
while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
while ( results.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
// on the last attempt we will allow low complexity graphs
final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT);
if ( graph != null )
graphs.add(graph);
addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT));
kmerSize += KMER_SIZE_ITERATION_INCREASE;
numIterations++;
}
}
return graphs;
return results;
}
/**
@ -136,11 +138,16 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
* @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
* @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
*/
protected SeqGraph createGraph(final List<GATKSAMRecord> reads,
final Haplotype refHaplotype,
final int kmerSize,
final List<Haplotype> activeAlleleHaplotypes,
final boolean allowLowComplexityGraphs) {
protected AssemblyResult createGraph(final List<GATKSAMRecord> reads,
final Haplotype refHaplotype,
final int kmerSize,
final List<Haplotype> activeAlleleHaplotypes,
final boolean allowLowComplexityGraphs) {
if ( refHaplotype.length() < kmerSize ) {
// happens in cases where the assembled region is just too small
return new AssemblyResult(AssemblyResult.Status.FAILED, null);
}
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples);
// add the reference sequence to the graph
@ -183,7 +190,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
// look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
// we can recover them by merging some N bases from the chain back into the reference
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(pruneFactor);
// remove all heading and trailing paths
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
@ -193,14 +200,15 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
if ( justReturnRawGraph ) return initialSeqGraph;
if ( justReturnRawGraph ) return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph);
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
return ( seqGraph != null && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(seqGraph) ) ? null : seqGraph;
final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph);
final AssemblyResult.Status status = cleaned.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(cleaned.getGraph()) ? AssemblyResult.Status.FAILED : cleaned.getStatus();
return new AssemblyResult(status, cleaned.getGraph());
}
/**

View File

@ -58,6 +58,7 @@ import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman;
import org.jgrapht.EdgeFactory;
import org.jgrapht.alg.CycleDetector;
@ -93,6 +94,9 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
private final static boolean WRITE_GRAPH = false;
private final static boolean DEBUG_NON_UNIQUE_CALC = false;
private final static int MAX_CIGAR_COMPLEXITY = 3;
private final static int MIN_DANGLING_TAIL_LENGTH = 5; // SNP + 3 stabilizing nodes + the LCA
/** for debugging info printing */
private static int counter = 0;
@ -276,13 +280,14 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
* Attempt to attach vertex with out-degree == 0 to the graph
*
* @param vertex the vertex to recover
* @param pruneFactor the prune factor to use in ignoring chain pieces
* @return 1 if we successfully recovered the vertex and 0 otherwise
*/
protected int recoverDanglingChain(final MultiDeBruijnVertex vertex) {
protected int recoverDanglingChain(final MultiDeBruijnVertex vertex, final int pruneFactor) {
if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0");
// generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths
final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex);
final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex, pruneFactor);
// if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path
if ( danglingTailMergeResult == null || ! cigarIsOkayToMerge(danglingTailMergeResult.cigar) )
@ -301,13 +306,14 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
protected boolean cigarIsOkayToMerge(final Cigar cigar) {
final List<CigarElement> elements = cigar.getCigarElements();
final int numElements = elements.size();
// don't allow more than a couple of different ops
if ( elements.size() > 3 )
if ( numElements > MAX_CIGAR_COMPLEXITY )
return false;
// the last element must be an M
if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.M )
if ( elements.get(numElements - 1).getOperator() != CigarOperator.M )
return false;
// TODO -- do we want to check whether the Ms mismatch too much also?
@ -334,7 +340,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
return 0;
final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0);
final int refIndexToMerge = lastRefIndex - matchingSuffix + 1;
final boolean firstElementIsDeletion = elements.get(0).getOperator() == CigarOperator.D;
final int refIndexToMerge = lastRefIndex - matchingSuffix + 1 + (firstElementIsDeletion ? 1 : 0); // need to push down if SW tells us to remove the LCA
addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1));
return 1;
}
@ -344,13 +351,14 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
* provided vertex is the sink) and the reference path.
*
* @param vertex the sink of the dangling tail
* @param pruneFactor the prune factor to use in ignoring chain pieces
* @return a SmithWaterman object which can be null if no proper alignment could be generated
*/
protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex) {
protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor) {
// find the lowest common ancestor path between vertex and the reference sink if available
final List<MultiDeBruijnVertex> altPath = findPathToLowestCommonAncestorOfReference(vertex);
if ( altPath == null || isRefSource(altPath.get(0)) )
final List<MultiDeBruijnVertex> altPath = findPathToLowestCommonAncestorOfReference(vertex, pruneFactor);
if ( altPath == null || isRefSource(altPath.get(0)) || altPath.size() < MIN_DANGLING_TAIL_LENGTH )
return null;
// now get the reference path from the LCA
@ -361,24 +369,32 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
final byte[] altBases = getBasesForPath(altPath);
// run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting)
final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWPairwiseAlignment.OVERHANG_STRATEGY.INDEL);
final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWParameterSet.STANDARD_NGS, SWPairwiseAlignment.OVERHANG_STRATEGY.LEADING_INDEL);
return new DanglingTailMergeResult(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar()));
}
/**
* Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex
* Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex.
* Note that nodes are excluded if their pruning weight is less than the pruning factor.
*
* @param vertex the original vertex
* @param pruneFactor the prune factor to use in ignoring chain pieces
* @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or
* has an ancestor with multiple incoming edges before hitting the reference path
*/
protected List<MultiDeBruijnVertex> findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex) {
protected List<MultiDeBruijnVertex> findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor) {
final LinkedList<MultiDeBruijnVertex> path = new LinkedList<>();
MultiDeBruijnVertex v = vertex;
while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) {
path.addFirst(v);
v = getEdgeSource(incomingEdgeOf(v));
final MultiSampleEdge edge = incomingEdgeOf(v);
// if it has too low a weight, don't use it (or previous vertexes) for the path
if ( edge.getPruningMultiplicity() < pruneFactor )
path.clear();
// otherwise it is safe to use
else
path.addFirst(v);
v = getEdgeSource(edge);
}
path.addFirst(v);
@ -453,7 +469,12 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
return nonUniqueKmers.size() * 4 > uniqueKmers.size();
}
public void recoverDanglingTails() {
/**
* Try to recover dangling tails
*
* @param pruneFactor the prune factor to use in ignoring chain pieces
*/
public void recoverDanglingTails(final int pruneFactor) {
if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
int attempted = 0;
@ -461,7 +482,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
for ( final MultiDeBruijnVertex v : vertexSet() ) {
if ( outDegreeOf(v) == 0 && ! isRefNodeAndRefSink(v) ) {
attempted++;
nRecovered += recoverDanglingChain(v);
nRecovered += recoverDanglingChain(v, pruneFactor);
}
}
@ -740,13 +761,12 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
// the first good base is at lastGood, can be -1 if last base was bad
final int start = lastGood;
// the stop base is end - 1 (if we're not at the end of the sequence)
final int stop = end == sequence.length ? sequence.length : end;
final int len = stop - start + 1;
final int len = end - start;
if ( start != -1 && len >= kmerSize ) {
// if the sequence is long enough to get some value out of, add it to the graph
final String name = read.getReadName() + "_" + start + "_" + end;
addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, stop, reducedReadCounts, false);
addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, end, reducedReadCounts, false);
}
lastGood = -1; // reset the last good base

View File

@ -0,0 +1,298 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.utils.gvcf;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.variant.vcf.*;
import java.util.*;
/**
* Genome-wide VCF writer
*
* User: depristo
* Date: 6/24/13
* Time: 2:51 PM
*/
public class GVCFWriter implements VariantContextWriter {
    //
    // static VCF field names
    //
    protected final static String BLOCK_SIZE_INFO_FIELD = "BLOCK_SIZE";
    protected final static String MIN_DP_FORMAT_FIELD = "MIN_DP";
    protected final static String MIN_GQ_FORMAT_FIELD = "MIN_GQ";

    //
    // Final fields initialized in constructor
    //
    /** Where we'll ultimately write our VCF records */
    final private VariantContextWriter underlyingWriter;

    /** The GQ bands (bounds-only blocks) derived from the user-supplied thresholds, in increasing GQ order */
    final private List<HomRefBlock> GQPartitions;

    //
    // fields updated on the fly during GVCFWriter operation
    //
    /** End position of the last emitted variant record, or -1 if no hom-ref site needs to be suppressed */
    int nextAvailableStart = -1;

    /** Contig of the variant that set nextAvailableStart; null means "unknown", in which case any contig matches */
    private String contigOfNextAvailableStart = null;

    /** Sample name taken from the first VariantContext added to this writer */
    private String sampleName = null;

    /** The hom-ref block currently being accumulated, or null if there is none */
    private HomRefBlock currentBlock = null;

    /**
     * Validate the proposed GQ partition boundaries and convert them into GQ bands
     *
     * For boundaries [A, B, C] the result contains the bands [0, A), [A, B), [B, C) and [C, Integer.MAX_VALUE).
     *
     * @param GQPartitions proposed GQ partition boundaries; must be non-null, non-empty, free of nulls,
     *                     and strictly increasing starting above 0
     * @return a non-null list of bounds-only HomRefBlocks, one per band, in increasing GQ order
     * @throws IllegalArgumentException if the boundaries are not well-formed
     */
    protected static List<HomRefBlock> parsePartitions(final List<Integer> GQPartitions) {
        if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null");
        if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty");

        // one band per boundary plus the final open-ended band
        final List<HomRefBlock> result = new ArrayList<>(GQPartitions.size() + 1);
        int lastThreshold = 0;
        for ( final Integer value : GQPartitions ) {
            if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer");
            if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order.  Last is " + lastThreshold + " but next is " + value);
            if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value);
            result.add(new HomRefBlock(lastThreshold, value));
            lastThreshold = value;
        }
        result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE));

        return result;
    }

    /**
     * Create a new GVCF writer
     *
     * Should be a non-empty list of boundaries.  For example, suppose this variable is
     *
     * [A, B, C]
     *
     * We would partition our hom-ref sites into the following bands:
     *
     * X < A
     * A <= X < B
     * B <= X < C
     * X >= C
     *
     * @param underlyingWriter the ultimate destination of the GVCF records
     * @param GQPartitions a well-formed list of GQ partitions
     * @throws IllegalArgumentException if underlyingWriter is null or GQPartitions is not well-formed
     */
    public GVCFWriter(final VariantContextWriter underlyingWriter, final List<Integer> GQPartitions) {
        if ( underlyingWriter == null ) throw new IllegalArgumentException("underlyingWriter cannot be null");
        this.underlyingWriter = underlyingWriter;
        this.GQPartitions = parsePartitions(GQPartitions);
    }

    /**
     * Write the VCF header
     *
     * Adds standard GVCF fields to the header (note that the provided header object is modified in place)
     *
     * @param header a non-null header
     */
    @Override
    public void writeHeader(VCFHeader header) {
        if ( header == null ) throw new IllegalArgumentException("header cannot be null");

        header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
        header.addMetaDataLine(new VCFInfoHeaderLine(BLOCK_SIZE_INFO_FIELD, 1, VCFHeaderLineType.Integer, "Size of the homozygous reference GVCF block"));
        header.addMetaDataLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block"));
        header.addMetaDataLine(new VCFFormatHeaderLine(MIN_GQ_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum GQ observed within the GVCF block"));

        // document each GQ band in the header
        for ( final HomRefBlock partition : GQPartitions ) {
            header.addMetaDataLine(partition.toVCFHeaderLine());
        }

        underlyingWriter.writeHeader(header);
    }

    /**
     * Close this GVCF writer.  Finalizes any pending hom-ref blocks and emits those to the underlyingWriter as well
     */
    @Override
    public void close() {
        close(true);
    }

    /**
     * Horrible work around because there's no clean way to get our VCFWriter closed by the GATK
     *
     * If closeUnderlyingWriter is true, then we'll close the underlying writer, otherwise we'll leave it open
     * so the GATK closes it later
     *
     * @param closeUnderlyingWriter should we leave the underlying writer open or closed?
     */
    public void close(final boolean closeUnderlyingWriter) {
        emitCurrentBlock();
        if ( closeUnderlyingWriter ) underlyingWriter.close();
    }

    /**
     * Add hom-ref site from vc to this gVCF hom-ref state tracking, emitting any pending states if appropriate
     *
     * @param vc a non-null VariantContext
     * @param g a non-null genotype from VariantContext
     * @return a VariantContext to be emitted, or null if none is appropriate
     */
    protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) {
        if ( nextAvailableStart != -1 ) {
            // don't create blocks while the hom-ref site falls before nextAvailableStart (for deletions),
            // but only when the site is on the same contig as the variant that set nextAvailableStart
            final boolean sameContig = contigOfNextAvailableStart == null || contigOfNextAvailableStart.equals(vc.getChr());
            if ( sameContig && vc.getStart() <= nextAvailableStart )
                return null;

            // we've moved past the previous variant (or onto another contig), so nothing is left to suppress
            nextAvailableStart = -1;
            contigOfNextAvailableStart = null;
        }

        if ( currentBlock == null ) {
            currentBlock = createNewBlock(vc, g);
            return null;
        } else if ( currentBlock.withinBounds(g.getGQ()) ) {
            currentBlock.add(vc.getStart(), g);
            return null;
        } else {
            // the GQ left the current band: finish the current block and start a new one at this site
            final VariantContext result = blockToVCF(currentBlock);
            currentBlock = createNewBlock(vc, g);
            return result;
        }
    }

    /**
     * Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null
     */
    private void emitCurrentBlock() {
        if ( currentBlock != null ) {
            // there's actually some work to do
            underlyingWriter.add(blockToVCF(currentBlock));
            currentBlock = null;
        }
    }

    /**
     * Convert a HomRefBlock into a VariantContext
     *
     * @param block the block to convert
     * @return a VariantContext representing the gVCF encoding for this block
     */
    private VariantContext blockToVCF(final HomRefBlock block) {
        if ( block == null ) throw new IllegalArgumentException("block cannot be null");

        final VariantContextBuilder vcb = new VariantContextBuilder(block.getStartingVC());
        vcb.attributes(new HashMap<String, Object>(2)); // clear the attributes
        vcb.stop(block.getStop());
        vcb.attribute(VCFConstants.END_KEY, block.getStop());
        vcb.attribute(BLOCK_SIZE_INFO_FIELD, block.getSize());

        // create the single Genotype with GQ and DP annotations
        final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Collections.nCopies(2, block.getRef()));
        gb.noAD().noPL().noAttributes(); // clear all attributes
        gb.GQ(block.getMedianGQ());
        gb.DP(block.getMedianDP());
        gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP());
        gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ());

        return vcb.genotypes(gb.make()).make();
    }

    /**
     * Helper function to create a new HomRefBlock from a variant context and current genotype
     *
     * @param vc the VariantContext at the site where want to start the band
     * @param g the genotype of the sample from vc that should be used to initialize the block
     * @return a newly allocated and initialized block containing g already
     * @throws IllegalStateException if the GQ of g doesn't fall into any of the configured bands
     */
    private HomRefBlock createNewBlock(final VariantContext vc, final Genotype g) {
        // figure out the GQ limits to use based on the GQ of g
        HomRefBlock partition = null;
        for ( final HomRefBlock maybePartition : GQPartitions ) {
            if ( maybePartition.withinBounds(g.getGQ()) ) {
                partition = maybePartition;
                break;
            }
        }
        // report the configured bands (partition itself is always null here and carries no information)
        if ( partition == null ) throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition " + GQPartitions);

        // create the block, add g to it, and return it for use
        final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound());
        block.add(vc.getStart(), g);
        return block;
    }

    /**
     * Add a VariantContext to this writer for emission
     *
     * Requires that the VC have exactly one genotype
     *
     * @param vc a non-null VariantContext
     * @throws IllegalArgumentException if vc is null or doesn't have exactly one genotype
     */
    @Override
    public void add(VariantContext vc) {
        if ( vc == null ) throw new IllegalArgumentException("vc cannot be null");

        // validate the genotypes before touching them, so that bad input produces the intended error
        if ( ! vc.hasGenotypes() )
            throw new IllegalArgumentException("GVCF assumes that the VariantContext has genotypes");
        if ( vc.getGenotypes().size() != 1 )
            throw new IllegalArgumentException("GVCF assumes that the VariantContext has exactly one genotype but saw " + vc.getGenotypes().size());

        final Genotype g = vc.getGenotype(0);
        if ( sampleName == null )
            sampleName = g.getSampleName();

        if ( currentBlock != null && ! currentBlock.isContiguous(vc) ) {
            // we've made a non-contiguous step (across interval, onto another chr), so finalize
            emitCurrentBlock();
        }

        if ( g.isHomRef() ) {
            // create bands
            final VariantContext maybeCompletedBand = addHomRefSite(vc, g);
            if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand);
        } else {
            // g is variant, so flush the bands and emit vc
            emitCurrentBlock();
            // remember where this variant ends (and on which contig) so overlapping hom-ref sites are skipped
            nextAvailableStart = vc.getEnd();
            contigOfNextAvailableStart = vc.getChr();
            underlyingWriter.add(vc);
        }
    }
}

View File

@ -0,0 +1,169 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.utils.gvcf;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.vcf.VCFHeaderLine;
import java.util.ArrayList;
import java.util.List;
/**
* Helper class for calculating a GQ band in the GVCF writer
*
* A band contains GQ and DP values for a contiguous stretch of hom-ref genotypes,
* and provides summary information about the entire block of genotypes.
*
* Genotypes within the HomRefBlock are restricted to hom-ref genotypes within a band of GQ scores
*
* User: depristo
* Date: 6/25/13
* Time: 9:41 AM
*/
final class HomRefBlock {
    /** The VariantContext that opened this block; null for blocks created only for bounds checking */
    private final VariantContext startingVC;

    /** Position of the last genotype added to this block (start - 1 until something is added) */
    int stop;

    /** GQ limits of this band: minGQ is inclusive, maxGQ is exclusive */
    private final int minGQ, maxGQ;

    /** GQ values (capped at 99) and DP values of the genotypes added so far, in order of addition */
    private final List<Integer> GQs = new ArrayList<>(100);
    private final List<Integer> DPs = new ArrayList<>(100);

    /** Reference allele of the starting VariantContext; null for bounds-only blocks */
    private final Allele ref;

    /**
     * Make sure the lower GQ limit doesn't exceed the upper one
     *
     * @param minGQ the proposed lower limit
     * @param maxGQ the proposed upper limit
     */
    private static void requireOrderedBounds(final int minGQ, final int maxGQ) {
        if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ);
    }

    /**
     * Build a block that starts at the given VariantContext and accepts GQs in [minGQ, maxGQ)
     *
     * The new block is empty: its stop is one position before its start until a genotype is added.
     *
     * @param startingVC the VariantContext providing the start position and reference allele of the block
     * @param minGQ the lowest GQ (inclusive) belonging to this band
     * @param maxGQ the GQ (exclusive) at which this band ends
     */
    public HomRefBlock(final VariantContext startingVC, int minGQ, int maxGQ) {
        if ( startingVC == null ) throw new IllegalArgumentException("startingVC cannot be null");
        requireOrderedBounds(minGQ, maxGQ);

        this.minGQ = minGQ;
        this.maxGQ = maxGQ;
        this.startingVC = startingVC;
        this.ref = startingVC.getReference();
        this.stop = startingVC.getStart() - 1;
    }

    /**
     * Build a block that carries only GQ limits, for use in bounds checking
     *
     * Such a block has no starting VariantContext and no reference allele.
     *
     * @param minGQ the lowest GQ (inclusive) belonging to this band
     * @param maxGQ the GQ (exclusive) at which this band ends
     */
    public HomRefBlock(int minGQ, int maxGQ) {
        requireOrderedBounds(minGQ, maxGQ);

        this.minGQ = minGQ;
        this.maxGQ = maxGQ;
        this.startingVC = null;
        this.ref = null;
        this.stop = -1;
    }

    /**
     * Extend this block by one position, recording the GQ and DP of the genotype at that position
     *
     * @param pos the position of the genotype, which must be exactly one past the current stop
     * @param g a non-null Genotype that has both GQ and DP
     */
    public void add(final int pos, final Genotype g) {
        if ( g == null ) throw new IllegalArgumentException("g cannot be null");
        if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field");
        if ( ! g.hasDP() ) throw new IllegalArgumentException("g must have DP field");

        final int expectedPos = stop + 1;
        if ( pos != expectedPos ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop);

        stop = pos;

        // GQs are stored capped at 99
        final int cappedGQ = Math.min(g.getGQ(), 99);
        GQs.add(cappedGQ);
        DPs.add(g.getDP());
    }

    /**
     * Does the GQ value belong to this band, i.e., minGQ <= GQ < maxGQ?
     *
     * @param GQ the GQ value to test
     * @return true if GQ lies inside the band, false otherwise
     */
    public boolean withinBounds(final int GQ) {
        return minGQ <= GQ && GQ < maxGQ;
    }

    /** @return the smallest (capped) GQ added to this block, as computed by MathUtils.arrayMin */
    public int getMinGQ() { return MathUtils.arrayMin(GQs); }

    /** @return the median (capped) GQ added to this block, as computed by MathUtils.median */
    public int getMedianGQ() { return MathUtils.median(GQs); }

    /** @return the smallest DP added to this block, as computed by MathUtils.arrayMin */
    public int getMinDP() { return MathUtils.arrayMin(DPs); }

    /** @return the median DP added to this block, as computed by MathUtils.median */
    public int getMedianDP() { return MathUtils.median(DPs); }

    /** @return the exclusive upper GQ limit of this band */
    protected int getGQUpperBound() { return maxGQ; }

    /** @return the inclusive lower GQ limit of this band */
    protected int getGQLowerBound() { return minGQ; }

    /**
     * Does vc directly follow this block on the same contig?
     *
     * @param vc the VariantContext to test
     * @return true if vc ends one position past the current stop and shares the contig of the starting VariantContext
     */
    public boolean isContiguous(final VariantContext vc) {
        if ( vc.getEnd() != stop + 1 )
            return false;
        return startingVC.getChr().equals(vc.getChr());
    }

    /** @return the VariantContext this block was started with (null for bounds-only blocks) */
    public VariantContext getStartingVC() { return startingVC; }

    /** @return the start position of the starting VariantContext */
    public int getStart() { return startingVC.getStart(); }

    /** @return the position of the last genotype added to this block */
    public int getStop() { return stop; }

    /** @return the reference allele of the starting VariantContext (null for bounds-only blocks) */
    public Allele getRef() { return ref; }

    /** @return the number of positions spanned by this block, stop - start + 1 */
    public int getSize() { return stop - startingVC.getStart() + 1; }

    @Override
    public String toString() {
        final StringBuilder description = new StringBuilder("HomRefBlock{");
        description.append("minGQ=").append(minGQ);
        description.append(", maxGQ=").append(maxGQ);
        description.append('}');
        return description.toString();
    }

    /**
     * Describe the GQ limits of this band as a VCF header line
     *
     * @return a header line with key GVCFBlock whose value lists the inclusive minGQ and exclusive maxGQ
     */
    public VCFHeaderLine toVCFHeaderLine() {
        final String bounds = "minGQ=" + minGQ + "(inclusive),maxGQ=" + maxGQ + "(exclusive)";
        return new VCFHeaderLine("GVCFBlock", bounds);
    }
}

View File

@ -46,11 +46,10 @@
package org.broadinstitute.sting.utils.haplotypeBAMWriter;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
@ -67,17 +66,17 @@ import java.util.*;
* Time: 1:50 PM
*/
class AllHaplotypeBAMWriter extends HaplotypeBAMWriter {
public AllHaplotypeBAMWriter(final SAMFileWriter bamWriter) {
super(bamWriter);
public AllHaplotypeBAMWriter(final ReadDestination destination) {
super(destination);
}
/**
* {@inheritDoc}
*/
@Override
public void writeReadsAlignedToHaplotypes(final List<Haplotype> haplotypes,
public void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes,
final GenomeLoc paddedReferenceLoc,
final List<Haplotype> bestHaplotypes,
final Collection<Haplotype> bestHaplotypes,
final Set<Haplotype> calledHaplotypes,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
writeHaplotypesAsReads(haplotypes, new HashSet<>(bestHaplotypes), paddedReferenceLoc);

View File

@ -68,17 +68,17 @@ import java.util.*;
* Time: 1:50 PM
*/
class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter {
public CalledHaplotypeBAMWriter(final SAMFileWriter bamWriter) {
super(bamWriter);
public CalledHaplotypeBAMWriter(final ReadDestination destination) {
super(destination);
}
/**
* {@inheritDoc}
*/
@Override
public void writeReadsAlignedToHaplotypes(final List<Haplotype> haplotypes,
public void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes,
final GenomeLoc paddedReferenceLoc,
final List<Haplotype> bestHaplotypes,
final Collection<Haplotype> bestHaplotypes,
final Set<Haplotype> calledHaplotypes,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes
@ -98,10 +98,8 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter {
// next, output the interesting reads for each sample aligned against one of the called haplotypes
for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) {
for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
if ( entry.getKey().getMappingQuality() > 0 ) {
final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes);
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative());
}
final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes);
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative());
}
}
}

View File

@ -46,16 +46,18 @@
package org.broadinstitute.sting.utils.haplotypeBAMWriter;
import net.sf.samtools.*;
import net.sf.samtools.Cigar;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMTag;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
import java.util.*;
@ -75,8 +77,8 @@ public abstract class HaplotypeBAMWriter {
protected final static String READ_GROUP_ID = "ArtificialHaplotype";
protected final static String HAPLOTYPE_TAG = "HC";
final SAMFileWriter bamWriter;
final SAMFileHeader bamHeader;
final ReadDestination output;
boolean writeHaplotypesAsWell = true;
/**
* Possible modes for writing haplotypes to BAMs
@ -104,27 +106,10 @@ public abstract class HaplotypeBAMWriter {
* @return a new HaplotypeBAMWriter
*/
public static HaplotypeBAMWriter create(final Type type, final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header) {
if ( header == null ) throw new IllegalArgumentException("header cannot be null");
if ( stingSAMWriter == null ) throw new IllegalArgumentException("writer cannot be null");
if ( type == null ) throw new IllegalArgumentException("type cannot be null");
// prepare the bam header
final SAMFileHeader bamHeader = new SAMFileHeader();
bamHeader.setSequenceDictionary(header.getSequenceDictionary());
bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
// include the original read groups plus a new artificial one for the haplotypes
final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>(header.getReadGroups());
final SAMReadGroupRecord rg = new SAMReadGroupRecord(READ_GROUP_ID);
rg.setSample("HC");
rg.setSequencingCenter("BI");
readGroups.add(rg);
bamHeader.setReadGroups(readGroups);
// TODO -- this will be a performance problem at high-scale
stingSAMWriter.setPresorted(false);
stingSAMWriter.writeHeader(bamHeader);
return create(type, stingSAMWriter);
final ReadDestination toBam = new ReadDestination.ToBAM(stingSAMWriter, header, READ_GROUP_ID);
return create(type, toBam);
}
/**
@ -134,16 +119,16 @@ public abstract class HaplotypeBAMWriter {
* may come in out of order during writing
*
* @param type the type of the writer we want to create
* @param writer the destination, must not be null
* @param destination the destination, must not be null
* @return a new HaplotypeBAMWriter
*/
public static HaplotypeBAMWriter create(final Type type, final SAMFileWriter writer) {
if ( writer == null ) throw new IllegalArgumentException("writer cannot be null");
public static HaplotypeBAMWriter create(final Type type, final ReadDestination destination) {
if ( destination == null ) throw new IllegalArgumentException("writer cannot be null");
if ( type == null ) throw new IllegalArgumentException("type cannot be null");
switch ( type ) {
case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(writer);
case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(writer);
case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(destination);
case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(destination);
default: throw new IllegalArgumentException("Unknown type " + type);
}
}
@ -154,11 +139,10 @@ public abstract class HaplotypeBAMWriter {
* Assumes that the header has been fully initialized with a single
* read group READ_GROUP_ID
*
* @param bamWriter our output destination
* @param output our output destination
*/
protected HaplotypeBAMWriter(SAMFileWriter bamWriter) {
this.bamWriter = bamWriter;
this.bamHeader = bamWriter.getFileHeader();
protected HaplotypeBAMWriter(final ReadDestination output) {
this.output = output;
}
/**
@ -170,12 +154,18 @@ public abstract class HaplotypeBAMWriter {
* @param calledHaplotypes a set of the haplotypes that were actually called as non-reference
* @param stratifiedReadMap a map from sample -> likelihoods for each read for each of the best haplotypes
*/
public abstract void writeReadsAlignedToHaplotypes(final List<Haplotype> haplotypes,
public abstract void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes,
final GenomeLoc paddedReferenceLoc,
final List<Haplotype> bestHaplotypes,
final Collection<Haplotype> bestHaplotypes,
final Set<Haplotype> calledHaplotypes,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap);
/**
 * Convenience overload that treats every supplied haplotype as both a
 * "best" haplotype and a "called" haplotype, then delegates to the
 * five-argument writeReadsAlignedToHaplotypes.
 *
 * @param haplotypes the haplotypes to write against; also used as the best and called haplotypes
 * @param paddedReferenceLoc forwarded unchanged to the five-argument overload
 * @param stratifiedReadMap forwarded unchanged to the five-argument overload
 */
public void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes,
                                          final GenomeLoc paddedReferenceLoc,
                                          final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
    // the called-haplotypes argument must be a Set, so snapshot the collection into one
    final Set<Haplotype> everyHaplotypeAsCalled = new HashSet<>(haplotypes);
    writeReadsAlignedToHaplotypes(haplotypes,
            paddedReferenceLoc,
            haplotypes,
            everyHaplotypeAsCalled,
            stratifiedReadMap);
}
/**
* Write out read aligned to haplotype to the BAM file
*
@ -193,7 +183,7 @@ public abstract class HaplotypeBAMWriter {
final boolean isInformative) {
final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative);
if ( alignedToRef != null )
bamWriter.addAlignment(alignedToRef);
output.add(alignedToRef);
}
/**
@ -281,8 +271,9 @@ public abstract class HaplotypeBAMWriter {
protected void writeHaplotypesAsReads(final Collection<Haplotype> haplotypes,
final Set<Haplotype> bestHaplotypes,
final GenomeLoc paddedReferenceLoc) {
for ( final Haplotype haplotype : haplotypes )
writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype));
if ( isWriteHaplotypesAsWell() )
for ( final Haplotype haplotype : haplotypes )
writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype));
}
/**
@ -295,7 +286,7 @@ public abstract class HaplotypeBAMWriter {
private void writeHaplotype(final Haplotype haplotype,
final GenomeLoc paddedRefLoc,
final boolean isAmongBestHaplotypes) {
final GATKSAMRecord record = new GATKSAMRecord(bamHeader);
final GATKSAMRecord record = new GATKSAMRecord(output.getHeader());
record.setReadBases(haplotype.getBases());
record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef());
record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length));
@ -307,6 +298,14 @@ public abstract class HaplotypeBAMWriter {
record.setReferenceIndex(paddedRefLoc.getContigIndex());
record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID);
record.setFlags(16);
bamWriter.addAlignment(record);
output.add(record);
}
/**
 * @return the current value of the writeHaplotypesAsWell flag, which
 *         writeHaplotypesAsReads consults before emitting haplotypes
 */
public boolean isWriteHaplotypesAsWell() {
return writeHaplotypesAsWell;
}
/**
 * Set the writeHaplotypesAsWell flag.
 *
 * @param writeHaplotypesAsWell new value of the flag returned by isWriteHaplotypesAsWell()
 */
public void setWriteHaplotypesAsWell(boolean writeHaplotypesAsWell) {
this.writeHaplotypesAsWell = writeHaplotypesAsWell;
}
}

View File

@ -0,0 +1,135 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.utils.haplotypeBAMWriter;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/**
* Utility class that allows us to easily create destinations for the HaplotypeBAMWriters
*
* User: depristo
* Date: 6/19/13
* Time: 10:19 AM
*/
public abstract class ReadDestination {
    /** Header built once in the constructor and exposed through {@link #getHeader()}. */
    private final SAMFileHeader bamHeader;

    /**
     * Create the header associated with this destination.
     *
     * The new header takes its sequence dictionary and read groups from {@code header},
     * has its sort order set to coordinate, and gains one additional read group with ID
     * {@code readGroupID} (sample "HC", sequencing center "BI").
     *
     * @param header the header to derive the sequence dictionary and read groups from, must not be null
     * @param readGroupID the ID of the extra read group to add, must not be null
     * @throws IllegalArgumentException if header or readGroupID is null
     */
    protected ReadDestination(final SAMFileHeader header, final String readGroupID) {
        if ( header == null ) throw new IllegalArgumentException("header cannot be null");
        // fail fast: a null ID would otherwise only surface later as a malformed read group in the header
        if ( readGroupID == null ) throw new IllegalArgumentException("readGroupID cannot be null");

        // prepare the bam header
        bamHeader = new SAMFileHeader();
        bamHeader.setSequenceDictionary(header.getSequenceDictionary());
        bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);

        // include the original read groups plus a new artificial one for the haplotypes
        final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>(header.getReadGroups());
        final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupID);
        rg.setSample("HC");
        rg.setSequencingCenter("BI");
        readGroups.add(rg);
        bamHeader.setReadGroups(readGroups);
    }

    /**
     * Send a read to this destination
     *
     * @param read the read to add
     */
    public abstract void add(final GATKSAMRecord read);

    /**
     * Get the header created for this destination
     *
     * @return the header built by the constructor (never null)
     */
    public SAMFileHeader getHeader() {
        return bamHeader;
    }

    /**
     * A ReadDestination that forwards every read to a SAM/BAM writer
     */
    public static class ToBAM extends ReadDestination {
        final SAMFileWriter bamWriter;

        /**
         * Create a ReadDestination that writes to a BAM file
         *
         * Marks the writer as not presorted and immediately writes the newly built header to it.
         *
         * @param stingSAMWriter the writer that receives the header and all reads, must not be null
         * @param header the header to derive the output header from, must not be null
         * @param readGroupID the ID of the extra read group to add to the output header, must not be null
         * @throws IllegalArgumentException if any argument is null
         */
        public ToBAM(final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header, final String readGroupID) {
            super(header, readGroupID);
            if ( stingSAMWriter == null ) throw new IllegalArgumentException("writer cannot be null");
            bamWriter = stingSAMWriter;
            // NOTE(review): presumably needed because reads can be added out of coordinate order -- confirm
            stingSAMWriter.setPresorted(false);
            stingSAMWriter.writeHeader(getHeader());
        }

        /**
         * Pass the read to the underlying writer
         *
         * @param read the read to write
         */
        @Override
        public void add(final GATKSAMRecord read) {
            bamWriter.addAlignment(read);
        }
    }

    /**
     * A ReadDestination that keeps every added read in memory, in the order it was added
     */
    public static class ToList extends ReadDestination {
        // append-only buffer, so an array-backed list is sufficient
        final List<GATKSAMRecord> reads = new ArrayList<>();

        /**
         * Create a ReadDestination that captures the output reads in a list of reads
         *
         * @param header the header to derive this destination's header from, must not be null
         * @param readGroupID the ID of the extra read group to add to the header, must not be null
         * @throws IllegalArgumentException if header or readGroupID is null
         */
        public ToList(final SAMFileHeader header, final String readGroupID) {
            super(header, readGroupID);
        }

        /**
         * Append the read to the captured list
         *
         * @param read the read to capture
         */
        @Override
        public void add(final GATKSAMRecord read) {
            reads.add(read);
        }

        /**
         * Get the reads that have been written to this destination
         *
         * Note that this is the internal list itself, not a copy.
         *
         * @return a non-null list of reads
         */
        public List<GATKSAMRecord> getReads() {
            return reads;
        }
    }
}

View File

@ -59,6 +59,9 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
protected static final double INITIAL_CONDITION = Math.pow(2, 1020);
protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION);
// we divide the error probability e by 3 because the observed base could have come from any of the three non-observed alleles
protected static final double TRISTATE_CORRECTION = 3.0;
private static final int matchToMatch = 0;
private static final int indelToMatch = 1;
private static final int matchToInsertion = 2;
@ -146,7 +149,7 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) );
}
}
}

View File

@ -172,6 +172,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("getting DB tag with HM3", spec);
}
/**
 * Runs the annotator with two --comp bindings (H3 and foo) that both point
 * at the same fakeHM3.vcf file and checks the output against a fixed MD5.
 */
@Test
public void testDBTagWithTwoComps() {
    final String commandLine = baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf --comp:foo " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf";
    final int expectedOutputCount = 1;
    WalkerTestSpec twoCompSpec = new WalkerTestSpec(commandLine, expectedOutputCount,
            Arrays.asList("6afbf05090ae139f53467cf6e0e71cf4"));
    executeTest("getting DB tag with 2 comps", twoCompSpec);
}
@Test
public void testNoQuals() {
WalkerTestSpec spec = new WalkerTestSpec(

View File

@ -48,49 +48,24 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.collections.Pair;
import org.junit.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
public class BiasedDownsamplingIntegrationTest extends WalkerTest {
private final static String baseCommand1 = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommand2 = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:1,000,000-5,000,000";
private final static String baseCommand3 = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:4,000,000-5,000,000";
private final static String baseCommandUG = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:4,000,000-5,000,000";
private final static String baseCommandHC = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header -L 20:4,000,000-5,000,000" + " --useFilteredReadsForAnnotations";
private final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/";
// --------------------------------------------------------------------------------------------------------------
//
// testing UnifiedGenotyper contamination down-sampling
//
// --------------------------------------------------------------------------------------------------------------
@Test(enabled = false)
public void testContaminationDownsamplingFlat() {
WalkerTestSpec spec = new WalkerTestSpec(
baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contamination 0.20", 1,
Arrays.asList("1f9071466fc40f4c6a0f58ac8e9135fb"));
executeTest("test contamination_percentage_to_filter 0.20", spec);
}
@Test(enabled = false)
public void testContaminationDownsamplingFlatAndPerSample() {
WalkerTestSpec spec = new WalkerTestSpec(
baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_per_sample_file " + ArtificalBAMLocation + "NA12878.NA19240.contam.txt --contamination_fraction_to_filter 0.10", 1,
Arrays.asList("53395814dd6990448a01a294ccd69bd2"));
executeTest("test contamination_percentage_to_filter per-sample and .20 overall", spec);
}
@Test(enabled = false)
public void testContaminationDownsamplingPerSampleOnly() {
WalkerTestSpec spec = new WalkerTestSpec(
baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contaminationFile " + ArtificalBAMLocation + "NA19240.contam.txt", 1,
Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb"));
executeTest("test contamination_percentage_to_filter per-sample", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
@ -98,150 +73,49 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
@Test(enabled = false)
@Test
private void testDefaultContamination() {
final String bam1 = "NA11918.with.1.NA12842.reduced.bam";
final String bam2 = "NA12842.with.1.NA11918.reduced.bam";
WalkerTestSpec spec = new WalkerTestSpec(
baseCommand2 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s ", 1,
Arrays.asList("e2e5a8dd313f8d7e382e7d49dfac59a2"));
executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " with default downsampling.", spec);
baseCommandUG + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contamination .05 ", 1,
Arrays.asList("b13612312ff991cf40ddc44255e76ecd"));
executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " with .05 downsampling.", spec);
}
private void testFlatContamination(final String bam1, final String bam2, final Double downsampling, final String md5) {
WalkerTestSpec spec = new WalkerTestSpec(
baseCommand2 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contamination " + downsampling.toString(), 1,
Arrays.asList(md5));
executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " downsampling " + downsampling.toString(), spec);
}
@Test(enabled = false)
public void testFlatContaminationCase1() {
testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "e2e5a8dd313f8d7e382e7d49dfac59a2");
}
@Test(enabled = false)
public void testFlatContaminationCase2() {
testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "549737002f98775fea8f46e7ea174dde");
}
@Test(enabled = false)
public void testFlatContaminationCase3() {
testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "529d82c2a33fcc303a5dc55de2d56979");
}
@Test(enabled = false)
public void testFlatContaminationCase4() {
testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.1, "b5689972fbb7d230a372ee5f0da1c6d7");
}
@Test(enabled = false)
public void testFlatContaminationCase5() {
testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.2, "9dceee2e921b53fbc1ce137a7e0b7b74");
}
@Test(enabled = false)
public void testFlatContaminationCase6() {
testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.3, "d6a74061033503af80dcaea065bfa075");
}
@Test(enabled = false)
public void testFlatContaminationCase7() {
testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "7d1b5efab58a1b8f9d99fcf5af82f15a");
}
@Test(enabled = false)
public void testFlatContaminationCase8() {
testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "a7f8d5c79626aff59d7f426f79d8816e");
}
@Test(enabled = false)
public void testFlatContaminationCase9() {
testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.3, "fcf482398b7c908e3e2d1e4d5da6377b");
}
private void testPerSampleContamination(String bam1, String bam2, String persampleFile, final String md5) {
WalkerTestSpec spec = new WalkerTestSpec(
baseCommand2 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contaminationFile " + persampleFile, 1,
Arrays.asList(md5));
executeTest("test contamination on Artificial Contamination (per-sample) on " + bam1 + " and " + bam2 + " with " + persampleFile, spec);
}
@Test(enabled = false)
public void testPerSampleContaminationCase1() {
testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.1.txt", "e00278527a294833259e9e411728e395");
}
@Test(enabled = false)
public void testPerSampleContaminationCase2() {
testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.2.txt", "a443e793f0b0e2ffce1b751634d706e2");
}
@Test(enabled = false)
public void testPerSampleContaminationCase3() {
testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.3.txt", "e11d83a7815ce757afbcf7689568cb25");
}
@Test(enabled = false)
public void testPerSampleContaminationCase4() {
testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.4.txt", "615042eeeffe042bd1c86279d34f80b6");
}
@Test(enabled = false)
public void testPerSampleContaminationCase5() {
testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.1.txt", "9bc99fc79ca34744bf26cb19ee4ef44d");
}
@Test(enabled = false)
public void testPerSampleContaminationCase6() {
testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.2.txt", "143626fe5fce765d6c997a64f058a813");
}
@Test(enabled = false)
public void testPerSampleContaminationCase7() {
testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.3.txt", "f2593674cef894eda4e0be9cf3158f57");
}
@Test(enabled = false)
public void testPerSampleContaminationCase8() {
testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.4.txt", "fb7ce0740767ae3896b3e552026da1e4");
}
private void testPerSampleEqualsFlat(final String bam1, final String bam2, final String persampleFile, final Double downsampling, final String md5) {
final String command = baseCommand3 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s ";
WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList(md5));
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec);
spec = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList(md5));
rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec);
}
// verify that inputting a file with an effectively flat contamination level is equivalent to handing in a flat contamination level
@Test(enabled = false)
public void testPerSampleEqualsFlatContaminationCase1() {
testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0, "");
/**
 * Supplies the cases for the "PerSampleEqualFlatContamBams" provider.
 *
 * Each row is { first BAM name, second BAM name, per-sample contamination file path,
 * flat contamination fraction }.
 */
@DataProvider(name="PerSampleEqualFlatContamBams")
public Object[][] makePerSampleEqualFlatContamBams() {
    final String firstBam = "NA11918.with.2.NA12842.reduced.bam";
    final String secondBam = "NA12842.with.1.NA11918.reduced.bam";
    return new Object[][]{
            {firstBam, secondBam, ArtificalBAMLocation + "contamination.case.6.txt", 0.0},
            {firstBam, secondBam, ArtificalBAMLocation + "contamination.case.7.txt", 0.15},
            {firstBam, secondBam, ArtificalBAMLocation + "contamination.case.8.txt", 0.3}
    };
}
// Disabled case: compares the contamination.case.7.txt per-sample table against a flat level of 0.15,
// passing an empty expected md5 to the shared helper.
@Test(enabled = false)
public void testPerSampleEqualsFlatContaminationCase2() {
    final String firstBam = "NA11918.with.2.NA12842.reduced.bam";
    final String secondBam = "NA12842.with.1.NA11918.reduced.bam";
    final String contaminationTable = ArtificalBAMLocation + "contamination.case.7.txt";
    testPerSampleEqualsFlat(firstBam, secondBam, contaminationTable, 0.15, "");
}
@Test(dataProvider = "PerSampleEqualFlatContamBams")
private void testPerSampleEqualsFlat(final String bam1, final String bam2, final String persampleFile, final Double downsampling) {
final String command = baseCommandUG + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s ";
@Test(enabled = false)
public void testPerSampleEqualsFlatContaminationCase3() {
testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3, "");
}
WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList(""));
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
Pair<List<File>, List<String>> test1 = executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec);
spec = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList(""));
rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
Pair<List<File>, List<String>> test2 = executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec);
//verify that the md5s match up.
Assert.assertEquals(test1.getSecond().get(0),test2.getSecond().get(0));
}
// --------------------------------------------------------------------------------------------------------------
//
@ -250,50 +124,39 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
// Disabled case: runs HaplotypeCaller on the NA12878 SLX BAM over 1:10,000,000-10,010,000 with
// -contamination 0.20 and checks the single output against a fixed md5.
@Test(enabled = false)
public void testHCContaminationDownsamplingFlat() {
    final String walkerAndReference = "-T HaplotypeCaller -R " + b36KGReference + " --no_cmdline_in_header --dbsnp " + b36dbSNP129;
    final String inputAndOptions = " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contamination 0.20";
    final WalkerTestSpec flatSpec = new WalkerTestSpec(
            walkerAndReference + inputAndOptions,
            1,
            Arrays.asList("c3a253467ead7b1cfe9fd9dd310828b1"));
    executeTest("HC calling with contamination_percentage_to_filter 0.20", flatSpec);
}
// HaplotypeCaller can only (currently) use flat contamination reduction, not per-sample. Until that is implemented, this test
// expects HC to fail with a UserException when it is given a per-sample contamination file.
@Test(enabled = false)
public void testHCCannotProcessPerSampleContamination() {
final String baseCommand = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header -L 20:3,000,000-5,000,000";
final String bam1 = "NA11918.with.1.NA12842.reduced.bam";
final String perSampleFile = ArtificalBAMLocation + "contamination.case.1.txt";
WalkerTestSpec spec = new WalkerTestSpec(
baseCommand + " -I " + ArtificalBAMLocation + bam1 + " -o %s -contaminationFile " + perSampleFile, 1,
UserException.class);
executeTest("HC should fail on per-Sample contamination removal.", spec);
// Data rows for the "PerSampleEqualFlatContamBamsHC" provider. Each row is
// {bam1, bam2, per-sample contamination file, flat contamination level}.
@DataProvider(name="PerSampleEqualFlatContamBamsHC")
public Object[][] makePerSampleEqualFlatContamBamsHC() {
    final String firstBam = "NA11918.with.2.NA12842.reduced.bam";
    final String secondBam = "NA12842.with.1.NA11918.reduced.bam";
    return new Object[][]{
        {firstBam, secondBam, ArtificalBAMLocation + "contamination.case.6.txt", 0.0},
        {firstBam, secondBam, ArtificalBAMLocation + "contamination.case.7.txt", 0.15},
        {firstBam, secondBam, ArtificalBAMLocation + "contamination.case.8.txt", 0.3}
    };
}
private void testHCFlatContamination(final String bam1, final String bam2, final Double downsampling, final String md5) {
final String baseCommand = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header -L 20:3,000,000-5,000,000";
// Data-driven check that running with a per-sample contamination file and running with the
// corresponding flat -contamination value yield the same output md5.
//   bam1, bam2     BAM file names, resolved against ArtificalBAMLocation
//   persampleFile  path passed to -contaminationFile
//   downsampling   value passed to -contamination
@Test(dataProvider = "PerSampleEqualFlatContamBamsHC")
private void testPerSampleEqualsFlatHC(final String bam1, final String bam2, final String persampleFile, final Double downsampling) {
    final String command = baseCommandHC + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s ";

    // Run 1: per-sample contamination file. The expected md5 is left empty; the md5 produced by the
    // run is read back from the executeTest result below.
    final WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList(""));
    final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
    rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
    final Pair<List<File>, List<String>> test1 = executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec);

    // Run 2: flat contamination level. This must execute spec2; executing spec again would
    // compare the per-sample run with itself and the assertion below could never fail.
    final WalkerTestSpec spec2 = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList(""));
    rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
    final Pair<List<File>, List<String>> test2 = executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec2);

    //verify that the md5s match up.
    Assert.assertEquals(test1.getSecond().get(0),test2.getSecond().get(0));
WalkerTestSpec spec = new WalkerTestSpec(
baseCommand + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contamination " + downsampling.toString(), 1,
Arrays.asList(md5));
executeTest("HC test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " downsampling " + downsampling.toString(), spec);
}
// Disabled case: HaplotypeCaller flat-contamination run at level 0.05 on the two reduced BAMs.
@Test(enabled = false)
public void testHCFlatContaminationCase1() {
    final String firstBam = "NA11918.with.1.NA12842.reduced.bam";
    final String secondBam = "NA12842.with.1.NA11918.reduced.bam";
    final String expectedMd5 = "c3e695381d8627e3922d8c642b66c3ce";
    testHCFlatContamination(firstBam, secondBam, 0.05, expectedMd5);
}
// Disabled case: HaplotypeCaller flat-contamination run at level 0.1 on the two reduced BAMs.
@Test(enabled = false)
public void testHCFlatContaminationCase2() {
    final String firstBam = "NA11918.with.1.NA12842.reduced.bam";
    final String secondBam = "NA12842.with.1.NA11918.reduced.bam";
    final String expectedMd5 = "002d2b45336d88d7c04e19f9f26e29d9";
    testHCFlatContamination(firstBam, secondBam, 0.1, expectedMd5);
}
// Disabled case: HaplotypeCaller flat-contamination run at level 0.2 on the two reduced BAMs.
@Test(enabled = false)
public void testHCFlatContaminationCase3() {
    final String firstBam = "NA11918.with.1.NA12842.reduced.bam";
    final String secondBam = "NA12842.with.1.NA11918.reduced.bam";
    final String expectedMd5 = "1809a33ac112d1a3bd7a071c566794dd";
    testHCFlatContamination(firstBam, secondBam, 0.2, expectedMd5);
}
}
}

View File

@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "98f4d78aad745c6e853b81b2e4e207b4");
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "4dd1b38f0389e339ce8a05956956aa8a");
}
}

View File

@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","25902d7a6a0c00c60c2d5845dfaa1a4c");
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","39f559996f8d429839c585bbab68dbde");
}
@Test(enabled = true)

View File

@ -56,8 +56,8 @@ import java.util.List;
public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
private final static String baseCommandIndels = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
private final static String baseCommandIndels = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
// --------------------------------------------------------------------------------------------------------------
//
@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("ef8151aa699da3272c1ae0986d16ca21"));
Arrays.asList("3c8727ee6e2a6f10ab728c4869dd5b92"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -88,7 +88,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("7f88229ccefb74513efb199b61183cb8"));
Arrays.asList("0cbe889e03bab6512680ecaebd52c536"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("1928ad48bcd0ca180e046bc235cfb3f4"));
Arrays.asList("c6f0fa039ca5672469838bc9f52c72d3"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -111,7 +111,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("6663e434a7b549f23bfd52db90e53a1a"));
Arrays.asList("475f8148123792064130faf9f9030fec"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -121,7 +121,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("581c552664e536df6d0f102fb0d10e5a"));
Arrays.asList("a7e4e1bd128424d46cffdd538b220074"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("5596851d19582dd1af3901b7d703ae0a"));
Arrays.asList("8682738c2c66b502cdbf7db466a5c3e2"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("862d82c8aa35f1da4f9e67b5b48dfe52"));
Arrays.asList("d3721bee5edaa31fdd35edd7aa75feb3"));
executeTest("test minIndelFraction 0.0", spec);
}
@ -184,7 +184,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("8d9fc96be07db791737ac18135de4d63"));
Arrays.asList("a5b6d7b32953500d936d3dff512a6254"));
executeTest("test minIndelFraction 0.25", spec);
}

View File

@ -64,8 +64,8 @@ import java.util.Collections;
public class UnifiedGenotyperIntegrationTest extends WalkerTest {
private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam";
private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam";
// --------------------------------------------------------------------------------------------------------------
//
@ -85,7 +85,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("4aa226c00a242047cf427d0919003048"));
Arrays.asList("bc8a4e4ceb46776169b47146805c882a"));
executeTest("test SLOD", spec);
}
@ -101,7 +101,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testCompTrack() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("50937942e3d228614d2531c3be237709"));
Arrays.asList("21185d9a7519356ba672757f5a522971"));
executeTest("test using comp track", spec);
}
@ -175,12 +175,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "3b66f82dbb746875638e076bf51a1583" );
testHeterozosity( 0.01, "2f3051caa785c7c1e2a8b23fa4da90b1" );
}
@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "714c1795334c7c62c046a75479381ae6" );
testHeterozosity( 1.0 / 1850, "228df9e38580d8ffe1134da7449fa35e" );
}
private void testHeterozosity(final double arg, final String md5) {
@ -196,7 +196,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "6f79205f7ed8006470f056f6805db6c8";
private final static String COMPRESSED_OUTPUT_MD5 = "eebec02fdde9937bffaf44902ace6207";
@Test
public void testCompressedOutput() {
@ -217,24 +217,25 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
String md5 = "d408b4661b820ed86272415b8ea08780";
String md5 = "1f3fad09a63269c36e871e7ee04ebfaa";
final String myCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
Arrays.asList(md5));
executeTest("test parallelization (single thread)", spec1);
GenomeAnalysisEngine.resetRandomGenerator();
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1,
myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1,
Arrays.asList(md5));
executeTest("test parallelization (2 threads)", spec2);
GenomeAnalysisEngine.resetRandomGenerator();
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1,
myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1,
Arrays.asList(md5));
executeTest("test parallelization (4 threads)", spec3);
}
@ -252,7 +253,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("31be725b2a7c15e9769391ad940c0587"));
Arrays.asList("9f4e663e3b156b14fd55df3f5f0336a5"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -271,7 +272,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("dcc5cec42730567982def16da4a7f286"));
Arrays.asList("260bb73e2900334d5c3ff8123be0d2d8"));
executeTest(String.format("test calling with BAQ"), spec);
}

View File

@ -53,7 +53,7 @@ import java.util.Arrays;
public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
// --------------------------------------------------------------------------------------------------------------
//
@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("a9466c1e3ce1fc4bac83086b25a6df54"));
Arrays.asList("7f26ca78e550afa28df11d593c90ec9a"));
executeTest("test MultiSample Pilot1", spec);
}
@ -88,22 +88,22 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("aaadb2a355d87344eabb6ac4495a11e4"));
Arrays.asList("02b521fe88a6606a29c12c0885c3debd"));
executeTest("test SingleSample Pilot2", spec);
}
@Test
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("06c85e8eab08b67244cf38fc785aca22"));
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("dd5ad3beaa75319bb2ef1434d2dd9f73"));
executeTest("test Multiple SNP alleles", spec);
}
@Test
public void testBadRead() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1,
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1,
Arrays.asList("d915535c1458733f09f82670092fcab6"));
executeTest("test bad read", spec);
}
@ -111,16 +111,16 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
@Test
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("f3da1ff1e49a831af055ca52d6d07dd7"));
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("a973298b2801b80057bea88507e2858d"));
executeTest("test reverse trim", spec);
}
@Test
public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("20ff311f363c51b7385a76f6f296759c"));
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("8d91d98c4e79897690d3c6918b6ac761"));
executeTest("test mismatched PLs", spec);
}
}

View File

@ -62,7 +62,7 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
@Test
public void testReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("ffde0d5e23523e4bd9e7e18f62d37d0f"));
executeTest("test calling on a ReducedRead BAM", spec);
}
@ -74,13 +74,13 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
@Test
public void testReducedBamINDELs() {
testReducedCalling("INDEL", "4b4902327fb132f9aaab3dd5ace934e1");
testReducedCalling("INDEL", "942930038cf7fc9a80b969461aaa9aa6");
}
private void testReducedCalling(final String model, final String md5) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1,
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1,
Arrays.asList(md5));
executeTest("test calling on a ReducedRead BAM with " + model, spec);
}

View File

@ -57,14 +57,14 @@ import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCal
public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest {
private void HCTestComplexVariants(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
}
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "4a3479fc4ad387d381593b328f737a1b");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "df7be117bd3d256c4a5fbde925ecd19b");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -80,7 +80,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
}
private void HCTestComplexGGA(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec);
}
@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"b7a01525c00d02b3373513a668a43c6a");
"b787be740423b950f8529ccc838fabdd");
}
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"a2a42055b068334f415efb07d6bb9acd");
"8e6a2002c59eafb78bdbf1db9660164b");
}
}

View File

@ -46,154 +46,43 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 3/27/12
*/
import net.sf.samtools.*;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class DeBruijnAssemblerUnitTest extends BaseTest {
private final static boolean DEBUG = false;
// Builds one graph from each of two reference sequences with no reads and kmer size 10, and
// asserts that graph creation returns null for the first sequence and non-null for the second.
@Test(enabled = !DEBUG)
public void testReferenceCycleGraph() {
    final int kmerSize = 10;
    final String cyclicReference = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC";
    final String acyclicReference = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC";

    final DeBruijnAssembler cyclicAssembler = new DeBruijnAssembler();
    final DeBruijnGraph cyclicGraph = cyclicAssembler.createGraphFromSequences(
            new ArrayList<GATKSAMRecord>(),
            kmerSize,
            new Haplotype(cyclicReference.getBytes(), true),
            Collections.<Haplotype>emptyList());

    final DeBruijnAssembler acyclicAssembler = new DeBruijnAssembler();
    final DeBruijnGraph acyclicGraph = acyclicAssembler.createGraphFromSequences(
            new ArrayList<GATKSAMRecord>(),
            kmerSize,
            new Haplotype(acyclicReference.getBytes(), true),
            Collections.<Haplotype>emptyList());

    Assert.assertTrue(cyclicGraph == null, "Reference cycle graph should return null during creation.");
    Assert.assertTrue(acyclicGraph != null, "Reference non-cycle graph should not return null during creation.");
}
// Test double for DeBruijnGraphBuilder: records every kmer pair handed to addKmerPair in
// addedPairs so a test can inspect what was submitted, and turns flushKmersToGraph into a no-op.
private static class MockBuilder extends DeBruijnGraphBuilder {
// Kmer pairs received by addKmerPair, in call order.
public final List<Kmer> addedPairs = new LinkedList<Kmer>();
// Wraps a fresh DeBruijnGraph of the given kmer size.
private MockBuilder(final int kmerSize) {
super(new DeBruijnGraph(kmerSize));
}
// Logs and records the pair; the multiplicity argument is ignored.
@Override
public void addKmerPair(Kmer kmerPair, int multiplicity) {
logger.info("addKmerPair" + kmerPair);
addedPairs.add(kmerPair);
}
@Override
public void flushKmersToGraph(boolean addRefEdges) {
// do nothing
}
}
@DataProvider(name = "AddReadKmersToGraph")
public Object[][] makeAddReadKmersToGraphData() {
public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@DataProvider(name = "MyDataProvider")
public Object[][] makeMyDataProvider() {
List<Object[]> tests = new ArrayList<Object[]>();
// this functionality can be adapted to provide input data for whatever you might want in your data
final String bases = "ACGTAACCGGTTAAACCCGGGTTT";
final int readLen = bases.length();
final List<Integer> allBadStarts = new ArrayList<Integer>(readLen);
for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i);
final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000";
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "55faaae5617857e2b29848367999aa3e"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "e32b7fc4de29ed141dcafc0d789d5ed6"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "ecac86e8ef4856e6dfa306c436e9b545"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "7828256b82df377cc3a26a55dbf68f91"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "e41e0acf172a994e938a150390badd39"});
for ( final int kmerSize : Arrays.asList(3, 4, 5) ) {
for ( final int nBadQuals : Arrays.asList(0, 1, 2) ) {
for ( final List<Integer> badStarts : Utils.makePermutations(allBadStarts, nBadQuals, false) ) {
tests.add(new Object[]{bases, kmerSize, badStarts});
}
}
}
return tests.toArray(new Object[][]{});
}
// Builds one artificial read from `bases` with quality 20 everywhere except the positions in
// badQualsSites (quality 0), passes it to DeBruijnAssembler.addReadKmersToGraph with a recording
// builder, and checks the recorded kmer pairs against those computed here: every window of
// kmerSize + 1 bases that lies inside the read and whose qualities all reach the assembler's minimum.
@Test(dataProvider = "AddReadKmersToGraph", enabled = ! DEBUG)
public void testAddReadKmersToGraph(final String bases, final int kmerSize, final List<Integer> badQualsSites) {
    final int readLen = bases.length();
    final DeBruijnAssembler underTest = new DeBruijnAssembler();
    final MockBuilder recorder = new MockBuilder(kmerSize);

    // Set up the read: uniform quality 20, with the requested sites zeroed out.
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
    final byte[] baseQuals = Utils.dupBytes((byte)20, bases.length());
    for ( final int lowQualSite : badQualsSites ) {
        baseQuals[lowQualSite] = 0;
    }
    final GATKSAMRecord syntheticRead = ArtificialSAMUtils.createArtificialRead(samHeader, "myRead", 0, 1, readLen);
    syntheticRead.setReadBases(bases.getBytes());
    syntheticRead.setBaseQualities(baseQuals);

    // Work out which kmer pairs should be added.
    final Set<String> expectedPairs = new HashSet<String>();
    int expectedPairCount = 0;
    final int pairLength = kmerSize + 1; // +1 is for pairing
    for ( int start = 0; start < readLen; start++ ) {
        boolean usable = true;
        for ( int offset = 0; offset < pairLength; offset++ ) {
            final int pos = start + offset;
            usable &= pos < readLen && baseQuals[pos] >= underTest.getMinBaseQualityToUseInAssembly();
        }
        if ( !usable ) {
            continue;
        }
        expectedPairCount++;
        expectedPairs.add(bases.substring(start, start + pairLength));
    }

    underTest.addReadKmersToGraph(recorder, Arrays.asList(syntheticRead));

    Assert.assertEquals(recorder.addedPairs.size(), expectedPairCount);
    for ( final Kmer recorded : recorder.addedPairs ) {
        final String recordedBases = new String(recorded.bases());
        Assert.assertTrue(expectedPairs.contains(recordedBases), "Couldn't find kmer " + recorded + " among all expected kmers " + expectedPairs);
    }
}
/**
 * Supplies the argument rows for the "AddGGAKmersToGraph" data provider.
 *
 * Each row is {bases, kmerSize}: the same fixed base string paired with
 * kmer sizes 3, 4 and 5.
 *
 * @return one Object[] per test invocation
 */
@DataProvider(name = "AddGGAKmersToGraph")
public Object[][] makeAddGGAKmersToGraphData() {
    final List<Object[]> tests = new ArrayList<Object[]>();

    final String bases = "ACGTAACCGGTTAAACCCGGGTTT";
    for ( final int kmerSize : Arrays.asList(3, 4, 5) ) {
        tests.add(new Object[]{bases, kmerSize});
    }

    return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "AddGGAKmersToGraph", enabled = ! DEBUG)
public void testAddGGAKmersToGraph(final String bases, final int kmerSize) {
final int readLen = bases.length();
final DeBruijnAssembler assembler = new DeBruijnAssembler();
final MockBuilder builder = new MockBuilder(kmerSize);
final Set<String> expectedBases = new HashSet<String>();
final Set<Integer> expectedStarts = new LinkedHashSet<Integer>();
for ( int i = 0; i < readLen; i++) {
boolean good = true;
for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing
good &= i + j < readLen;
}
if ( good ) {
expectedStarts.add(i);
expectedBases.add(bases.substring(i, i + kmerSize + 1));
}
}
assembler.addGGAKmersToGraph(builder, Arrays.asList(new Haplotype(bases.getBytes())));
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
for ( final Kmer addedKmer : builder.addedPairs ) {
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
}
/**
 * Runs HaplotypeCaller on one BAM / interval / reference-confidence-mode combination
 * from "MyDataProvider" and compares the output against the expected MD5.
 *
 * @param bam       path of the input BAM, passed with -I
 * @param mode      reference confidence mode, passed with -ERC
 * @param intervals interval arguments, inserted verbatim into the command line
 * @param md5       expected MD5 of the output
 */
@Test(dataProvider = "MyDataProvider")
public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) {
    final String testName = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode;
    final String hcArgs = String.format(
            "-T HaplotypeCaller --disableDithering -R %s -I %s %s -ERC %s --no_cmdline_in_header",
            b37KGReference, bam, intervals, mode);
    // The trailing "-o %s" is left unformatted here; presumably the test harness fills in the output file.
    executeTest(testName, new WalkerTestSpec(hcArgs + " -o %s", Arrays.asList(md5)));
}
}

View File

@ -71,19 +71,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
private void HCTest(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCaller: args=" + args, spec);
}
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "baa5a2eedc8f06ce9f8f98411ee09f8a");
HCTest(CEUTRIO_BAM, "", "c0b1b64c6005cd3640ffde5dbc10174b");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "f09e03d41238697b23f95716a12667cb");
HCTest(NA12878_BAM, "", "439ce9024f04aad08eab1526d887e295");
}
@Test(enabled = false) // can't annotate the rsID's yet
@ -93,8 +93,8 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"130d36448faeb7b8d4bce4be12dacd3a");
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"b09437f11db40abd49195110e50692c2");
}
@Test
@ -110,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "7c20aa62633f4ce8ebf12950fbf05ec0");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "c57c463542304fb7b2576e531faca89e");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@ -147,7 +147,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerNearbySmallIntervals() {
HCTestNearbySmallIntervals(NA12878_BAM, "", "0ddc56f0a0fbcfefda79aa20b2ecf603");
HCTestNearbySmallIntervals(NA12878_BAM, "", "75820a4558a559b3e1636fdd1b776ea2");
}
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -157,7 +157,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@ -185,16 +185,16 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("5fe9310addf881bed4fde2354e59ce34"));
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@Test
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("26a9917f6707536636451266de0116c3"));
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("6a9222905c740b9208bf3c67478514eb"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
@ -208,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("cc6f2a76ee97ecc14a5f956ffbb21d88"));
Arrays.asList("58a0089e6ebf7cee414adb7a6002d43f"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -217,7 +217,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("51e91c8af61a6b47807165906baefb00"));
Arrays.asList("1352cbe1404aefc94eb8e044539a9882"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
}

View File

@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
List<Object[]> tests = new ArrayList<Object[]>();
for ( final int nct : Arrays.asList(1, 2, 4) ) {
tests.add(new Object[]{nct, "9da4cc89590c4c64a36f4a9c820f8609"});
tests.add(new Object[]{nct, "6f8c3cac54eb1460e2c65fe00978b1c1"});
}
return tests.toArray(new Object[][]{});

View File

@ -87,15 +87,6 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
}
private enum Assembler {DEBRUIJN_ASSEMBLER, READ_THREADING_ASSEMBLER}
private LocalAssemblyEngine createAssembler(final Assembler type) {
switch ( type ) {
case DEBRUIJN_ASSEMBLER: return new DeBruijnAssembler();
case READ_THREADING_ASSEMBLER: return new ReadThreadingAssembler();
default: throw new IllegalStateException("Unexpected " + type);
}
}
@DataProvider(name = "AssembleIntervalsData")
public Object[][] makeAssembleIntervalsData() {
List<Object[]> tests = new ArrayList<Object[]>();
@ -107,12 +98,10 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
final int stepSize = 200;
final int nReadsToUse = 5;
for ( final Assembler assembler : Assembler.values() ) {
for ( int startI = start; startI < end; startI += stepSize) {
final int endI = startI + windowSize;
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI);
tests.add(new Object[]{assembler, refLoc, nReadsToUse});
}
for ( int startI = start; startI < end; startI += stepSize) {
final int endI = startI + windowSize;
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI);
tests.add(new Object[]{new ReadThreadingAssembler(), refLoc, nReadsToUse});
}
return tests.toArray(new Object[][]{});
@ -130,13 +119,11 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
final int variantStepSize = 1;
final int nReadsToUse = 5;
for ( final Assembler assembler : Assembler.values() ) {
for ( int startI = start; startI < end; startI += stepSize) {
final int endI = startI + windowSize;
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI);
for ( int variantStart = windowSize / 2 - 10; variantStart < windowSize / 2 + 10; variantStart += variantStepSize ) {
tests.add(new Object[]{assembler, refLoc, nReadsToUse, variantStart});
}
for ( int startI = start; startI < end; startI += stepSize) {
final int endI = startI + windowSize;
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI);
for ( int variantStart = windowSize / 2 - 10; variantStart < windowSize / 2 + 10; variantStart += variantStepSize ) {
tests.add(new Object[]{new ReadThreadingAssembler(), refLoc, nReadsToUse, variantStart});
}
}
@ -144,7 +131,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
@Test(dataProvider = "AssembleIntervalsData")
public void testAssembleRef(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse) {
public void testAssembleRef(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse) {
final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
final List<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
@ -163,7 +150,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
@Test(dataProvider = "AssembleIntervalsWithVariantData")
public void testAssembleRefAndSNP(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
public void testAssembleRefAndSNP(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
final Allele refBase = Allele.create(refBases[variantSite], true);
final Allele altBase = Allele.create((byte)(refBase.getBases()[0] == 'A' ? 'C' : 'A'), false);
@ -172,7 +159,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
@Test(dataProvider = "AssembleIntervalsWithVariantData")
public void testAssembleRefAndDeletion(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
public void testAssembleRefAndDeletion(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
for ( int deletionLength = 1; deletionLength < 10; deletionLength++ ) {
final Allele refBase = Allele.create(new String(refBases).substring(variantSite, variantSite + deletionLength + 1), true);
@ -183,7 +170,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
@Test(dataProvider = "AssembleIntervalsWithVariantData")
public void testAssembleRefAndInsertion(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
public void testAssembleRefAndInsertion(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
for ( int insertionLength = 1; insertionLength < 10; insertionLength++ ) {
final Allele refBase = Allele.create(refBases[variantSite], false);
@ -193,7 +180,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
}
private void testAssemblyWithVariant(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final int nReadsToUse, final VariantContext site) {
private void testAssemblyWithVariant(final ReadThreadingAssembler assembler, final byte[] refBases, final GenomeLoc loc, final int nReadsToUse, final VariantContext site) {
final String preRef = new String(refBases).substring(0, site.getStart());
final String postRef = new String(refBases).substring(site.getEnd() + 1, refBases.length);
final byte[] altBases = (preRef + site.getAlternateAllele(0).getBaseString() + postRef).getBytes();
@ -217,7 +204,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
private List<Haplotype> assemble(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final List<GATKSAMRecord> reads) {
private List<Haplotype> assemble(final ReadThreadingAssembler assembler, final byte[] refBases, final GenomeLoc loc, final List<GATKSAMRecord> reads) {
final Haplotype refHaplotype = new Haplotype(refBases, true);
final Cigar c = new Cigar();
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
@ -225,9 +212,8 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
final ActiveRegion activeRegion = new ActiveRegion(loc, null, true, genomeLocParser, 0);
activeRegion.addAll(reads);
final LocalAssemblyEngine engine = createAssembler(assembler);
// logger.warn("Assembling " + activeRegion + " with " + engine);
return engine.runLocalAssembly(activeRegion, refHaplotype, refBases, loc, Collections.<VariantContext>emptyList(), null);
return assembler.runLocalAssembly(activeRegion, refHaplotype, refBases, loc, Collections.<VariantContext>emptyList(), null);
}
@DataProvider(name = "SimpleAssemblyTestData")
@ -239,30 +225,25 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
final int windowSize = 200;
final int end = start + windowSize;
final Map<Assembler, Integer> edgeExcludesByAssembler = new EnumMap<>(Assembler.class);
edgeExcludesByAssembler.put(Assembler.DEBRUIJN_ASSEMBLER, 26);
edgeExcludesByAssembler.put(Assembler.READ_THREADING_ASSEMBLER, 25); // TODO -- decrease to zero when the edge calling problem is fixed
final int excludeVariantsWithinXbp = 25; // TODO -- decrease to zero when the edge calling problem is fixed
final String ref = new String(seq.getSubsequenceAt(contig, start, end).getBases());
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, start, end);
for ( final Assembler assembler : Assembler.values() ) {
final int excludeVariantsWithXbp = edgeExcludesByAssembler.get(assembler);
for ( int snpPos = 0; snpPos < windowSize; snpPos++) {
if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) {
if ( snpPos > excludeVariantsWithinXbp && (windowSize - snpPos) >= excludeVariantsWithinXbp ) {
final byte[] altBases = ref.getBytes();
altBases[snpPos] = altBases[snpPos] == 'A' ? (byte)'C' : (byte)'A';
final String alt = new String(altBases);
tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt});
tests.add(new Object[]{"SNP at " + snpPos, new ReadThreadingAssembler(), refLoc, ref, alt});
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "SimpleAssemblyTestData")
public void testSimpleAssembly(final String name, final Assembler assembler, final GenomeLoc loc, final String ref, final String alt) {
public void testSimpleAssembly(final String name, final ReadThreadingAssembler assembler, final GenomeLoc loc, final String ref, final String alt) {
final byte[] refBases = ref.getBytes();
final byte[] altBases = alt.getBytes();

View File

@ -0,0 +1,408 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import net.sf.samtools.SAMFileHeader;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.variant.variantcontext.GenotypeType;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
public class ReferenceConfidenceModelUnitTest extends BaseTest {
// Genome location parser; assigned in setUp() from the artificial header's sequence dictionary.
GenomeLocParser parser;
// ID of the single read group that setUp() creates and adds to the header.
final String RGID = "ID1";
// Read group record; created in setUp() and tagged with the sample name below.
GATKSAMReadGroupRecord rg;
// Sample name assigned to the read group.
final String sample = "NA12878";
// One-element sample set handed to the ReferenceConfidenceModel constructor.
final Set<String> samples = Collections.singleton(sample);
// Artificial SAM header; created in setUp().
SAMFileHeader header;
// Model under test; rebuilt by setupModel() before every test method.
ReferenceConfidenceModel model;
/**
 * One-time fixture setup: creates the read group for the test sample, an artificial
 * SAM header carrying that read group, and a GenomeLocParser over the header's
 * sequence dictionary.
 */
@BeforeClass
public void setUp() throws Exception {
    final SAMFileHeader artificialHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);

    final GATKSAMReadGroupRecord readGroup = new GATKSAMReadGroupRecord(RGID);
    readGroup.setSample(sample);
    artificialHeader.addReadGroup(readGroup);

    this.header = artificialHeader;
    this.rg = readGroup;
    this.parser = new GenomeLocParser(artificialHeader.getSequenceDictionary());
}
/**
 * Gives every test method its own ReferenceConfidenceModel built from the shared
 * parser, sample set and header.
 */
@BeforeMethod
public void setupModel() {
    // NOTE(review): the meaning of the trailing 10 is not visible here — confirm against the constructor.
    this.model = new ReferenceConfidenceModel(this.parser, this.samples, this.header, 10);
}
/**
 * Builds the rows for the "CalcNIndelInformativeReadsData" provider.
 *
 * Each row is {read bases, reference bases, max indel size, expected value per position},
 * in the parameter order of testCalcNIndelInformativeReads.
 *
 * NOTE(review): in the repeat-unit section the full-reference row is added once per
 * truncated read length, so it appears several times, and with a single repeat copy
 * no rows are produced at all. Both match the existing behaviour; confirm it is intended.
 *
 * @return one Object[] per test invocation
 */
@DataProvider(name = "CalcNIndelInformativeReadsData")
public Object[][] makeMyDataProvider() {
    final List<Object[]> rows = new ArrayList<Object[]>();

    // Very basic testing: the read equals a four-base reference, with growing indel sizes.
    final String acgt = "ACGT";
    rows.add(new Object[]{acgt, acgt, 1, Arrays.asList(1, 1, 1, 0)});
    rows.add(new Object[]{acgt, acgt, 2, Arrays.asList(1, 1, 0, 0)});
    rows.add(new Object[]{acgt, acgt, 3, Arrays.asList(1, 0, 0, 0)});
    rows.add(new Object[]{acgt, acgt, 4, Arrays.asList(0, 0, 0, 0)});

    // Interesting case where some sites aren't informative: a homopolymer run flanked by N's.
    final String homopolymerRef = "NNAAAANN";
    final List<Integer> leadingAnchorOnly = Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0);
    for ( final String partialRead : Arrays.asList("NNA", "NNAA", "NNAAA", "NNAAAA") ) {
        rows.add(new Object[]{partialRead, homopolymerRef, 1, leadingAnchorOnly});
    }
    rows.add(new Object[]{"NNAAAAN", homopolymerRef, 1, Arrays.asList(1, 1, 1, 1, 1, 1, 0, 0)});

    // Tandem repeats of several unit sizes, each flanked on both sides by an N anchor.
    for ( final String unit : Arrays.asList("A", "CA", "TAG", "TAGC", "TCAGA") ) {
        final int unitLen = unit.length();
        final String flank = Utils.dupString("N", unitLen);
        final int flankLen = flank.length();

        for ( int copies = 1; copies < 10; copies++ ) {
            final String repeat = Utils.dupString(unit, copies);
            final int repeatLen = repeat.length();
            final String ref = flank + repeat + flank;

            // Expectation for a read equal to the whole reference: 1 everywhere except the trailing flank.
            final List<Integer> fullRefExpected = new ArrayList<>(Collections.nCopies(ref.length() - flankLen, 1));
            fullRefExpected.addAll(Collections.nCopies(flankLen, 0));

            for ( int prefixLen = unitLen; prefixLen < repeatLen; prefixLen++ ) {
                // The read always stops short of the end of the repeat, so only the leading flank is 1.
                final String truncatedRead = flank + repeat.substring(0, prefixLen);
                final List<Integer> truncatedExpected = new LinkedList<>(Collections.nCopies(flankLen, 1));
                truncatedExpected.addAll(Collections.nCopies(repeatLen + flankLen, 0));

                rows.add(new Object[]{truncatedRead, ref, unitLen, truncatedExpected});
                rows.add(new Object[]{ref, ref, unitLen, fullRefExpected});
            }
        }
    }

    return rows.toArray(new Object[][]{});
}
/**
 * For every offset within the read, builds a single-read pileup at that offset and
 * checks that calcNIndelInformativeReads returns the expected value for that position.
 *
 * @param readBases    bases of the artificial read (all qualities are 30, cigar is all M)
 * @param ref          reference bases handed to the model
 * @param maxIndelSize maximum indel size handed to the model
 * @param expected     expected result indexed by position; only the first
 *                     readBases-length entries are checked
 */
@Test(dataProvider = "CalcNIndelInformativeReadsData")
public void testCalcNIndelInformativeReads(final String readBases, final String ref, final int maxIndelSize, final List<Integer> expected ) {
    final byte[] baseQuals = Utils.dupBytes((byte)30, readBases.length());
    final String cigar = readBases.length() + "M";
    final int nPositions = readBases.getBytes().length;

    int offset = 0;
    while ( offset < nPositions ) {
        // A fresh read and a one-base location for each offset under test.
        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(), baseQuals, cigar);
        final GenomeLoc site = new UnvalidatingGenomeLoc("20", 0, offset, offset);
        final ReadBackedPileup pileup = new ReadBackedPileupImpl(site, Collections.singletonList(read), offset);

        final int observed = model.calcNIndelInformativeReads(pileup, offset, ref.getBytes(), maxIndelSize);
        Assert.assertEquals(observed, (int)expected.get(offset), "failed at position " + offset);
        offset++;
    }
}
/**
 * Smoke test: close() on a freshly built model must complete without throwing.
 */
@Test
public void testClose() {
    this.model.close();
}
/**
 * Checks that getGLwithWorstGQ returns the very same likelihood object (identity, not
 * equality) expected for each pair, whichever order the two arguments are given in.
 */
@Test
public void testWorstGL() {
    final GenotypeLikelihoods plsGq10 = GenotypeLikelihoods.fromPLField("0,10,100");
    final GenotypeLikelihoods plsGq20 = GenotypeLikelihoods.fromPLField("0,20,200");
    final GenotypeLikelihoods plsGq0 = GenotypeLikelihoods.fromPLField("20,0,200");

    // Each row: {first argument, second argument, instance that must be returned}.
    final GenotypeLikelihoods[][] cases = {
            {plsGq10, plsGq20, plsGq10},
            {plsGq20, plsGq10, plsGq10},
            {plsGq10, plsGq0, plsGq0},
            {plsGq0, plsGq10, plsGq0},
    };
    for ( final GenotypeLikelihoods[] c : cases ) {
        Assert.assertSame(model.getGLwithWorstGQ(c[0], c[1]), c[2]);
    }
}
/**
 * Verifies the indel PLs produced by the model:
 * for an argument of 0 all three PLs are 0 and the hom-ref GQ is 0, and for every
 * argument from 1 up to 9999 the hom-ref GQ, the het PL and the hom-var PL are each
 * strictly greater than for the previous argument.
 *
 * NOTE(review): the argument of getIndelPLs is presumably a count of indel-informative
 * reads -- confirm against ReferenceConfidenceModel.
 */
@Test
public void testIndelLikelihoods() {
    GenotypeLikelihoods prev = model.getIndelPLs(0);
    Assert.assertEquals(prev.getAsPLs(), new int[]{0, 0, 0});
    Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0);

    for ( int n = 1; n < 10000; n++ ) {
        final GenotypeLikelihoods current = model.getIndelPLs(n);

        final double prevGQ = -10 * prev.getLog10GQ(GenotypeType.HOM_REF);
        final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF);
        Assert.assertTrue(prevGQ < currGQ, "GQ Failed with prev " + prev + " curr " + current + " at " + n);

        // index 1 is the het PL, index 2 the hom-var PL
        final int[] prevPLs = prev.getAsPLs();
        final int[] currPLs = current.getAsPLs();
        Assert.assertTrue(prevPLs[1] < currPLs[1], "het PL failed with prev " + prev + " curr " + current + " at " + n);
        Assert.assertTrue(prevPLs[2] < currPLs[2], "hom-var PL Failed with prev " + prev + " curr " + current + " at " + n);

        prev = current;
    }
}
/**
 * Probes getOverlappingVariantContext at every position from 8 to 20 against a
 * deliberately unsorted set of calls: SNPs at 10 and 13, a deletion starting at 12
 * (expected to be found at 12, 14 and 15) and an insertion at 18.
 * At position 13, where the SNP and the deletion overlap, the SNP is the expected answer.
 */
@Test
public void testOverlappingVariantContext() {
    final VariantContext snpAt10 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 10, Arrays.asList("A", "C"));
    final VariantContext snpAt13 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 13, Arrays.asList("A", "C"));
    final VariantContext delAt12 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 12, Arrays.asList("ACAT", "A"));
    final VariantContext insAt18 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 18, Arrays.asList("A", "ACAT"));
    final List<VariantContext> calls = Arrays.asList(snpAt13, delAt12, insAt18, snpAt10);

    // expected answer for positions 8, 9, ..., 20 (null means "nothing overlaps")
    final int firstPos = 8;
    final VariantContext[] expectedByPos = {
            null,     // 8
            null,     // 9
            snpAt10,  // 10
            null,     // 11
            delAt12,  // 12
            snpAt13,  // 13
            delAt12,  // 14
            delAt12,  // 15
            null,     // 16
            null,     // 17
            insAt18,  // 18
            null,     // 19
            null,     // 20
    };

    for ( int k = 0; k < expectedByPos.length; k++ ) {
        checkOverlapping(firstPos + k, calls, expectedByPos[k]);
    }
}
/**
 * Asserts that the model finds {@code expected} (possibly null) among {@code calls}
 * at the single-base location {@code pos}.
 *
 * The location is created on the first contig known to the parser.
 * NOTE(review): callers build their calls on "chr1", so this presumably relies on
 * "chr1" being the first contig -- confirm against the test fixture.
 *
 * @param pos      position to probe
 * @param calls    candidate variant contexts
 * @param expected the context that must be returned, or null if none should overlap
 */
private void checkOverlapping(final int pos, Collection<VariantContext> calls, final VariantContext expected) {
    final String firstContig = parser.getContigs().getSequences().get(0).getSequenceName();
    final GenomeLoc site = parser.createGenomeLoc(firstContig, pos, pos);
    Assert.assertEquals(model.getOverlappingVariantContext(site, calls), expected);
}
//
// test reference calculation
//
/**
 * Test fixture bundling a reference sequence placed on "chr1" starting at position 100,
 * the active region covering it (with the requested extension), the reference haplotype
 * built from the N-padded reference, and a factory for reads drawn from the reference.
 *
 * Inner (non-static) class: it uses the enclosing test's parser, header and read group.
 */
private class RefConfData {
    final String ref;
    final int extension;
    final Haplotype refHap;
    final GenomeLoc refLoc;
    final GenomeLoc paddedRefLoc;
    final ActiveRegion region;
    int readCounter = 0;

    /**
     * @param ref       reference bases of the (unextended) region
     * @param extension number of bases the region and padded location extend on each side
     */
    private RefConfData(String ref, int extension) {
        // ref must be assigned first: getStart()/getEnd() below depend on it
        this.ref = ref;
        this.extension = extension;

        final int first = getStart();
        final int last = getEnd();
        this.refLoc = parser.createGenomeLoc("chr1", first, last);
        this.paddedRefLoc = parser.createGenomeLoc("chr1", first - extension, last + extension);
        this.region = new ActiveRegion(this.refLoc, parser, extension);

        // the padded reference is the real reference flanked by `extension` Ns on each side
        final String flank = Utils.dupString("N", extension);
        final byte[] paddedBases = (flank + ref + flank).getBytes();
        this.refHap = ReferenceConfidenceModel.createReferenceHaplotype(this.region, paddedBases, this.paddedRefLoc);
    }

    public GenomeLoc getRefLoc() {
        return refLoc;
    }

    public GenomeLoc getPaddedRefLoc() {
        return paddedRefLoc;
    }

    public ActiveRegion getActiveRegion() {
        return region;
    }

    public Haplotype getRefHap() {
        return refHap;
    }

    /** First genomic position of the reference; fixed at 100. */
    public int getStart() {
        return 100;
    }

    /** Last genomic position of the reference (inclusive). */
    public int getEnd() {
        return getStart() + getRefLength() - 1;
    }

    public byte[] getRefBases() {
        return ref.getBytes();
    }

    public int getRefLength() {
        return ref.length();
    }

    /**
     * Builds a uniquely named, fully matching read copied from the reference.
     *
     * @param start  0-based offset into the reference where the read begins
     * @param length number of bases in the read
     * @return the read, with base quality 30 throughout and the test's read group set
     */
    public GATKSAMRecord makeRead(final int start, final int length) {
        final byte[] quals = Utils.dupBytes((byte)30, length);
        final String readName = "read " + readCounter++;
        final byte[] bases = ref.substring(start, start + length).getBytes();
        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, start + getStart(), bases, quals, length + "M");
        read.setReadGroup(rg);
        return read;
    }
}
/**
 * Supplies {read count, extension} pairs: read counts 0 through 9, each combined
 * with extensions 0 and 10 (read count varies slowest).
 */
@DataProvider(name = "RefConfidenceData")
public Object[][] makeRefConfidenceData() {
    final int[] extensions = {0, 10};
    final List<Object[]> tests = new ArrayList<>();

    for ( int nReads = 0; nReads < 10; nReads++ ) {
        for ( final int extension : extensions ) {
            tests.add(new Object[]{nReads, extension});
        }
    }

    return tests.toArray(new Object[tests.size()][]);
}
/**
 * Reference-confidence calculation with no variant calls: {@code nReads} identical reads,
 * each spanning the whole reference, are added to the region, and every position of the
 * region is expected to report a depth of {@code nReads}.
 *
 * @param nReads    number of full-length reads to add
 * @param extension active region extension, in bases
 */
@Test(dataProvider = "RefConfidenceData")
public void testRefConfidenceBasic(final int nReads, final int extension) {
    final RefConfData data = new RefConfData("ACGTAACCGGTT", extension);
    final Haplotype refHap = data.getRefHap();
    final List<Haplotype> onlyRefHaplotype = Arrays.asList(refHap);
    final List<VariantContext> noCalls = Collections.emptyList();

    for ( int added = 0; added < nReads; added++ ) {
        data.getActiveRegion().add(data.makeRead(0, data.getRefLength()));
    }

    final Map<String, PerReadAlleleLikelihoodMap> likelihoods =
            HaplotypeCaller.createDummyStratifiedReadMap(refHap, new ArrayList<>(samples), data.getActiveRegion());
    final List<Integer> expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads);

    final List<VariantContext> contexts =
            model.calculateRefConfidence(refHap, onlyRefHaplotype, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, noCalls);
    checkReferenceModelResult(data, contexts, expectedDPs, noCalls);
}
/**
 * Reference-confidence calculation with a single read covering only part of the region.
 * For every read length from 3 up to (but excluding) the reference length, and every
 * start offset below {@code ref.length() - readLen}, the expected depth is 1 under the
 * read and 0 everywhere else.
 */
@Test
public void testRefConfidencePartialReads() {
    final String ref = "ACGTAACCGGTT";
    final List<VariantContext> noCalls = Collections.emptyList();

    for ( int readLen = 3; readLen < ref.length(); readLen++ ) {
        final int startLimit = ref.length() - readLen;
        for ( int start = 0; start < startLimit; start++ ) {
            // a fresh fixture per case so reads never accumulate in the region
            final RefConfData data = new RefConfData(ref, 0);
            final List<Haplotype> onlyRefHaplotype = Arrays.asList(data.getRefHap());
            data.getActiveRegion().add(data.makeRead(start, readLen));

            final Map<String, PerReadAlleleLikelihoodMap> likelihoods =
                    HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion());

            // depth 0 everywhere, then 1 over the stretch [start, start + readLen)
            final List<Integer> expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0));
            Collections.fill(expectedDPs.subList(start, start + readLen), 1);

            final List<VariantContext> contexts =
                    model.calculateRefConfidence(data.getRefHap(), onlyRefHaplotype, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, noCalls);
            checkReferenceModelResult(data, contexts, expectedDPs, noCalls);
        }
    }
}
/**
 * Reference-confidence calculation in the presence of variant calls.
 * Five calls are defined (SNPs at the first, last and third base of the region, a deletion
 * starting at the fifth base and an insertion at the ninth), and every selection produced by
 * Utils.makePermutations for sizes 1 through 5 is run with 0 and with 1 full-length read.
 *
 * NOTE(review): the boolean passed to makePermutations presumably disables repetition -- confirm.
 */
@Test
public void testRefConfidenceWithCalls() {
    final String refBases = "ACGTAACCGGTT";

    // throwaway fixture, used only to learn the genomic span of the reference
    final RefConfData span = new RefConfData(refBases, 0);
    final int first = span.getStart();
    final int last = span.getEnd();

    for ( int nReads = 0; nReads < 2; nReads++ ) {
        final List<VariantContext> allCalls = Arrays.asList(
                GATKVariantContextUtils.makeFromAlleles("test", "chr1", first, Arrays.asList("A", "C")),
                GATKVariantContextUtils.makeFromAlleles("test", "chr1", last, Arrays.asList("A", "C")),
                GATKVariantContextUtils.makeFromAlleles("test", "chr1", first + 2, Arrays.asList("A", "C")),
                GATKVariantContextUtils.makeFromAlleles("test", "chr1", first + 4, Arrays.asList("ACG", "A")),
                GATKVariantContextUtils.makeFromAlleles("test", "chr1", first + 8, Arrays.asList("A", "ACG")));

        for ( int size = 1; size <= allCalls.size(); size++ ) {
            for ( final List<VariantContext> calls : Utils.makePermutations(allCalls, size, false) ) {
                final RefConfData data = new RefConfData(refBases, 0);
                final List<Haplotype> onlyRefHaplotype = Arrays.asList(data.getRefHap());

                for ( int added = 0; added < nReads; added++ ) {
                    data.getActiveRegion().add(data.makeRead(0, data.getRefLength()));
                }

                final Map<String, PerReadAlleleLikelihoodMap> likelihoods =
                        HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion());
                final List<Integer> expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads);

                final List<VariantContext> contexts =
                        model.calculateRefConfidence(data.getRefHap(), onlyRefHaplotype, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls);
                checkReferenceModelResult(data, contexts, expectedDPs, calls);
            }
        }
    }
}
/**
 * Validates the output of the reference-confidence calculation for one fixture.
 *
 * Walks every position of the region's extended location and checks that:
 * <ul>
 *   <li>positions outside the (unextended) region have no emitted record;</li>
 *   <li>positions overlapped by an input call carry exactly that call;</li>
 *   <li>all other positions carry a single-base reference-model record with the symbolic
 *       non-ref allele as its only alt, no log10PError, and a genotype for the test sample
 *       having AD, DP (equal to the expected depth), GQ, PL and the indel-informative-depth
 *       attribute;</li>
 *   <li>every base of the region is covered by exactly one emitted record.</li>
 * </ul>
 *
 * @param data        fixture the contexts were computed from
 * @param contexts    records returned by the model; must not be null
 * @param expectedDPs expected depth per region position, indexed from the region start
 * @param calls       variant calls that were passed to the model
 */
private void checkReferenceModelResult(final RefConfData data, final List<VariantContext> contexts, final List<Integer> expectedDPs, final List<VariantContext> calls) {
    Assert.assertNotNull(contexts);

    final GenomeLoc loc = data.getActiveRegion().getExtendedLoc();
    final int regionStart = data.getActiveRegion().getLocation().getStart();
    final List<Boolean> seenBP = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), false));

    for ( int i = 0; i < loc.size(); i++ ) {
        final GenomeLoc curPos = parser.createGenomeLoc(loc.getContig(), loc.getStart() + i);
        final VariantContext call = model.getOverlappingVariantContext(curPos, calls);
        final VariantContext refModel = model.getOverlappingVariantContext(curPos, contexts);

        if ( ! data.getActiveRegion().getLocation().containsP(curPos) ) {
            // part of the extended interval, but not of the region itself: nothing may be emitted here
            Assert.assertNull(refModel);
            continue;
        }

        if ( call != null ) {
            Assert.assertEquals(refModel, call, "Should have found call " + call + " but found " + refModel + " instead");
        } else {
            // fail with a readable message rather than a NullPointerException on the first dereference
            Assert.assertNotNull(refModel, "No reference model record found at " + curPos);

            final int expectedDP = expectedDPs.get(curPos.getStart() - regionStart);
            Assert.assertEquals(refModel.getStart(), loc.getStart() + i);
            Assert.assertEquals(refModel.getEnd(), loc.getStart() + i);
            Assert.assertFalse(refModel.hasLog10PError());
            Assert.assertEquals(refModel.getAlternateAlleles().size(), 1);
            Assert.assertEquals(refModel.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE);
            Assert.assertTrue(refModel.hasGenotype(sample));

            final Genotype g = refModel.getGenotype(sample);
            Assert.assertTrue(g.hasAD());
            Assert.assertTrue(g.hasDP());
            Assert.assertEquals(g.getDP(), expectedDP);
            Assert.assertTrue(g.hasGQ());
            Assert.assertTrue(g.hasPL());
            Assert.assertTrue(g.hasExtendedAttribute(ReferenceConfidenceModel.INDEL_INFORMATIVE_DEPTH));
        }

        // mark the span of each record once, at the position where the record starts
        final VariantContext vc = call == null ? refModel : call;
        if ( curPos.getStart() == vc.getStart() ) {
            for ( int pos = vc.getStart(); pos <= vc.getEnd(); pos++ ) {
                final int j = pos - regionStart;
                Assert.assertFalse(seenBP.get(j), "Position " + pos + " is covered by more than one emitted record");
                seenBP.set(j, true);
            }
        }
    }

    for ( int i = 0; i < seenBP.size(); i++ ) {
        Assert.assertTrue(seenBP.get(i), "Position " + (regionStart + i) + " is not covered by any emitted record");
    }
}
}

View File

@ -49,7 +49,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
@ -231,7 +230,7 @@ public class BaseGraphUnitTest extends BaseTest {
final File tmp = File.createTempFile("tmp", "dot");
tmp.deleteOnExit();
new SeqGraph().printGraph(tmp, 10);
new DeBruijnGraph().printGraph(tmp, 10);
new TestGraph().printGraph(tmp, 10);
}
@Test
@ -248,71 +247,6 @@ public class BaseGraphUnitTest extends BaseTest {
Assert.assertEquals(actualSet, expectedSet);
}
/**
 * Exercises pruneGraph(2) on a linear chain of six vertices in two scenarios.
 *
 * Scenario 1: all edges are built with a false flag; the edges of weight 1 (v->v2) and
 * weight 2 (v5->v6) are expected to disappear together with v and v6.
 * Scenario 2: the weight-1 edge v->v2 is built with a true flag and is expected to survive;
 * v6, which has no edges at all, is expected to disappear.
 *
 * NOTE(review): the BaseEdge arguments are presumably (isReferenceEdge, multiplicity) -- confirm.
 */
@Test(enabled = true)
public void testPruneGraph() {
    final DeBruijnVertex v = new DeBruijnVertex("ATGG");
    final DeBruijnVertex v2 = new DeBruijnVertex("ATGGA");
    final DeBruijnVertex v3 = new DeBruijnVertex("ATGGT");
    final DeBruijnVertex v4 = new DeBruijnVertex("ATGGG");
    final DeBruijnVertex v5 = new DeBruijnVertex("ATGGC");
    final DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC");
    final DeBruijnVertex[] allVertices = {v, v2, v3, v4, v5, v6};

    // ---- scenario 1: no flagged edges ----
    DeBruijnGraph graph = new DeBruijnGraph();
    DeBruijnGraph expectedGraph = new DeBruijnGraph();

    for ( final DeBruijnVertex vertex : allVertices ) {
        graph.addVertex(vertex);
    }
    graph.addEdge(v, v2, new BaseEdge(false, 1));
    graph.addEdge(v2, v3, new BaseEdge(false, 3));
    graph.addEdge(v3, v4, new BaseEdge(false, 5));
    graph.addEdge(v4, v5, new BaseEdge(false, 3));
    graph.addEdge(v5, v6, new BaseEdge(false, 2));

    for ( final DeBruijnVertex vertex : new DeBruijnVertex[]{v2, v3, v4, v5} ) {
        expectedGraph.addVertex(vertex);
    }
    expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3));
    expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5));
    expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3));

    graph.pruneGraph(2);
    Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));

    // ---- scenario 2: the weak first edge is flagged, v6 is left unconnected ----
    graph = new DeBruijnGraph();
    expectedGraph = new DeBruijnGraph();

    for ( final DeBruijnVertex vertex : allVertices ) {
        graph.addVertex(vertex);
    }
    graph.addEdge(v, v2, new BaseEdge(true, 1));
    graph.addEdge(v2, v3, new BaseEdge(false, 3));
    graph.addEdge(v3, v4, new BaseEdge(false, 5));
    graph.addEdge(v4, v5, new BaseEdge(false, 3));

    for ( final DeBruijnVertex vertex : new DeBruijnVertex[]{v, v2, v3, v4, v5} ) {
        expectedGraph.addVertex(vertex);
    }
    expectedGraph.addEdge(v, v2, new BaseEdge(true, 1));
    expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3));
    expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5));
    expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3));

    graph.pruneGraph(2);
    Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
}
@Test(enabled = true)
public void testGetBases() {
@ -324,7 +258,7 @@ public class BaseGraphUnitTest extends BaseTest {
vertexes.add(new DeBruijnVertex(testString.substring(i, i + kmerSize)));
}
final String result = new String(new DeBruijnGraph().getBasesForPath(vertexes));
final String result = new String(new TestGraph().getBasesForPath(vertexes));
Assert.assertEquals(result, testString.substring(kmerSize - 1));
}
}

View File

@ -72,7 +72,7 @@ public class SeqGraphUnitTest extends BaseTest {
}
public SeqGraph calcGraph() {
final DeBruijnGraph deBruijnGraph = new DeBruijnGraph();
final TestGraph deBruijnGraph = new TestGraph();
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for (int i = 0; i < kmersInSequence - 1; i++) {
// get the kmers

View File

@ -86,7 +86,7 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest {
assembler.removePathsNotConnectedToRef = false; // needed to pass some of the tests
assembler.setRecoverDanglingTails(false); // needed to pass some of the tests
assembler.setDebugGraphTransformations(true);
final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.<Haplotype>emptyList()).get(0);
final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.<Haplotype>emptyList()).get(0).getGraph();
if ( DEBUG ) graph.printGraph(new File("test.dot"), 0);
return graph;
}

View File

@ -212,8 +212,8 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion
tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp
tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // several snps
tests.add(new Object[]{"AAAAA", "C", "1M4D1M", true, -1}); // funky SW alignment
tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", true, 1}); // very little data
tests.add(new Object[]{"AAAAA", "C", "1M4D1M", false, -1}); // funky SW alignment
tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", false, 1}); // very little data
tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch
tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex
@ -253,7 +253,13 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
Assert.assertTrue(altSink != null, "We did not find a non-reference sink");
// confirm that the SW alignment agrees with our expectations
final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink);
final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink, 0);
if ( result == null ) {
Assert.assertFalse(cigarIsGood);
return;
}
Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString());
// confirm that the goodness of the cigar agrees with our expectations

View File

@ -0,0 +1,333 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.utils.gvcf;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.*;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.broadinstitute.variant.vcf.VCFHeader;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class GVCFWriterUnitTest extends BaseTest {
/**
 * In-memory stand-in for a real VariantContextWriter: it records every variant context
 * it is given and remembers whether the header was written and whether it was closed,
 * so tests can inspect what the writer under test forwarded to it.
 */
private static class MockWriter implements VariantContextWriter {
    /** Variant contexts received through add(), in arrival order. */
    final List<VariantContext> emitted = new ArrayList<>();
    /** Set once writeHeader() has been called. */
    boolean headerWritten = false;
    /** Set once close() has been called. */
    boolean closed = false;

    @Override
    public void add(VariantContext vc) {
        this.emitted.add(vc);
    }

    @Override
    public void writeHeader(VCFHeader header) {
        this.headerWritten = true;
    }

    @Override
    public void close() {
        this.closed = true;
    }
}
// Recording sink handed to every GVCFWriter under test; replaced with a fresh instance in setUp()
private MockWriter mockWriter;
// Partition passed to every GVCFWriter built in these tests
// NOTE(review): presumably the GQ band boundaries -- confirm against GVCFWriter
private List<Integer> standardPartition = Arrays.asList(1, 10, 20);
// Reference allele ("N") shared by the synthetic records
private Allele REF = Allele.create("N", true);
// Concrete alternate allele used for the non-reference records
private Allele ALT = Allele.create("A");
// Allele list of a hom-ref site: the reference allele plus the symbolic non-ref allele
private List<Allele> ALLELES = Arrays.asList(REF, ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE);
// Name of the single sample carried by every synthetic record
private final String SAMPLE_NAME = "XXYYZZ";
/**
 * Runs before every test method and installs a fresh MockWriter, so that emitted
 * records and the header/closed flags never leak from one test into the next.
 */
@BeforeMethod
public void setUp() throws Exception {
mockWriter = new MockWriter();
}
/** Writing a header through the GVCFWriter must reach the underlying writer. */
@Test
public void testHeaderWriting() {
    final GVCFWriter gvcfWriter = new GVCFWriter(mockWriter, standardPartition);
    final VCFHeader emptyHeader = new VCFHeader();

    gvcfWriter.writeHeader(emptyHeader);

    Assert.assertTrue(mockWriter.headerWritten);
}
/** The no-argument close() must also close the underlying writer. */
@Test
public void testClose() {
    final GVCFWriter gvcfWriter = new GVCFWriter(mockWriter, standardPartition);

    gvcfWriter.close();

    final boolean underlyingClosed = mockWriter.closed;
    Assert.assertTrue(underlyingClosed);
}
/** close(false) must leave the underlying writer open. */
@Test
public void testCloseWithoutClosingUnderlyingWriter() {
    final GVCFWriter gvcfWriter = new GVCFWriter(mockWriter, standardPartition);

    gvcfWriter.close(false);

    final boolean underlyingClosed = mockWriter.closed;
    Assert.assertFalse(underlyingClosed);
}
/**
 * Builds a single-base hom-ref record (alleles: REF plus the symbolic non-ref allele)
 * whose only genotype is REF/REF for the test sample, with DP 10, AD {1, 2},
 * PL {0, 10, 100} and the requested GQ.
 *
 * @param contig contig name
 * @param start  position of the record (also its end)
 * @param GQ     genotype quality to assign
 * @return the constructed variant context
 */
private VariantContext makeHomRef(final String contig, final int start, final int GQ) {
    final VariantContextBuilder site = new VariantContextBuilder("test", contig, start, start, ALLELES);
    final GenotypeBuilder genotype = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF))
            .GQ(GQ)
            .DP(10)
            .AD(new int[]{1, 2})
            .PL(new int[]{0, 10, 100});
    return site.genotypes(genotype.make()).make();
}
/**
 * Builds a single-base variant record (alleles: REF and ALT) whose only genotype is
 * REF/ALT for the test sample, with DP 10, AD {1, 2}, PL {0, 10, 100} and the requested GQ.
 *
 * @param contig contig name
 * @param start  position of the record (also its end)
 * @param GQ     genotype quality to assign
 * @return the constructed variant context
 */
private VariantContext makeNonRef(final String contig, final int start, final int GQ) {
    final VariantContextBuilder site = new VariantContextBuilder("test", contig, start, start, Arrays.asList(REF, ALT));
    final GenotypeBuilder genotype = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, ALT))
            .GQ(GQ)
            .DP(10)
            .AD(new int[]{1, 2})
            .PL(new int[]{0, 10, 100});
    return site.genotypes(genotype.make()).make();
}
/**
 * Builds a deletion record whose reference allele is {@code size} copies of "A" and whose
 * alternate allele is the first base alone; the single genotype is ref/alt for the test
 * sample, with GQ 50, DP 10, AD {1, 2} and PL {0, 10, 100}.
 *
 * @param contig contig name
 * @param start  start position of the record
 * @param size   length of the reference allele, in bases (must be at least 1)
 * @return the constructed variant context
 */
private VariantContext makeDeletion(final String contig, final int start, final int size) {
    final String refBases = Utils.dupString("A", size);
    final String altBases = refBases.substring(0, 1);
    final VariantContext bare = GATKVariantContextUtils.makeFromAlleles("test", contig, start, Arrays.asList(refBases, altBases));

    final VariantContextBuilder site = new VariantContextBuilder(bare);
    final GenotypeBuilder genotype = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(bare.getReference(), bare.getAlternateAllele(0)))
            .GQ(50)
            .DP(10)
            .AD(new int[]{1, 2})
            .PL(new int[]{0, 10, 100});
    return site.genotypes(genotype.make()).make();
}
/**
 * A hom-ref record is held back when added and only reaches the underlying writer
 * once the GVCFWriter is closed.
 */
@Test
public void testCloseEmitsLastVariant() {
    final GVCFWriter gvcfWriter = new GVCFWriter(mockWriter, standardPartition);

    gvcfWriter.add(makeHomRef("20", 1, 30));
    final int emittedBeforeClose = mockWriter.emitted.size();
    Assert.assertEquals(emittedBeforeClose, 0);

    gvcfWriter.close();
    Assert.assertTrue(mockWriter.closed);
    final int emittedAfterClose = mockWriter.emitted.size();
    Assert.assertEquals(emittedAfterClose, 1);
}
/**
 * A non-ref record is forwarded as soon as it is added, so closing the writer
 * afterwards must not emit anything further.
 */
@Test
public void testCloseDoesntEmitsLastVariantWhenNonRef() {
    final GVCFWriter gvcfWriter = new GVCFWriter(mockWriter, standardPartition);

    gvcfWriter.add(makeNonRef("20", 1, 30));
    final int emittedBeforeClose = mockWriter.emitted.size();
    Assert.assertEquals(emittedBeforeClose, 1);

    gvcfWriter.close();
    Assert.assertTrue(mockWriter.closed);
    final int emittedAfterClose = mockWriter.emitted.size();
    Assert.assertEquals(emittedAfterClose, 1);
}
/**
 * Two pending hom-ref records on contig 20 are emitted as one block (1-2) as soon as a
 * hom-ref record on contig 21 arrives; that record is itself emitted on close as block 3-3.
 */
@Test
public void testCrossingContigBoundaryRef() {
    final GVCFWriter gvcfWriter = new GVCFWriter(mockWriter, standardPartition);

    for ( int pos = 1; pos <= 2; pos++ ) {
        gvcfWriter.add(makeHomRef("20", pos, 30));
    }
    Assert.assertEquals(mockWriter.emitted.size(), 0);

    // switching contig flushes the block accumulated on contig 20
    gvcfWriter.add(makeHomRef("21", 3, 30));
    Assert.assertEquals(mockWriter.emitted.size(), 1);
    assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false);

    gvcfWriter.close();
    Assert.assertEquals(mockWriter.emitted.size(), 2);
    assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, false);
}
/**
 * Two pending hom-ref records on contig 20 followed by a non-ref record on contig 21:
 * the hom-ref block (1-2) and the non-ref record must both be emitted immediately, in that order.
 */
@Test
public void testCrossingContigBoundaryNonRef() {
    final GVCFWriter gvcfWriter = new GVCFWriter(mockWriter, standardPartition);

    for ( int pos = 1; pos <= 2; pos++ ) {
        gvcfWriter.add(makeHomRef("20", pos, 30));
    }
    Assert.assertEquals(mockWriter.emitted.size(), 0);

    gvcfWriter.add(makeNonRef("21", 3, 30));
    Assert.assertEquals(mockWriter.emitted.size(), 2);

    assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false);
    assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, true);
}
/**
 * Two non-ref records on different contigs: each one must be emitted at the moment
 * it is added, and both must come out unchanged in position.
 */
@Test
public void testCrossingContigBoundaryNonRefThenNonRef() {
    final GVCFWriter gvcfWriter = new GVCFWriter(mockWriter, standardPartition);

    final String[] contigs = {"20", "21"};
    for ( int k = 0; k < contigs.length; k++ ) {
        gvcfWriter.add(makeNonRef(contigs[k], 1, 30));
        Assert.assertEquals(mockWriter.emitted.size(), k + 1);
    }

    assertGoodVC(mockWriter.emitted.get(0), "20", 1, 1, true);
    assertGoodVC(mockWriter.emitted.get(1), "21", 1, 1, true);
}
/**
 * Checks one record emitted by the GVCFWriter.
 *
 * Contig, start and end are always verified. For a non-ref record the only further
 * requirement is that its first alternate allele is not the symbolic non-ref allele.
 * For a reference block the record must have exactly two alleles with the symbolic
 * non-ref allele as the alternate, a block-size attribute equal to the span, an END
 * attribute equal to {@code stop}, and a single genotype for the test sample that
 * carries DP and GQ but no AD, likelihoods or PL.
 *
 * @param vc     the emitted record
 * @param contig expected contig
 * @param start  expected start position
 * @param stop   expected end position
 * @param nonRef true if the record is expected to be a variant rather than a reference block
 */
private void assertGoodVC(final VariantContext vc, final String contig, final int start, final int stop, final boolean nonRef) {
    Assert.assertEquals(vc.getChr(), contig);
    Assert.assertEquals(vc.getStart(), start);
    Assert.assertEquals(vc.getEnd(), stop);

    if ( nonRef ) {
        Assert.assertNotEquals(vc.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE);
    } else {
        Assert.assertEquals(vc.getNAlleles(), 2);
        Assert.assertEquals(vc.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE);
        Assert.assertEquals(vc.getAttributeAsInt(GVCFWriter.BLOCK_SIZE_INFO_FIELD, -1), stop - start + 1);
        Assert.assertEquals(vc.getAttributeAsInt(VCFConstants.END_KEY, -1), stop);
        Assert.assertTrue(vc.hasGenotypes());
        Assert.assertTrue(vc.hasGenotype(SAMPLE_NAME));
        Assert.assertEquals(vc.getGenotypes().size(), 1);

        final Genotype g = vc.getGenotype(SAMPLE_NAME);
        // boolean assertions with messages, so a failure names the offending field
        Assert.assertFalse(g.hasAD(), "reference block genotype should not carry AD");
        Assert.assertFalse(g.hasLikelihoods(), "reference block genotype should not carry likelihoods");
        Assert.assertFalse(g.hasPL(), "reference block genotype should not carry PL");
        Assert.assertTrue(g.hasDP(), "reference block genotype should carry DP");
        Assert.assertTrue(g.hasGQ(), "reference block genotype should carry GQ");
    }
}
/**
 * A non-ref record arriving in the middle of a hom-ref run must split the run:
 * block 1-2 and the variant at 3 are emitted immediately, and the trailing block 4-5
 * is emitted when the writer is closed.
 */
@Test
public void testVariantForcesNonRef() {
    final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
    writer.add(makeHomRef("20", 1, 30));
    writer.add(makeHomRef("20", 2, 30));
    Assert.assertEquals(mockWriter.emitted.size(), 0);

    writer.add(makeNonRef("20", 3, 30));
    writer.add(makeHomRef("20", 4, 30));
    writer.add(makeHomRef("20", 5, 30));
    Assert.assertEquals(mockWriter.emitted.size(), 2);
    assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false);
    assertGoodVC(mockWriter.emitted.get(1), "20", 3, 3, true);

    writer.close();
    // check the count before indexing, as the sibling tests do, so a wrong number of
    // emitted records fails as an assertion rather than an IndexOutOfBoundsException
    Assert.assertEquals(mockWriter.emitted.size(), 3);
    assertGoodVC(mockWriter.emitted.get(2), "20", 4, 5, false);
}
/**
 * Four contiguous hom-ref records, the first two with GQ 0 and the last two with GQ 50,
 * must come out as two separate blocks: 1-2 and 3-4.
 */
@Test
public void testEmittingTwoBands() {
    final GVCFWriter gvcfWriter = new GVCFWriter(mockWriter, standardPartition);

    for ( int pos = 1; pos <= 2; pos++ ) {
        gvcfWriter.add(makeHomRef("20", pos, 0));
    }
    Assert.assertEquals(mockWriter.emitted.size(), 0);

    for ( int pos = 3; pos <= 4; pos++ ) {
        gvcfWriter.add(makeHomRef("20", pos, 50));
    }
    gvcfWriter.close();

    Assert.assertEquals(mockWriter.emitted.size(), 2);
    assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false);
    assertGoodVC(mockWriter.emitted.get(1), "20", 3, 4, false);
}
/**
 * Hom-ref sites separated by a gap in position (2 -> 10) are expected to be written as two blocks
 * even though every site is created with the same makeHomRef arguments otherwise.
 */
@Test
public void testNonContiguousBlocks() {
    final GVCFWriter gvcf = new GVCFWriter(mockWriter, standardPartition);

    // positions 1-2, then a jump to 10-11
    for ( final int pos : new int[]{1, 2, 10, 11} )
        gvcf.add(makeHomRef("20", pos, 0));
    gvcf.close();

    Assert.assertEquals(mockWriter.emitted.size(), 2);
    assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false);
    assertGoodVC(mockWriter.emitted.get(1), "20", 10, 11, false);
}
/**
 * A deletion record in the middle of a hom-ref run: the expected output is the block before it (1-2),
 * the deletion itself spanning 3-5, and a block that only resumes at 6, i.e. the hom-ref sites added
 * at positions 4 and 5 do not show up in any block.
 */
@Test
public void testDeletion() {
    final GVCFWriter gvcf = new GVCFWriter(mockWriter, standardPartition);

    for ( int pos = 1; pos <= 2; pos++ )
        gvcf.add(makeHomRef("20", pos, 0));

    gvcf.add(makeDeletion("20", 3, 3));

    for ( int pos = 4; pos <= 7; pos++ )
        gvcf.add(makeHomRef("20", pos, 0));
    gvcf.close();

    Assert.assertEquals(mockWriter.emitted.size(), 3);
    assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false);
    assertGoodVC(mockWriter.emitted.get(1), "20", 3, 5, true);
    assertGoodVC(mockWriter.emitted.get(2), "20", 6, 7, false);
}
/**
 * Test cases for band partition parsing
 *
 * @return rows of { partition list, whether parsing that list is expected to succeed }
 */
@DataProvider(name = "BandPartitionData")
public Object[][] makeBandPartitionData() {
    return new Object[][]{
            // rejected: no list, or an empty one
            {null, false},
            {Collections.emptyList(), false},
            // accepted: increasing positive values
            {Arrays.asList(1), true},
            {Arrays.asList(1, 10), true},
            {Arrays.asList(1, 10, 30), true},
            // rejected: out of order, negative value, null element
            {Arrays.asList(10, 1, 30), false},
            {Arrays.asList(-1, 1), false},
            {Arrays.asList(1, null, 10), false},
    };
}
/**
 * Parsing a partition list must succeed or throw according to the expectation in the data row
 *
 * @param partitions the partition list handed to GVCFWriter.parsePartitions
 * @param expectedGood true if parsing is expected to complete without an exception
 */
@Test(dataProvider = "BandPartitionData")
public void testMyData(final List<Integer> partitions, final boolean expectedGood) {
    // record the outcome first, judge it afterwards
    Exception failure = null;
    try {
        GVCFWriter.parsePartitions(partitions);
    } catch ( Exception e ) {
        failure = e;
    }

    if ( failure == null )
        Assert.assertTrue(expectedGood, "Expected to fail but didn't");
    else
        Assert.assertTrue(! expectedGood, "Expected to succeed but failed with message " + failure.getMessage());
}
}

View File

@ -44,81 +44,118 @@
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
package org.broadinstitute.sting.utils.gvcf;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
* Date: 2/8/13
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class DeBruijnAssemblyGraphUnitTest {
private class GetReferenceBytesTestProvider extends BaseTest.TestDataProvider {
public byte[] refSequence;
public byte[] altSequence;
public int KMER_LENGTH;
public class HomRefBlockUnitTest extends BaseTest {
VariantContext vc;
public GetReferenceBytesTestProvider(String ref, String alt, int kmer) {
super(GetReferenceBytesTestProvider.class, String.format("Testing reference bytes. kmer = %d, ref = %s, alt = %s", kmer, ref, alt));
refSequence = ref.getBytes();
altSequence = alt.getBytes();
KMER_LENGTH = kmer;
}
/**
 * Rebuild the shared starting record before each test: a single-base A/C site at 20:1 with source "foo"
 */
@BeforeMethod
public void setUp() throws Exception {
    final Allele refA = Allele.create("A", true);
    final Allele altC = Allele.create("C");
    vc = new VariantContextBuilder("foo", "20", 1, 1, Arrays.asList(refA, altC)).make();
}
public byte[] expectedReferenceBytes() {
return refSequence;
}
/**
 * A freshly built block must expose exactly what it was constructed with, and its bounds check must
 * accept the lower bound but reject the upper bound.
 */
@Test
public void testBasicConstruction() {
    final HomRefBlock block = new HomRefBlock(vc, 10, 20);

    // constructor arguments are reported back unchanged
    Assert.assertSame(block.getStartingVC(), vc);
    Assert.assertEquals(block.getRef(), vc.getReference());
    Assert.assertEquals(block.getGQLowerBound(), 10);
    Assert.assertEquals(block.getGQUpperBound(), 20);

    // below the range
    Assert.assertFalse(block.withinBounds(1));
    // the lower bound itself and a value inside the range
    Assert.assertTrue(block.withinBounds(10));
    Assert.assertTrue(block.withinBounds(11));
    // the upper bound itself and a value above it
    Assert.assertFalse(block.withinBounds(20));
    Assert.assertFalse(block.withinBounds(21));
}
public byte[] calculatedReferenceBytes() {
DeBruijnGraph graph = new DeBruijnGraph();
graph.addSequenceToGraph(refSequence, KMER_LENGTH, true);
if( altSequence.length > 0 ) {
graph.addSequenceToGraph(altSequence, KMER_LENGTH, false);
/**
 * Add genotypes one position at a time and check that the block's stop follows the last added
 * position and that the min / median DP and GQ track the values added so far.
 *
 * The arguments to assertValues are (band, minDP, medianDP, minGQ, medianGQ).
 */
@Test
public void testMinMedian() {
final HomRefBlock band = new HomRefBlock(vc, 10, 20);
final GenotypeBuilder gb = new GenotypeBuilder("NA12878");
// pos is post-incremented by every add, so after each add the last added position is pos - 1
int pos = vc.getStart();
// one value: min and median are that value
band.add(pos++, gb.DP(10).GQ(11).make());
Assert.assertEquals(band.getStop(), pos - 1);
assertValues(band, 10, 10, 11, 11);
// two values (DP 10,11 / GQ 11,10): the expected median is the larger of the two
band.add(pos++, gb.DP(11).GQ(10).make());
Assert.assertEquals(band.getStop(), pos - 1);
assertValues(band, 10, 11, 10, 11);
// three values: the expected median is the middle one
band.add(pos++, gb.DP(12).GQ(12).make());
Assert.assertEquals(band.getStop(), pos - 1);
assertValues(band, 10, 11, 10, 11);
// four more values, only the stop is checked after each
band.add(pos++, gb.DP(13).GQ(15).make());
Assert.assertEquals(band.getStop(), pos - 1);
band.add(pos++, gb.DP(14).GQ(16).make());
Assert.assertEquals(band.getStop(), pos - 1);
band.add(pos++, gb.DP(15).GQ(17).make());
Assert.assertEquals(band.getStop(), pos - 1);
band.add(pos++, gb.DP(16).GQ(18).make());
Assert.assertEquals(band.getStop(), pos - 1);
// seven values: DP 10..16 -> median 13; GQ {10,11,12,15,16,17,18} -> median 15
assertValues(band, 10, 13, 10, 15);
// one position was added per call, so the size is the number of adds
Assert.assertEquals(band.getSize(), pos - vc.getStart());
}
/**
 * A genotype added with GQ 1000 must be reported with min and median GQ of 99, while its DP of 1000
 * is reported unchanged.
 */
@Test
public void testBigGQIsCapped() {
    final HomRefBlock block = new HomRefBlock(vc, 10, 20);
    final GenotypeBuilder oversized = new GenotypeBuilder("NA12878").DP(1000).GQ(1000);

    block.add(vc.getStart(), oversized.make());

    assertValues(block, 1000, 1000, 99, 99);
}
/**
 * Adding a genotype 10 positions past the block's starting position, with nothing added in between,
 * must be rejected with an IllegalArgumentException.
 */
@Test(expectedExceptions = IllegalArgumentException.class)
public void testBadAdd() {
    final HomRefBlock block = new HomRefBlock(vc, 10, 20);
    final int farAwayPosition = vc.getStart() + 10;
    final GenotypeBuilder builder = new GenotypeBuilder("NA12878").DP(10).GQ(11);

    block.add(farAwayPosition, builder.make());
}
/**
 * Assert the four summary statistics reported by a block
 *
 * @param band the block to inspect
 * @param minDP the expected value of band.getMinDP()
 * @param medianDP the expected value of band.getMedianDP()
 * @param minGQ the expected value of band.getMinGQ()
 * @param medianGQ the expected value of band.getMedianGQ()
 */
private void assertValues(final HomRefBlock band, final int minDP, final int medianDP, final int minGQ, final int medianGQ) {
Assert.assertEquals(band.getMinDP(), minDP);
Assert.assertEquals(band.getMedianDP(), medianDP);
Assert.assertEquals(band.getMinGQ(), minGQ);
Assert.assertEquals(band.getMedianGQ(), medianGQ);
}
@DataProvider(name = "ContiguousData")
public Object[][] makeContiguousData() {
List<Object[]> tests = new ArrayList<Object[]>();
for ( final String chrMod : Arrays.asList("", ".mismatch") ) {
for ( final int offset : Arrays.asList(-10, -1, 0, 1, 10) ) {
final boolean equals = chrMod.equals("") && offset == 0;
tests.add(new Object[]{vc.getChr() + chrMod, vc.getStart() + offset, equals});
}
return graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true);
}
return tests.toArray(new Object[][]{});
}
@DataProvider(name = "GetReferenceBytesTestProvider")
public Object[][] GetReferenceBytesTests() {
new GetReferenceBytesTestProvider("GGTTAACC", "", 3);
new GetReferenceBytesTestProvider("GGTTAACC", "", 4);
new GetReferenceBytesTestProvider("GGTTAACC", "", 5);
new GetReferenceBytesTestProvider("GGTTAACC", "", 6);
new GetReferenceBytesTestProvider("GGTTAACC", "", 7);
new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "", 6);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "", 66);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "", 76);
new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 3);
new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 4);
new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 5);
new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 6);
new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 7);
new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76);
new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 3);
new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 4);
new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 5);
new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 6);
new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 7);
new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "AAAAAAAAAAAAA", 6);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 66);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 76);
return GetReferenceBytesTestProvider.getTests(GetReferenceBytesTestProvider.class);
}
@Test(dataProvider = "GetReferenceBytesTestProvider", enabled = true)
public void testGetReferenceBytes(GetReferenceBytesTestProvider cfg) {
Assert.assertEquals(cfg.calculatedReferenceBytes(), cfg.expectedReferenceBytes(), "Reference sequences do not match");
/**
 * Check the contiguity decision of a block built on the shared starting record
 *
 * @param contig the contig of the record to test
 * @param pos the start (and stop) position of the record to test
 * @param expected the expected result of isContiguous for that record
 */
@Test(dataProvider = "ContiguousData")
public void testIsContiguous(final String contig, final int pos, final boolean expected) {
    // a copy of the starting record moved to the requested single-base location
    final VariantContextBuilder moved = new VariantContextBuilder(vc).chr(contig).start(pos).stop(pos);
    final HomRefBlock block = new HomRefBlock(vc, 10, 20);

    Assert.assertEquals(block.isContiguous(moved.make()), expected);
}
}

View File

@ -81,26 +81,22 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest {
return hap;
}
private static class MockBAMWriter implements SAMFileWriter {
@Override
public void addAlignment(SAMRecord alignment) {
//To change body of implemented methods use File | Settings | File Templates.
private static class MockDestination extends ReadDestination {
private final static SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader();
private MockDestination() {
super(header, "foo");
}
@Override
public SAMFileHeader getFileHeader() {
return null; //To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void close() {
public void add(GATKSAMRecord read) {
//To change body of implemented methods use File | Settings | File Templates.
}
}
@Test
public void testCreate() throws Exception {
final SAMFileWriter writer = new MockBAMWriter();
final MockDestination writer = new MockDestination();
Assert.assertTrue(HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, writer) instanceof CalledHaplotypeBAMWriter);
Assert.assertTrue(HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.ALL_POSSIBLE_HAPLOTYPES, writer) instanceof AllHaplotypeBAMWriter);
}
@ -173,7 +169,7 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest {
@Test(dataProvider = "ReadAlignedToRefData", enabled = true)
public void testReadAlignedToRef(final GATKSAMRecord read, final Haplotype haplotype, final int refStart, final int expectedReadStart, final String expectedReadCigar) throws Exception {
final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockBAMWriter());
final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockDestination());
final GATKSAMRecord originalReadCopy = (GATKSAMRecord)read.clone();
if ( expectedReadCigar == null ) {
@ -289,7 +285,7 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest {
@Test(dataProvider = "ComplexReadAlignedToRef", enabled = !DEBUG)
public void testReadAlignedToRefComplexAlignment(final int testIndex, final GATKSAMRecord read, final String reference, final Haplotype haplotype, final int expectedMaxMismatches) throws Exception {
final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockBAMWriter());
final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockDestination());
final GATKSAMRecord alignedRead = writer.createReadAlignedToRef(read, haplotype, 1, true);
if ( alignedRead != null ) {
final int mismatches = AlignmentUtils.getMismatchCount(alignedRead, reference.getBytes(), alignedRead.getAlignmentStart() - 1).numMismatches;

View File

@ -67,7 +67,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
for ( final int nct : Arrays.asList(1, 2) ) {
// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
tests.add(new Object[]{ "BOTH", "aad3a398273ec795e363268997247bd8", nt, nct });
tests.add(new Object[]{ "BOTH", "a80925b58735828158491f77ae64998b", nt, nct });
}
return tests.toArray(new Object[][]{});

View File

@ -56,6 +56,7 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@ -68,11 +69,18 @@ public class PairHMMUnitTest extends BaseTest {
private final static boolean ALLOW_READS_LONGER_THAN_HAPLOTYPE = true;
private final static boolean DEBUG = false;
final static boolean EXTENSIVE_TESTING = true;
final PairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation
final PairHMM originalHMM = new Log10PairHMM(false); // the reference implementation
final PairHMM loglessHMM = new LoglessPairHMM();
final N2MemoryPairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation
final N2MemoryPairHMM originalHMM = new Log10PairHMM(false); // the reference implementation
final N2MemoryPairHMM loglessHMM = new LoglessPairHMM();
private List<PairHMM> getHMMs() {
/**
 * Switch off the tristate correction on each of the three shared HMM instances before any test runs
 */
@BeforeClass
public void initialize() {
    for ( final N2MemoryPairHMM hmm : Arrays.asList(exactHMM, originalHMM, loglessHMM) )
        hmm.doNotUseTristateCorrection();
}
private List<N2MemoryPairHMM> getHMMs() {
return Arrays.asList(exactHMM, originalHMM, loglessHMM);
}
@ -592,8 +600,13 @@ public class PairHMMUnitTest extends BaseTest {
public Object[][] makeUninitializedHMMs() {
List<Object[]> tests = new ArrayList<Object[]>();
tests.add(new Object[]{new LoglessPairHMM()});
tests.add(new Object[]{new Log10PairHMM(true)});
final LoglessPairHMM myLoglessPairHMM = new LoglessPairHMM();
myLoglessPairHMM.doNotUseTristateCorrection();
tests.add(new Object[]{myLoglessPairHMM});
final Log10PairHMM myLog10PairHMM = new Log10PairHMM(true);
myLog10PairHMM.doNotUseTristateCorrection();
tests.add(new Object[]{myLog10PairHMM});
return tests.toArray(new Object[][]{});
}

View File

@ -128,10 +128,12 @@ public class GATKArgumentCollection {
@Argument(fullName = "downsample_to_coverage", shortName = "dcov",
doc = "Coverage [integer] to downsample to. For locus-based traversals (eg., LocusWalkers and ActiveRegionWalkers)," +
"this controls the maximum depth of coverage at each locus. For non-locus-based traversals (eg., ReadWalkers), " +
"this controls the maximum number of reads sharing the same alignment start position. Note that the " +
"coverage target is an approximate goal that is not guaranteed to be met exactly: the GATK's approach " +
"to downsampling is based on even representation of reads from all alignment start positions, and the " +
"downsampling algorithm will under some circumstances retain slightly more coverage than requested.",
"this controls the maximum number of reads sharing the same alignment start position. Note that this downsampling " +
"option does NOT produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of " +
"the to-coverage downsampler is to maintain an even representation of reads from all alignment start positions " +
"when removing excess coverage. For a true across-the-board unbiased random sampling of reads, use -dfrac instead. " +
"Also note that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling " +
"algorithm will under some circumstances retain slightly more coverage than requested.",
required = false)
public Integer downsampleCoverage = null;

View File

@ -115,10 +115,10 @@ public class CountingFilteringIterator implements CloseableIterator<SAMRecord> {
public void close() {
CloserUtil.close(iterator);
for ( final CountingReadFilter filter : filters )
privateRuntimeMetrics.setFilterCount(filter.readFilter.getClass().getSimpleName(), filter.counter);
// update the global metrics with all the data we collected here
globalRuntimeMetrics.incrementMetrics(privateRuntimeMetrics);
for ( final CountingReadFilter filter : filters )
globalRuntimeMetrics.setFilterCount(filter.readFilter.getClass().getSimpleName(), filter.counter);
}
/**

View File

@ -136,7 +136,7 @@ public final class VariantOverlapAnnotator {
VariantContext annotated = vcToAnnotate;
final GenomeLoc loc = getLoc(vcToAnnotate);
for ( final Map.Entry<RodBinding<VariantContext>, String> overlapBinding : overlapBindings.entrySet() ) {
annotated = annotateOverlap(tracker.getValues(overlapBinding.getKey(), loc), overlapBinding.getValue(), vcToAnnotate);
annotated = annotateOverlap(tracker.getValues(overlapBinding.getKey(), loc), overlapBinding.getValue(), annotated);
}
return annotated;

View File

@ -43,7 +43,6 @@ public class DeprecatedToolChecks {
static {
// Indicate recommended replacement in parentheses if applicable
deprecatedGATKWalkers.put("CountCovariates", "2.0 (use BaseRecalibrator instead; see documentation for usage)");
deprecatedGATKWalkers.put("AnalyzeCovariates", "2.0 (use BaseRecalibrator instead; see documentation for usage)");
deprecatedGATKWalkers.put("TableRecalibration", "2.0 (use PrintReads with -BQSR instead; see documentation for usage)");
deprecatedGATKWalkers.put("AlignmentWalker", "2.2 (no replacement)");
deprecatedGATKWalkers.put("CountBestAlignments", "2.2 (no replacement)");

View File

@ -560,24 +560,21 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
/**
* return a new genome loc, with an incremented position
*
* @param loc the old location
*
* @return a newly allocated GenomeLoc like this one but with start and stop each incremented by 1
*/
public GenomeLoc incPos(GenomeLoc loc) {
return incPos(loc, 1);
public GenomeLoc incPos() {
return incPos(1);
}
/**
* return a new genome loc, with an incremented position
*
* @param loc the old location
* @param by how much to move the start and stop by
*
* @return a newly allocated GenomeLoc like this one but with start and stop each shifted by {@code by}
*/
public GenomeLoc incPos(GenomeLoc loc, int by) {
return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by);
public GenomeLoc incPos(int by) {
return new GenomeLoc(getContig(), getContigIndex(), start + by, stop + by);
}
/**

View File

@ -831,6 +831,36 @@ public class MathUtils {
return array[minElementIndex(array)];
}
/**
 * Find the smallest value in a list of integers
 *
 * @param array a non-null, non-empty list of integers
 * @return the smallest element of array
 * @throws IllegalArgumentException if array is null or empty
 */
public static int arrayMin(final List<Integer> array) {
    if ( array == null || array.isEmpty() )
        throw new IllegalArgumentException("Array must be non-null and non-empty");
    return Collections.min(array);
}
/**
 * Compute the median element of a list of integers
 *
 * The input list is not modified: a sorted copy is made and the element at index size / 2 of that
 * copy is returned. For a list with an even number of elements this is the upper of the two
 * middle values.
 *
 * @param array a non-null, non-empty list of integers
 * @return the median element
 * @throws IllegalArgumentException if array is null or has no elements
 */
public static int median(final List<Integer> array) {
    if ( array == null )
        throw new IllegalArgumentException("Array must be non-null");
    if ( array.isEmpty() )
        throw new IllegalArgumentException("Array cannot have size 0");

    final List<Integer> ordered = new ArrayList<Integer>(array);
    Collections.sort(ordered);
    return ordered.get(ordered.size() / 2);
}
public static int minElementIndex(final double[] array) {
if (array == null || array.length == 0)
throw new IllegalArgumentException("Array cannot be null!");

View File

@ -147,6 +147,13 @@ public class ActiveRegion implements HasGenomeLocation {
}
}
/**
 * Simple interface to create an active region that isActive without any profile state
 *
 * Delegates to the full constructor with an empty list of ActivityProfileState and a literal
 * {@code true} as the third argument (presumably the isActive flag -- confirm against that constructor).
 *
 * @param activeRegionLoc passed through unchanged as the location of the region
 * @param genomeLocParser passed through unchanged to the full constructor
 * @param extension passed through unchanged to the full constructor
 */
public ActiveRegion( final GenomeLoc activeRegionLoc, final GenomeLocParser genomeLocParser, final int extension ) {
this(activeRegionLoc, Collections.<ActivityProfileState>emptyList(), true, genomeLocParser, extension);
}
@Override
public String toString() {
return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size();

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.fragments;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.picard.util.QualityUtil;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
@ -60,6 +61,11 @@ import java.util.*;
* Time: 10:09 PM
*/
public final class FragmentUtils {
public final static double DEFAULT_PCR_ERROR_RATE = 1e-4;
public final static int DEFAULT_PCR_ERROR_QUAL = QualityUtil.getPhredScoreFromErrorProbability(DEFAULT_PCR_ERROR_RATE);
public final static int HALF_OF_DEFAULT_PCR_ERROR_QUAL = DEFAULT_PCR_ERROR_QUAL / 2;
protected final static byte MIN_QUAL_BAD_OVERLAP = 16;
private FragmentUtils() {} // private constructor
@ -189,6 +195,70 @@ public final class FragmentUtils {
return create(reads, reads.size(), SamRecordGetter);
}
/**
 * Adjust the qualities of a pair of reads, given as a two-element list in either order
 *
 * The read with the smaller soft start is handed to the two-read overload as the first read; when
 * both soft starts are equal the list order is kept.
 *
 * @param overlappingPair a list holding exactly two reads
 * @throws ReviewedStingException if the list does not hold exactly two reads
 */
public static void adjustQualsOfOverlappingPairedFragments( final List<GATKSAMRecord> overlappingPair ) {
    final int nReads = overlappingPair.size();
    if ( nReads != 2 )
        throw new ReviewedStingException("Found overlapping pair with " + nReads + " reads, but expecting exactly 2.");

    final GATKSAMRecord readA = overlappingPair.get(0);
    final GATKSAMRecord readB = overlappingPair.get(1);

    // the left most read (by soft start) goes first
    final boolean swap = readB.getSoftStart() < readA.getSoftStart();
    adjustQualsOfOverlappingPairedFragments(swap ? readB : readA, swap ? readA : readB);
}
/**
 * Adjust, in place, the base qualities of the bases where two reads from the same fragment overlap
 *
 * Both reads must have the same read name. Nothing is changed when the two reads are on different
 * contigs, or when the first read's alignment ends before the second read's alignment starts.
 *
 * Otherwise base i of the second read is paired with base (firstReadStop + i) of the first read, where
 * firstReadStop is the offset in the first read that corresponds to the alignment start of the second
 * read. For every such pair of bases:
 *
 *   - if the two bases agree, both qualities are capped at HALF_OF_DEFAULT_PCR_ERROR_QUAL
 *   - if the two bases disagree, both qualities are set to 0
 *
 * The updated quality arrays are then written back to both reads.
 *
 * Assumes that firstRead starts before secondRead (according to their soft clipped starts)
 *
 * @param clippedFirstRead the left most read, cannot be null
 * @param clippedSecondRead the right most read, cannot be null
 * @throws IllegalArgumentException if either read is null, or if the two reads have different names
 */
public static void adjustQualsOfOverlappingPairedFragments(final GATKSAMRecord clippedFirstRead, final GATKSAMRecord clippedSecondRead) {
    if ( clippedFirstRead == null ) throw new IllegalArgumentException("clippedFirstRead cannot be null");
    if ( clippedSecondRead == null ) throw new IllegalArgumentException("clippedSecondRead cannot be null");
    if ( ! clippedFirstRead.getReadName().equals(clippedSecondRead.getReadName()) ) throw new IllegalArgumentException("attempting to merge two reads with different names " + clippedFirstRead + " and " + clippedSecondRead);

    // Compare the contig indices as primitives: applying != directly to boxed Integer values compares
    // object identity rather than value, which is only reliable for small cached values.
    // NOTE(review): assumes getReferenceIndex() never returns null -- confirm against the SAM record API
    final int firstContigIndex = clippedFirstRead.getReferenceIndex();
    final int secondContigIndex = clippedSecondRead.getReferenceIndex();

    // don't adjust fragments that do not overlap
    if ( firstContigIndex != secondContigIndex || clippedFirstRead.getAlignmentEnd() < clippedSecondRead.getAlignmentStart() )
        return;

    // offset in the first read where the second read begins
    // NOTE(review): the boolean presumably flags a coordinate that falls in a deletion, in which case
    // we start from the next read base -- confirm against ReadUtils
    final Pair<Integer, Boolean> pair = ReadUtils.getReadCoordinateForReferenceCoordinate(clippedFirstRead, clippedSecondRead.getAlignmentStart());
    final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() );

    // the overlap cannot run past the end of either read
    final int numOverlappingBases = Math.min(clippedFirstRead.getReadLength() - firstReadStop, clippedSecondRead.getReadLength());

    final byte[] firstReadBases = clippedFirstRead.getReadBases();
    final byte[] firstReadQuals = clippedFirstRead.getBaseQualities();
    final byte[] secondReadBases = clippedSecondRead.getReadBases();
    final byte[] secondReadQuals = clippedSecondRead.getBaseQualities();

    for ( int i = 0; i < numOverlappingBases; i++ ) {
        final int firstReadIndex = firstReadStop + i;
        final byte firstReadBase = firstReadBases[firstReadIndex];
        final byte secondReadBase = secondReadBases[i];

        if ( firstReadBase == secondReadBase ) {
            // agreeing bases: neither observation may be more confident than the cap
            firstReadQuals[firstReadIndex] = (byte) Math.min(firstReadQuals[firstReadIndex], HALF_OF_DEFAULT_PCR_ERROR_QUAL);
            secondReadQuals[i] = (byte) Math.min(secondReadQuals[i], HALF_OF_DEFAULT_PCR_ERROR_QUAL);
        } else {
            // disagreeing bases: drop the quality of both observations to zero
            // TODO -- use the proper statistical treatment of the quals from DiploidSNPGenotypeLikelihoods.java
            firstReadQuals[firstReadIndex] = 0;
            secondReadQuals[i] = 0;
        }
    }

    clippedFirstRead.setBaseQualities(firstReadQuals);
    clippedSecondRead.setBaseQualities(secondReadQuals);
}
public static List<GATKSAMRecord> mergeOverlappingPairedFragments( final List<GATKSAMRecord> overlappingPair ) {
if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); }

View File

@ -33,8 +33,6 @@ package org.broadinstitute.sting.utils.locusiterator;
* Time: 1:26 PM
*/
class LIBSDownsamplingInfo {
public final static LIBSDownsamplingInfo NO_DOWNSAMPLING = new LIBSDownsamplingInfo(false, -1);
final private boolean performDownsampling;
final private int toCoverage;

View File

@ -71,6 +71,9 @@ import java.util.*;
* a stream of unique, sorted reads
*/
public final class LocusIteratorByState extends LocusIterator {
/** Indicates that we shouldn't do any downsampling */
public final static LIBSDownsamplingInfo NO_DOWNSAMPLING = new LIBSDownsamplingInfo(false, -1);
/**
* our log, which we want to capture anything from this class
*/
@ -174,12 +177,12 @@ public final class LocusIteratorByState extends LocusIterator {
* @param maintainUniqueReadsList if true, we will keep the unique reads from off the samIterator and make them
* available via the transferReadsFromAllPreviousPileups interface
*/
protected LocusIteratorByState(final Iterator<GATKSAMRecord> samIterator,
final LIBSDownsamplingInfo downsamplingInfo,
final boolean includeReadsWithDeletionAtLoci,
final GenomeLocParser genomeLocParser,
final Collection<String> samples,
final boolean maintainUniqueReadsList) {
public LocusIteratorByState(final Iterator<GATKSAMRecord> samIterator,
final LIBSDownsamplingInfo downsamplingInfo,
final boolean includeReadsWithDeletionAtLoci,
final GenomeLocParser genomeLocParser,
final Collection<String> samples,
final boolean maintainUniqueReadsList) {
if ( samIterator == null ) throw new IllegalArgumentException("samIterator cannot be null");
if ( downsamplingInfo == null ) throw new IllegalArgumentException("downsamplingInfo cannot be null");
if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null");
@ -252,9 +255,15 @@ public final class LocusIteratorByState extends LocusIterator {
* Will return null if cannot reach position (because we run out of data in the locus)
*
* @param position the start position of the AlignmentContext we want back
* @param stopAtFirstNonEmptySiteAfterPosition if true, we will stop as soon as we find a context with data with
* position >= position, otherwise we will return a null value
* and consume the data for the next position. This means that without
* specifying this value the LIBS will be in an indeterminate state
* after calling this function, and should be reconstructed from scratch
* for subsequent use
* @return a AlignmentContext at position, or null if this isn't possible
*/
public AlignmentContext advanceToLocus(final int position) {
public AlignmentContext advanceToLocus(final int position, final boolean stopAtFirstNonEmptySiteAfterPosition) {
while ( hasNext() ) {
final AlignmentContext context = next();
@ -262,8 +271,11 @@ public final class LocusIteratorByState extends LocusIterator {
// we ran out of data
return null;
if ( context.getPosition() == position)
if ( context.getPosition() == position )
return context;
if ( context.getPosition() > position)
return stopAtFirstNonEmptySiteAfterPosition ? context : null;
}
return null;

View File

@ -32,6 +32,8 @@ import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays;
import static java.lang.Math.log10;
/**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
*
@ -51,6 +53,9 @@ public final class Log10PairHMM extends N2MemoryPairHMM {
private static final int matchToDeletion = 4;
private static final int deletionToDeletion = 5;
// we divide e by 3 because the observed base could have come from any of the non-observed alleles
protected final static double log10_3 = log10(3.0);
/**
* Create an uninitialized PairHMM
*
@ -148,7 +153,7 @@ public final class Log10PairHMM extends N2MemoryPairHMM {
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
QualityUtils.qualToProbLog10(qual) : (QualityUtils.qualToErrorProbLog10(qual) - (doNotUseTristateCorrection ? 0.0 : log10_3)) );
}
}
}

View File

@ -44,6 +44,10 @@ abstract class N2MemoryPairHMM extends PairHMM {
protected double[][] insertionMatrix = null;
protected double[][] deletionMatrix = null;
// only used for debugging purposes
// Flag read by subclasses (e.g. Log10PairHMM's mismatch prior) to decide whether the
// log10(3) "tristate" correction is subtracted; false means the correction is applied.
protected boolean doNotUseTristateCorrection = false;
// One-way switch: turns the tristate correction off for this instance. Nothing here turns it back on.
protected void doNotUseTristateCorrection() { doNotUseTristateCorrection = true; }
/**
* Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths
*

View File

@ -30,10 +30,7 @@ import com.google.java.contract.Requires;
import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -214,34 +211,52 @@ public class ReadUtils {
* CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig.
*/
public static int getAdaptorBoundary(final SAMRecord read) {
final int MAXIMUM_ADAPTOR_LENGTH = 8;
final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value)
if (insertSize == 0 || read.getReadUnmappedFlag()) // no adaptors in reads with mates in another chromosome or unmapped pairs
return CANNOT_COMPUTE_ADAPTOR_BOUNDARY;
if ( read.getReadPairedFlag() && read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag() ) {
// note that the read.getProperPairFlag() is not reliably set, so many reads may have this tag but still be overlapping
// logger.info(String.format("Read %s start=%d end=%d insert=%d mateStart=%d readNeg=%b mateNeg=%b not properly paired, returning CANNOT_COMPUTE_ADAPTOR_BOUNDARY",
// read.getReadName(), read.getAlignmentStart(), read.getAlignmentEnd(), insertSize, read.getMateAlignmentStart(),
// read.getReadNegativeStrandFlag(), read.getMateNegativeStrandFlag()));
if ( ! hasWellDefinedFragmentSize(read) ) {
return CANNOT_COMPUTE_ADAPTOR_BOUNDARY;
} else if ( read.getReadNegativeStrandFlag() ) {
return read.getMateAlignmentStart() - 1; // case 1 (see header)
} else {
final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value)
return read.getAlignmentStart() + insertSize + 1; // case 2 (see header)
}
int adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read)
if (read.getReadNegativeStrandFlag())
adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header)
else
adaptorBoundary = read.getAlignmentStart() + insertSize + 1; // case 2 (see header)
if ( (adaptorBoundary < read.getAlignmentStart() - MAXIMUM_ADAPTOR_LENGTH) || (adaptorBoundary > read.getAlignmentEnd() + MAXIMUM_ADAPTOR_LENGTH) )
adaptorBoundary = CANNOT_COMPUTE_ADAPTOR_BOUNDARY; // we are being conservative by not allowing the adaptor boundary to go beyond what we belive is the maximum size of an adaptor
return adaptorBoundary;
}
public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE;
/**
 * Decide whether the alignments of a read and its mate describe a fragment clearly enough
 * that adaptor sequence could be reliably removed from the read.
 *
 * The read qualifies only when all of the following hold:
 * a non-zero inferred insert size, the read is paired, both the read and its mate are mapped,
 * the two mates are on opposite strands, and the read's position relative to its mate is
 * consistent with its own strand.
 *
 * @param read the read to check
 * @return true if it can, false otherwise
 */
public static boolean hasWellDefinedFragmentSize(final SAMRecord read) {
    final int fragmentLength = read.getInferredInsertSize();

    // The checks are chained with && in this order on purpose: the mate accessors are only
    // touched once we know the read is paired.
    //   - insert size of 0: no adaptors in reads with mates in another chromosome or unmapped pairs
    //   - unpaired reads can't be adaptor trimmed
    //   - both the read and its mate have to be mapped
    //   - read and mate on the same strand is not a sane pair
    // NOTE: the proper pair flag is deliberately NOT consulted here. It isn't always set
    // properly in BAMs, so requiring it would stop us from handling some genuine proper pairs.
    final boolean pairLooksSane =
            fragmentLength != 0
            && read.getReadPairedFlag()
            && ! read.getReadUnmappedFlag()
            && ! read.getMateUnmappedFlag()
            && read.getReadNegativeStrandFlag() != read.getMateNegativeStrandFlag();

    if ( ! pairLooksSane )
        return false;

    final int mateStart = read.getMateAlignmentStart();
    return read.getReadNegativeStrandFlag()
            // negative strand: our read runs right to left, so it must end after the mate starts
            ? read.getAlignmentEnd() > mateStart
            // positive strand: our mate should be to our right (mate start + insert size must not be before our start)
            : read.getAlignmentStart() <= mateStart + fragmentLength;
}
/**
* is the read a 454 read?
*

View File

@ -69,10 +69,20 @@ public class SWPairwiseAlignment implements SmithWaterman {
* Add softclips for the overhangs
*/
SOFTCLIP,
/*
* Treat the overhangs as proper insertions/deletions
*/
INDEL,
/*
* Treat the overhangs as proper insertions/deletions for leading (but not trailing) overhangs.
* This is useful e.g. when we want to merge dangling tails in an assembly graph: because we don't
* expect the dangling tail to reach the end of the reference path we are okay ignoring trailing
* deletions - but leading indels are still very much relevant.
*/
LEADING_INDEL,
/*
* Just ignore the overhangs
*/
@ -125,10 +135,11 @@ public class SWPairwiseAlignment implements SmithWaterman {
*
* @param seq1 the first sequence we want to align
* @param seq2 the second sequence we want to align
* @param parameters the SW parameters to use
* @param strategy the overhang strategy to use
*/
public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final OVERHANG_STRATEGY strategy) {
this(SWParameterSet.ORIGINAL_DEFAULT.parameters);
public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final SWParameterSet parameters, final OVERHANG_STRATEGY strategy) {
this(parameters.parameters);
overhang_strategy = strategy;
align(seq1, seq2);
}
@ -226,7 +237,7 @@ public class SWPairwiseAlignment implements SmithWaterman {
final int[] gap_size_h = new int[n+1];
// we need to initialize the SW matrix with gap penalties if we want to keep track of indels at the edges of alignments
if ( overhang_strategy == OVERHANG_STRATEGY.INDEL ) {
if ( overhang_strategy == OVERHANG_STRATEGY.INDEL || overhang_strategy == OVERHANG_STRATEGY.LEADING_INDEL ) {
// initialize the first row
sw[1] = parameters.w_open;
double currentValue = parameters.w_open;
@ -371,7 +382,7 @@ public class SWPairwiseAlignment implements SmithWaterman {
p1 = refLength;
p2 = altLength;
} else {
// look for largest score. we use >= combined with the traversal direction
// look for the largest score on the rightmost column. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1, data_offset = altLength+1+altLength ; i < refLength+1 ; i++, data_offset += (altLength+1) ) {
// data_offset is the offset of [i][m]
@ -380,18 +391,21 @@ public class SWPairwiseAlignment implements SmithWaterman {
}
}
for ( int j = 1, data_offset = refLength*(altLength+1)+1 ; j < altLength+1 ; j++, data_offset++ ) {
// data_offset is the offset of [n][j]
if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(refLength-j) < Math.abs(p1 - p2)) {
p1 = refLength;
p2 = j ;
maxscore = sw[data_offset];
segment_length = altLength - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
// now look for a larger score on the bottom-most row
if ( overhang_strategy != OVERHANG_STRATEGY.LEADING_INDEL ) {
for ( int j = 1, data_offset = refLength*(altLength+1)+1 ; j < altLength+1 ; j++, data_offset++ ) {
// data_offset is the offset of [n][j]
if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(refLength-j) < Math.abs(p1 - p2)) {
p1 = refLength;
p2 = j ;
maxscore = sw[data_offset];
segment_length = altLength - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
}
}
List<CigarElement> lce = new ArrayList<CigarElement>(5);
final List<CigarElement> lce = new ArrayList<CigarElement>(5);
if ( segment_length > 0 && overhang_strategy == OVERHANG_STRATEGY.SOFTCLIP ) {
lce.add(makeElement(State.CLIP, segment_length));
@ -452,7 +466,7 @@ public class SWPairwiseAlignment implements SmithWaterman {
} else if ( overhang_strategy == OVERHANG_STRATEGY.IGNORE ) {
lce.add(makeElement(state, segment_length + p2));
alignment_offset = p1 - p2;
} else { // overhang_strategy == OVERHANG_STRATEGY.INDEL
} else { // overhang_strategy == OVERHANG_STRATEGY.INDEL || overhang_strategy == OVERHANG_STRATEGY.LEADING_INDEL
// take care of the actual alignment
lce.add(makeElement(state, segment_length));

View File

@ -131,6 +131,11 @@ public abstract class BaseTest {
public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta";
public final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam";
public final static String NA12878_WEx = privateTestDir + "CEUTrio.HiSeq.WEx.b37_decoy.NA12878.20_10_11mb.bam";
/**
 * True only when the JVM system property "pipeline.run" is set to exactly "run".
 * Pipeline tests that submit real jobs check this flag and skip themselves when it is false.
 *
 * The literal is the receiver of equals() so that an unset property (System.getProperty
 * returns null) evaluates to false instead of throwing a NullPointerException from this
 * class's static initialization.
 */
public static final boolean pipelineTestRunModeIsSet = "run".equals(System.getProperty("pipeline.run"));
/** before the class starts up */
static {
// setup a basic log configuration

View File

@ -29,6 +29,7 @@ import org.apache.commons.io.FileUtils;
import org.broadinstitute.sting.BaseTest;
import org.ggf.drmaa.*;
import org.testng.Assert;
import org.testng.SkipException;
import org.testng.annotations.Test;
import java.io.File;
@ -51,19 +52,23 @@ public class JnaSessionPipelineTest extends BaseTest {
@Test(dependsOnMethods = { "testDrmaa" })
public void testSubmitEcho() throws Exception {
if ( ! pipelineTestRunModeIsSet ) {
throw new SkipException("Skipping testSubmitEcho because we are in pipeline test dry run mode");
}
if (implementation.contains("LSF")) {
System.err.println(" ***********************************************************");
System.err.println(" *************************************************************");
System.err.println(" **** ****");
System.err.println(" **** Skipping JnaSessionIntegrationTest.testSubmitEcho() ****");
System.err.println(" **** Skipping JnaSessionPipelineTest.testSubmitEcho() ****");
System.err.println(" **** Are you using the dotkit .combined_LSF_SGE? ****");
System.err.println(" **** ****");
System.err.println(" *************************************************************");
System.err.println(" ***********************************************************");
return;
throw new SkipException("Skipping testSubmitEcho because correct DRMAA implementation not found");
}
File outFile = tryCreateNetworkTempFile("JnaSessionIntegrationTest.out");
File outFile = tryCreateNetworkTempFile("JnaSessionPipelineTest.out");
Session session = factory.getSession();
session.init(null);
try {

View File

@ -34,6 +34,7 @@ import com.sun.jna.ptr.PointerByReference;
import org.apache.commons.io.FileUtils;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.SkipException;
import org.testng.annotations.Test;
import java.io.File;
@ -87,22 +88,26 @@ public class LibDrmaaPipelineTest extends BaseTest {
@Test(dependsOnMethods = { "testDrmaa" })
public void testSubmitEcho() throws Exception {
if ( ! pipelineTestRunModeIsSet ) {
throw new SkipException("Skipping testSubmitEcho because we are in pipeline test dry run mode");
}
if (implementation.contains("LSF")) {
System.err.println(" *********************************************************");
System.err.println(" ***********************************************************");
System.err.println(" **** ****");
System.err.println(" **** Skipping LibDrmaaIntegrationTest.testSubmitEcho() ****");
System.err.println(" **** Skipping LibDrmaaPipelineTest.testSubmitEcho() ****");
System.err.println(" **** Are you using the dotkit .combined_LSF_SGE? ****");
System.err.println(" **** ****");
System.err.println(" ***********************************************************");
System.err.println(" *********************************************************");
return;
throw new SkipException("Skipping testSubmitEcho because correct DRMAA implementation not found");
}
Memory error = new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER);
int errnum;
File outFile = tryCreateNetworkTempFile("LibDrmaaIntegrationTest.out");
File outFile = tryCreateNetworkTempFile("LibDrmaaPipelineTest.out");
errnum = LibDrmaa.drmaa_init(null, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN);

View File

@ -30,6 +30,7 @@ import com.sun.jna.ptr.IntByReference;
import org.apache.commons.io.FileUtils;
import org.broadinstitute.sting.utils.Utils;
import org.testng.Assert;
import org.testng.SkipException;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import org.broadinstitute.sting.BaseTest;
@ -43,7 +44,7 @@ import java.io.File;
public class LibBatPipelineTest extends BaseTest {
@BeforeClass
public void initLibBat() {
Assert.assertFalse(LibBat.lsb_init("LibBatIntegrationTest") < 0, LibBat.lsb_sperror("lsb_init() failed"));
Assert.assertFalse(LibBat.lsb_init("LibBatPipelineTest") < 0, LibBat.lsb_sperror("lsb_init() failed"));
}
@Test
@ -93,15 +94,19 @@ public class LibBatPipelineTest extends BaseTest {
@Test
public void testSubmitEcho() throws Exception {
if ( ! pipelineTestRunModeIsSet ) {
throw new SkipException("Skipping testSubmitEcho because we are in pipeline test dry run mode");
}
String queue = "hour";
File outFile = tryCreateNetworkTempFile("LibBatIntegrationTest.out");
File outFile = tryCreateNetworkTempFile("LibBatPipelineTest.out");
submit req = new submit();
for (int i = 0; i < LibLsf.LSF_RLIM_NLIMITS; i++)
req.rLimits[i] = LibLsf.DEFAULT_RLIMIT;
req.projectName = "LibBatIntegrationTest";
req.projectName = "LibBatPipelineTest";
req.options |= LibBat.SUB_PROJECT_NAME;
req.queue = queue;

View File

@ -26,9 +26,11 @@
package org.broadinstitute.sting.utils;
import cern.jet.random.Normal;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
@ -47,7 +49,7 @@ public class MathUtilsUnitTest extends BaseTest {
@Test
public void testGenerateUniqueHashFromThreePositiveIntegers() {
logger.warn("Executing testGenerateUniqueHashFromThreePositiveIntegers");
final Set<Long> observedLongs = new HashSet<Long>();
for (short i = 0; i < Byte.MAX_VALUE; i++) {
for (short j = 0; j < Byte.MAX_VALUE; j++) {
@ -97,7 +99,7 @@ public class MathUtilsUnitTest extends BaseTest {
final int numTrials = 10;
for ( int i = 0; i < numTrials; i++ )
Assert.assertEquals(MathUtils.binomialCumulativeProbability(numTrials, i, i), MathUtils.binomialProbability(numTrials, i), 1e-10, String.format("k=%d, n=%d", i, numTrials));
Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 2), 0.05468750, 1e-7);
Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 5), 0.62304687, 1e-7);
Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 10), 1.0, 1e-7);
@ -271,7 +273,7 @@ public class MathUtilsUnitTest extends BaseTest {
@Test
public void testApproximateLog10SumLog10() {
final double requiredPrecision = 1E-4;
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0}), 0.0, requiredPrecision);
@ -446,4 +448,74 @@ public class MathUtilsUnitTest extends BaseTest {
}
}
}
/**
 * Test cases for the arrayMin tests: each row is { list of integers, expected minimum }.
 */
@DataProvider(name = "ArrayMinData")
public Object[][] makeArrayMinData() {
    final List<Object[]> cases = new ArrayList<Object[]>();

    // single-element lists, positive and negative
    cases.add(new Object[]{Arrays.asList(10), 10});
    cases.add(new Object[]{Arrays.asList(-10), -10});

    // lists built by Utils.makePermutations from an all-positive triple: minimum is 1
    for ( final List<Integer> ordering : Utils.makePermutations(Arrays.asList(1,2,3), 3, false) )
        cases.add(new Object[]{ordering, 1});

    // same, but with a negative value in the mix: minimum is -3
    for ( final List<Integer> ordering : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) )
        cases.add(new Object[]{ordering, -3});

    return cases.toArray(new Object[cases.size()][]);
}
/**
 * MathUtils.arrayMin on the list itself must return the expected minimum.
 */
@Test(dataProvider = "ArrayMinData")
public void testArrayMinList(final List<Integer> values, final int expected) {
    final String failureMessage = "Failed with " + values;
    final int observedMin = MathUtils.arrayMin(values);
    Assert.assertEquals(observedMin, expected, failureMessage);
}
/**
 * MathUtils.arrayMin on an int[] copy of the values must return the expected minimum.
 */
@Test(dataProvider = "ArrayMinData")
public void testArrayMinIntArray(final List<Integer> values, final int expected) {
    // go List<Integer> -> Integer[] -> int[] to reach the primitive-array overload
    final Integer[] boxed = values.toArray(new Integer[values.size()]);
    final int[] primitives = ArrayUtils.toPrimitive(boxed);

    final int observedMin = MathUtils.arrayMin(primitives);
    Assert.assertEquals(observedMin, expected, "Failed with " + values);
}
/**
 * MathUtils.arrayMin on a byte[] copy of the values must return the expected minimum,
 * with both the inputs and the expectation narrowed to byte the same way.
 */
@Test(dataProvider = "ArrayMinData")
public void testArrayMinByteArray(final List<Integer> values, final int expected) {
    final byte[] bytes = new byte[values.size()];
    int next = 0;
    for ( final int value : values )
        bytes[next++] = (byte)(value & 0xFF);

    final byte observedMin = MathUtils.arrayMin(bytes);
    final byte expectedMin = (byte)(expected & 0xFF);
    Assert.assertEquals(observedMin, expectedMin, "Failed with " + values);
}
/**
 * MathUtils.arrayMin on a double[] copy of the values must return the expected minimum.
 */
@Test(dataProvider = "ArrayMinData")
public void testArrayMinDoubleArray(final List<Integer> values, final int expected) {
    final double[] doubles = new double[values.size()];
    int next = 0;
    for ( final int value : values )
        doubles[next++] = (double) value;

    final double observedMin = MathUtils.arrayMin(doubles);
    final double expectedMin = (double) expected;
    Assert.assertEquals(observedMin, expectedMin, "Failed with " + values);
}
/**
 * Test cases for testMedian: each row is { list of integers, expected median }.
 */
@DataProvider(name = "MedianData")
public Object[][] makeMedianData() {
    final List<Object[]> cases = new ArrayList<Object[]>();

    // one element: it is its own median
    cases.add(new Object[]{Arrays.asList(10), 10});
    // two elements: this test expects 10 for [1, 10]
    cases.add(new Object[]{Arrays.asList(1, 10), 10});

    // lists built by Utils.makePermutations from {1, 2, -3}: expected median is 1
    for ( final List<Integer> ordering : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) )
        cases.add(new Object[]{ordering, 1});

    return cases.toArray(new Object[cases.size()][]);
}
/**
 * MathUtils.median must return the expected value for each MedianData row.
 */
@Test(dataProvider = "MedianData")
public void testMedian(final List<Integer> values, final int expected) {
    final String failureMessage = "Failed with " + values;
    final int observedMedian = MathUtils.median(values);
    Assert.assertEquals(observedMedian, expected, failureMessage);
}
}

View File

@ -224,7 +224,7 @@ public class FragmentUtilsUnitTest extends BaseTest {
}
@Test(enabled = !DEBUG, dataProvider = "MergeFragmentsTest")
public void testMergingTwoReads(final String name, final GATKSAMRecord read1, GATKSAMRecord read2, final GATKSAMRecord expectedMerged) {
public void testMergingTwoReads(final String name, final GATKSAMRecord read1, final GATKSAMRecord read2, final GATKSAMRecord expectedMerged) {
final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2);
if ( expectedMerged == null ) {
@ -253,17 +253,22 @@ public class FragmentUtilsUnitTest extends BaseTest {
final GATKSAMRecord expectedMerged = makeOverlappingRead("", 30, common, commonQuals, "", 30, 10);
read1.setCigarString("4S" + common.length() + "M");
read1.setProperPairFlag(true);
read1.setReadPairedFlag(true);
read1.setFirstOfPairFlag(true);
read1.setReadNegativeStrandFlag(true);
read1.setMateAlignmentStart(10);
read1.setMateNegativeStrandFlag(false);
read1.setMateAlignmentStart(read2.getAlignmentStart());
read2.setCigarString(common.length() + "M4S");
read2.setProperPairFlag(true);
read2.setReadPairedFlag(true);
read2.setFirstOfPairFlag(false);
read2.setReadNegativeStrandFlag(false);
read2.setMateNegativeStrandFlag(true);
read2.setMateAlignmentStart(read1.getAlignmentStart());
final int insertSize = common.length() - 1;
read1.setInferredInsertSize(insertSize);
read2.setInferredInsertSize(-insertSize);
read1.setInferredInsertSize(-insertSize);
read2.setInferredInsertSize(insertSize);
final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2);
Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString());
@ -344,4 +349,42 @@ public class FragmentUtilsUnitTest extends BaseTest {
read.setReadGroup(new GATKSAMReadGroupRecord("foo"));
return read;
}
private static final byte highQuality = 30;
private static final byte overlappingQuality = 20;
/**
 * Test cases for testAdjustingTwoReads: each row is { read1, read2, overlapSize }.
 *
 * For every overlap size from 1 up to (but not including) the length of the overlap template,
 * read1 is built as leftFlank + overlap and read2 as overlap + rightFlank, with every
 * base quality passed to makeOverlappingRead set to highQuality.
 */
@DataProvider(name = "AdjustFragmentsTest")
public Object[][] createAdjustFragmentsTest() throws Exception {
    final List<Object[]> cases = new ArrayList<Object[]>();

    final String leftFlank = "CCC";
    final String rightFlank = "AAA";
    final String overlapTemplate = "ACGTACGTGGAACCTTAG";
    final int maxOverlap = overlapTemplate.length();

    for ( int overlapSize = 1; overlapSize < maxOverlap; overlapSize++ ) {
        final String sharedBases = overlapTemplate.substring(0, overlapSize);

        final byte[] sharedQuals = new byte[overlapSize];
        for ( int q = 0; q < sharedQuals.length; q++ )
            sharedQuals[q] = highQuality;

        // read1 starts at 1; read2 starts right after read1's left flank
        final GATKSAMRecord read1 = makeOverlappingRead(leftFlank, highQuality, sharedBases, sharedQuals, "", highQuality, 1);
        final GATKSAMRecord read2 = makeOverlappingRead("", highQuality, sharedBases, sharedQuals, rightFlank, highQuality, leftFlank.length() + 1);

        cases.add(new Object[]{read1, read2, overlapSize});
    }

    return cases.toArray(new Object[cases.size()][]);
}
/**
 * After FragmentUtils.adjustQualsOfOverlappingPairedFragments, the last overlapSize bases of read1
 * and the first overlapSize bases of read2 must carry overlappingQuality, while every other base
 * keeps highQuality.
 */
@Test(enabled = !DEBUG, dataProvider = "AdjustFragmentsTest")
public void testAdjustingTwoReads(final GATKSAMRecord read1, final GATKSAMRecord read2, final int overlapSize) {
    FragmentUtils.adjustQualsOfOverlappingPairedFragments(read1, read2);

    // read1: untouched prefix, then the overlapping tail
    assertQualsInRange(read1, 0, read1.getReadLength() - overlapSize, highQuality);
    assertQualsInRange(read1, read1.getReadLength() - overlapSize, read1.getReadLength(), overlappingQuality);

    // read2: overlapping head, then the untouched suffix
    assertQualsInRange(read2, 0, overlapSize, overlappingQuality);
    assertQualsInRange(read2, overlapSize, read2.getReadLength(), highQuality);
}

// Asserts that every base quality of read at offsets [from, to) equals expected.
private static void assertQualsInRange(final GATKSAMRecord read, final int from, final int to, final byte expected) {
    for ( int offset = from; offset < to; offset++ )
        Assert.assertEquals(read.getBaseQualities()[offset], expected);
}
}

View File

@ -54,7 +54,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
private static final boolean DEBUG = false;
protected LocusIteratorByState li;
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testUnmappedAndAllIReadsPassThrough() {
final int readLength = 10;
GATKSAMRecord mapped1 = ArtificialSAMUtils.createArtificialRead(header,"mapped1",0,1,readLength);
@ -697,24 +697,28 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
final List<Object[]> tests = new LinkedList<Object[]>();
final int start = 10;
// for ( final int goodBases : Arrays.asList(10) ) {
// for ( final int nClipsOnTheRight : Arrays.asList(0)) {
for ( final int goodBases : Arrays.asList(10, 20, 30) ) {
for ( final int nClips : Arrays.asList(0, 1, 2, 10)) {
for ( final boolean onLeft : Arrays.asList(true, false) ) {
final int readLength = nClips + goodBases;
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1" , 0, start, readLength);
read.setProperPairFlag(true);
read.setReadPairedFlag(true);
read.setReadUnmappedFlag(false);
read.setMateUnmappedFlag(false);
read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
read.setBaseQualities(Utils.dupBytes((byte) '@', readLength));
read.setCigarString(readLength + "M");
if ( onLeft ) {
read.setReadNegativeStrandFlag(true);
read.setMateNegativeStrandFlag(false);
read.setMateAlignmentStart(start + nClips);
read.setInferredInsertSize(readLength);
tests.add(new Object[]{nClips, goodBases, 0, read});
} else {
read.setReadNegativeStrandFlag(false);
read.setMateNegativeStrandFlag(true);
read.setMateAlignmentStart(start - 1);
read.setInferredInsertSize(goodBases - 1);
tests.add(new Object[]{0, goodBases, nClips, read});
@ -723,29 +727,13 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
}
}
// for ( final int nClipsOnTheLeft : Arrays.asList(0, 1, 2, 10)) {
// final int readLength = nClipsOnTheLeft + goodBases;
// GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1" , 0, start, readLength);
// read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
// read.setBaseQualities(Utils.dupBytes((byte) '@', readLength));
// read.setCigarString(readLength + "M");
// read.setReadNegativeStrandFlag(true);
//
// read.setMateAlignmentStart(start + nClipsOnTheLeft);
// read.setInferredInsertSize(readLength);
//
// tests.add(new Object[]{nClipsOnTheLeft, goodBases, 0, read});
// }
// }
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "AdapterClippingTest")
// @Test(enabled = true, dataProvider = "LIBS_NotHoldingTooManyReads", timeOut = 100000)
public void testAdapterClipping(final int nClipsOnLeft, final int nReadContainingPileups, final int nClipsOnRight, final GATKSAMRecord read) {
li = new LocusIteratorByState(new FakeCloseableIterator<GATKSAMRecord>(Collections.singletonList(read).iterator()),
li = new LocusIteratorByState(new FakeCloseableIterator<>(Collections.singletonList(read).iterator()),
createTestReadProperties(DownsamplingMethod.NONE, false),
genomeLocParser,
LocusIteratorByState.sampleListForSAMWithoutReadGroups());
@ -755,10 +743,6 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
while ( li.hasNext() ) {
final AlignmentContext next = li.next();
Assert.assertEquals(next.getLocation().getStart(), expectedPos);
// if ( nPileups < nClipsOnLeft || nPileups > (nClipsOnLeft + nReadContainingPileups) )
// Assert.assertEquals(next.getBasePileup().getNumberOfElements(), 0, "Expected empty pileups when the read is in the adapter clipping zone at " + nPileups);
// else
// Assert.assertEquals(next.getBasePileup().getNumberOfElements(), 1, "Expected a pileups with 1 element when the read is within the good part of the read at " + nPileups);
nPileups++;
expectedPos++;
}

View File

@ -59,7 +59,7 @@ public class PerSampleReadStateManagerUnitTest extends LocusIteratorByStateBaseT
}
public void run() {
PerSampleReadStateManager perSampleReadStateManager = new PerSampleReadStateManager(LIBSDownsamplingInfo.NO_DOWNSAMPLING);
PerSampleReadStateManager perSampleReadStateManager = new PerSampleReadStateManager(LocusIteratorByState.NO_DOWNSAMPLING);
makeReads();

View File

@ -66,6 +66,8 @@ public class ReadUtilsUnitTest extends BaseTest {
final byte[] quals = {30, 30, 30, 30, 30, 30, 30, 30};
final String cigar = "8M";
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar);
read.setProperPairFlag(true);
read.setReadPairedFlag(true);
read.setMateAlignmentStart(mateStart);
read.setInferredInsertSize(fragmentSize);
return read;
@ -85,6 +87,7 @@ public class ReadUtilsUnitTest extends BaseTest {
myStart = BEFORE;
read.setAlignmentStart(myStart);
read.setReadNegativeStrandFlag(false);
read.setMateNegativeStrandFlag(true);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, myStart + fragmentSize + 1);
@ -93,6 +96,7 @@ public class ReadUtilsUnitTest extends BaseTest {
myStart = AFTER;
read.setAlignmentStart(myStart);
read.setReadNegativeStrandFlag(false);
read.setMateNegativeStrandFlag(true);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, myStart + fragmentSize + 1);
@ -101,6 +105,7 @@ public class ReadUtilsUnitTest extends BaseTest {
myStart = AFTER;
read.setAlignmentStart(myStart);
read.setReadNegativeStrandFlag(true);
read.setMateNegativeStrandFlag(false);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, mateStart - 1);
@ -109,6 +114,7 @@ public class ReadUtilsUnitTest extends BaseTest {
myStart = BEFORE;
read.setAlignmentStart(myStart);
read.setReadNegativeStrandFlag(true);
read.setMateNegativeStrandFlag(false);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, mateStart - 1);
@ -116,9 +122,11 @@ public class ReadUtilsUnitTest extends BaseTest {
read = makeRead(fragmentSize, mateStart);
read.setInferredInsertSize(0);
read.setReadNegativeStrandFlag(true);
read.setMateNegativeStrandFlag(false);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY);
read.setReadNegativeStrandFlag(false);
read.setMateNegativeStrandFlag(true);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY);
read.setInferredInsertSize(10);
@ -226,4 +234,91 @@ public class ReadUtilsUnitTest extends BaseTest {
final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL);
Assert.assertEquals(result, 3);
}
/**
 * Test cases for testHasWellDefinedFragmentSize: each row is { case name, read, expected result }.
 *
 * A single known-good template read is built first; every other case is a clone of it with
 * one aspect changed.
 */
@DataProvider(name = "HasWellDefinedFragmentSizeData")
public Object[][] makeHasWellDefinedFragmentSizeData() throws Exception {
    final List<Object[]> cases = new LinkedList<Object[]>();

    // template: paired, both mates mapped, forward-strand read (start 100, cigar 50M)
    // with a reverse-strand mate starting at 130 and an insert size of 80
    final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader();
    final GATKSAMRecord template = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10);
    template.setReadPairedFlag(true);
    template.setProperPairFlag(true);
    template.setReadUnmappedFlag(false);
    template.setMateUnmappedFlag(false);
    template.setAlignmentStart(100);
    template.setCigarString("50M");
    template.setMateAlignmentStart(130);
    template.setInferredInsertSize(80);
    template.setFirstOfPairFlag(true);
    template.setReadNegativeStrandFlag(false);
    template.setMateNegativeStrandFlag(true);

    cases.add( new Object[]{ "basic case", template.clone(), true });

    final GATKSAMRecord unpaired = (GATKSAMRecord)template.clone();
    unpaired.setReadPairedFlag(false);
    cases.add( new Object[]{ "not paired", unpaired, false });

    // we currently don't require the proper pair flag to be set, so this one is still expected to pass
    final GATKSAMRecord notProperPair = (GATKSAMRecord)template.clone();
    notProperPair.setProperPairFlag(false);
    cases.add( new Object[]{ "not proper pair", notProperPair, true });

    final GATKSAMRecord unmappedRead = (GATKSAMRecord)template.clone();
    unmappedRead.setReadUnmappedFlag(true);
    cases.add( new Object[]{ "read is unmapped", unmappedRead, false });

    final GATKSAMRecord unmappedMate = (GATKSAMRecord)template.clone();
    unmappedMate.setMateUnmappedFlag(true);
    cases.add( new Object[]{ "mate is unmapped", unmappedMate, false });

    final GATKSAMRecord bothPositive = (GATKSAMRecord)template.clone();
    bothPositive.setMateNegativeStrandFlag(false);
    cases.add( new Object[]{ "read and mate both on positive strand", bothPositive, false });

    final GATKSAMRecord bothNegative = (GATKSAMRecord)template.clone();
    bothNegative.setReadNegativeStrandFlag(true);
    cases.add( new Object[]{ "read and mate both on negative strand", bothNegative, false });

    final GATKSAMRecord zeroInsert = (GATKSAMRecord)template.clone();
    zeroInsert.setInferredInsertSize(0);
    cases.add( new Object[]{ "insert size is 0", zeroInsert, false });

    final GATKSAMRecord positiveTooFarRight = (GATKSAMRecord)template.clone();
    positiveTooFarRight.setAlignmentStart(1000);
    cases.add( new Object[]{ "positve read starts after mate end", positiveTooFarRight, false });

    final GATKSAMRecord negativeTooFarLeft = (GATKSAMRecord)template.clone();
    negativeTooFarLeft.setReadNegativeStrandFlag(true);
    negativeTooFarLeft.setMateNegativeStrandFlag(false);
    negativeTooFarLeft.setMateAlignmentStart(1000);
    cases.add( new Object[]{ "negative strand read ends before mate starts", negativeTooFarLeft, false });

    return cases.toArray(new Object[cases.size()][]);
}
/**
 * Verifies that {@code ReadUtils.hasWellDefinedFragmentSize} returns the expected
 * answer for a single case supplied by the "HasWellDefinedFragmentSizeData" provider.
 *
 * @param name     human-readable description of the case; used as the assertion message
 *                 so that a failure identifies which scenario broke
 * @param read     the read to evaluate
 * @param expected the value {@code hasWellDefinedFragmentSize} should return for {@code read}
 */
@Test(dataProvider = "HasWellDefinedFragmentSizeData")
private void testHasWellDefinedFragmentSize(final String name, final GATKSAMRecord read, final boolean expected) {
    final boolean actual = ReadUtils.hasWellDefinedFragmentSize(read);
    // Include the case name in the failure output; previously it was accepted but never used.
    Assert.assertEquals(actual, expected, "hasWellDefinedFragmentSize mismatch for case: " + name);
}
}

View File

@ -0,0 +1,33 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.engine.pbsengine
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.queue.engine.drmaa.DrmaaJobManager
/**
 * DRMAA job manager variant whose runners are PbsEngineJobRunner instances.
 */
class PbsEngineJobManager extends DrmaaJobManager {
  /**
   * Builds the runner for a command line function.
   * @param function the command line function to run
   * @return a new PbsEngineJobRunner constructed from the in-scope `session` and `function`
   */
  override def create(function: CommandLineFunction) = {
    new PbsEngineJobRunner(session, function)
  }
}

View File

@ -0,0 +1,100 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.engine.pbsengine
import org.broadinstitute.sting.queue.util.Logging
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.queue.engine.drmaa.DrmaaJobRunner
import org.ggf.drmaa.Session
/**
* Runs jobs on a PBS/Torque Engine compute cluster.
* NB - THIS FILE HAS BEEN MODIFIED from the original code
* of the GridEngine package
*/
class PbsEngineJobRunner(session: Session, function: CommandLineFunction) extends DrmaaJobRunner(session, function) with Logging {
  // Pattern of characters that Pbs Engine does not accept in job names;
  // matches are replaced with underscores.
  protected override val jobNameFilter = """[\n\t\r/:,@\\*?]"""

  // Lower and upper bounds of the runner priority range for this engine.
  protected override val minRunnerPriority = -1023
  protected override val maxRunnerPriority = 0

  /**
   * Assembles the native specification string for this function.
   *
   * Each option is computed as its own fragment (empty when the option does not apply),
   * the fragments are concatenated in a fixed order, the result is logged at debug level,
   * and finally the superclass specification is appended and the whole string trimmed.
   */
  override protected def functionNativeSpec = {
    // " -P <project>" only when a project name is set
    val projectArg =
      if (function.jobProject == null) ""
      else " -P " + function.jobProject

    // " -q <queue>" when a queue is set, otherwise fall back to the "normal" queue
    val queueArg =
      if (function.jobQueue == null) " -q normal "
      else " -q " + function.jobQueue

    // Resident memory request: emitted only when both the configured resource name
    // and a requested amount exist. The amount is multiplied by 1024, rounded up,
    // and passed with an "M" suffix.
    val residentRequestArg =
      if (function.qSettings.residentRequestParameter != null && function.residentRequest.isDefined) {
        val requestedAmount = function.residentRequest.map(_ * 1024).get.ceil.toInt
        " -l %s=%dM".format(function.qSettings.residentRequestParameter, requestedAmount)
      } else {
        ""
      }

    // Resident memory limit, scaled and rounded the same way, passed as "mem"
    val residentLimitArg = function.residentLimit match {
      case Some(limit) => " -l mem=%dM".format((limit * 1024).ceil.toInt)
      case None => ""
    }

    // Multi-core jobs: request all cores on a single node, unless the settings say
    // not to request multiple cores, in which case only a warning is logged.
    val coresArg =
      if (function.nCoresRequest.getOrElse(1) <= 1) {
        ""
      } else if (function.qSettings.dontRequestMultipleCores) {
        logger.warn("Sending multicore job %s to farm without requesting appropriate number of cores (%d)".format(
          function.shortDescription, function.nCoresRequest.get))
        ""
      } else {
        " -l nodes=1:ppn=%d".format(function.nCoresRequest.get)
      }

    // Job resource requests are passed through verbatim, each preceded by a single blank.
    // No flag is prepended because PBS resource-like options can use different switches
    // (-l, but also -o or -j), so the user supplies the switch, e.g. -jobResReq "-j oe".
    val resourceArgs = function.jobResourceRequests.map(request => " " + request).mkString

    // Job environment names are passed through the same way
    val environmentArgs = function.jobEnvironmentNames.map(envName => " " + envName).mkString

    // " -p <priority>" only when a priority is defined
    val priorityArg = functionPriority.map(" -p " + _).getOrElse("")

    val nativeSpec = projectArg + queueArg + residentRequestArg + residentLimitArg +
      coresArg + resourceArgs + environmentArgs + priorityArg

    logger.debug("Native spec is: %s".format(nativeSpec))
    (nativeSpec + " " + super.functionNativeSpec).trim()
  }
}

View File

@ -45,13 +45,14 @@ object PipelineTest extends BaseTest with Logging {
private val validationReportsDataLocation = "/humgen/gsa-hpprojects/GATK/validationreports/submitted/"
private val md5DB = new MD5DB()
final val run = System.getProperty("pipeline.run") == "run"
final val allJobRunners = {
val commandLinePluginManager = new CommandLinePluginManager
commandLinePluginManager.getPlugins.map(commandLinePluginManager.getName(_)).toSeq
}
/**
* All the job runners configured to run PipelineTests at The Broad.
*/
final val allJobRunners = Seq("Lsf706", "GridEngine", "Shell")
/**
* The default job runners to run.
*/
final val defaultJobRunners = Seq("Lsf706", "GridEngine")
/**
@ -100,7 +101,7 @@ object PipelineTest extends BaseTest with Logging {
Assert.fail("PipelineTestSpec.name is null")
println(Utils.dupString('-', 80));
executeTest(name, pipelineTest.args, pipelineTest.jobQueue, pipelineTest.expectedException, jobRunner)
if (run) {
if (BaseTest.pipelineTestRunModeIsSet) {
assertMatchingMD5s(name, pipelineTest.fileMD5s.map{case (file, md5) => new File(runDir(name, jobRunner), file) -> md5}, pipelineTest.parameterize)
if (pipelineTest.evalSpec != null)
validateEval(name, pipelineTest.evalSpec, jobRunner)
@ -169,7 +170,7 @@ object PipelineTest extends BaseTest with Logging {
if (jobQueue != null)
command = Utils.appendArray(command, "-jobQueue", jobQueue)
if (run)
if (BaseTest.pipelineTestRunModeIsSet)
command = Utils.appendArray(command, "-run")
// run the executable

View File

@ -30,7 +30,9 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
import org.broadinstitute.sting.BaseTest
class ExampleRetryMemoryLimitPipelineTest {
@Test(timeOut=36000000)
// This test is currently disabled due to unexplained intermittent failures (see GSA-943)
@Test(timeOut=36000000,enabled = false)
def testRetryMemoryLimit() {
val spec = new PipelineTestSpec
spec.name = "RetryMemoryLimit"

View File

@ -0,0 +1,3 @@
chr1:1-100
chr1:200-400
chr1:450-459