Merge pull request #329 from broadinstitute/eb_more_sensitivity_improvements_to_the_HC

A whole slew of improvements to the Haplotype Caller and related code.
This commit is contained in:
delangel 2013-07-12 10:37:43 -07:00
commit 7ddf85c040
35 changed files with 347 additions and 1025 deletions

View File

@ -54,8 +54,6 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@ -147,7 +145,7 @@ public class StandardCallerArgumentCollection {
*/
@Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION;
public static final double DEFAULT_CONTAMINATION_FRACTION = 0.05;
public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0;
/**
* This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples.

View File

@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.genotyper.DiploidGenotype;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -94,7 +95,7 @@ import static java.lang.Math.pow;
*/
public class DiploidSNPGenotypeLikelihoods implements Cloneable {
public final static double DEFAULT_PCR_ERROR_RATE = 1e-4;
public final static double DEFAULT_PCR_ERROR_RATE = FragmentUtils.DEFAULT_PCR_ERROR_RATE;
protected final static int FIXED_PLOIDY = 2;
protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;

View File

@ -1,263 +0,0 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
/**
* DeBruijn assembler for the HaplotypeCaller
*
* User: ebanks, rpoplin
* Date: Mar 14, 2011
*/
public class DeBruijnAssembler extends LocalAssemblyEngine {
private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class);
// TODO -- this number is very low, and limits our ability to explore low-frequency variants. It should
// TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where
// TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases
private final static int NUM_PATHS_PER_GRAPH = 25;
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
private static final int GRAPH_KMER_STEP = 6;
private static final int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
private final int minKmer;
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
protected DeBruijnAssembler() {
this(25, -1);
}
public DeBruijnAssembler(final int minKmer, final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) {
super(NUM_PATHS_PER_GRAPH);
this.minKmer = minKmer;
this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
}
@Override
protected List<AssemblyResult> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
final List<AssemblyResult> results = new LinkedList<>();
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
if( maxKmer < minKmer) {
// Reads are too small for assembly so don't try to create any assembly graphs
return Collections.emptyList();
}
// create the graph for each possible kmer
for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) {
if ( debugGraphTransformations && kmer > onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms)
continue;
if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
DeBruijnGraph graph = createGraphFromSequences(reads, kmer, refHaplotype, activeAlleleHaplotypes);
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
// do a series of steps to clean up the raw assembly graph to make it analysis-ready
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor);
if ( shouldErrorCorrectKmers() ) {
throw new UserException("Error correction no longer supported because of the " +
"incredibly naive way this was implemented. The command line argument remains because some" +
" future subsystem will actually go and error correct the reads");
}
results.add(cleanupSeqGraph(graph.convertToSequenceGraph()));
if ( debugGraphTransformations ) // we only want to use one graph size
break;
}
}
return results;
}
@Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"})
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int kmerLength, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
final DeBruijnGraph graph = new DeBruijnGraph(kmerLength);
final DeBruijnGraphBuilder builder = new DeBruijnGraphBuilder(graph);
// First pull kmers from the reference haplotype and add them to the graph
if ( ! addReferenceKmersToGraph(builder, refHaplotype.getBases()) )
// something went wrong, so abort right now with a null graph
return null;
// add the artificial GGA haplotypes to the graph
if ( ! addGGAKmersToGraph(builder, activeAlleleHaplotypes) )
// something went wrong, so abort right now with a null graph
return null;
// now go through the graph already seeded with the reference sequence and add the read kmers to it
if ( ! addReadKmersToGraph(builder, reads) )
// some problem was detected adding the reads to the graph, return null to indicate we failed
return null;
graph.cleanNonRefPaths();
return graph;
}
/**
* Add the high-quality kmers from the artificial GGA haplotypes to the graph
*
* @param builder a debruijn graph builder to add the read kmers to
* @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
*/
protected boolean addGGAKmersToGraph(final DeBruijnGraphBuilder builder, final List<Haplotype> activeAlleleHaplotypes) {
final int kmerLength = builder.getKmerSize();
for( final Haplotype haplotype : activeAlleleHaplotypes ) {
final int end = haplotype.length() - kmerLength;
for( int start = 0; start < end; start++ ) {
builder.addKmerPairFromSeqToGraph( haplotype.getBases(), start, GGA_MODE_ARTIFICIAL_COUNTS );
}
}
// always returns true now, but it's possible that we'd add kmers and decide we don't like the graph in some way
return true;
}
/**
* Add the high-quality kmers from the reads to the graph
*
* @param builder a debruijn graph builder to add the read kmers to
* @param reads a non-null list of reads whose kmers we want to add to the graph
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
*/
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads) {
final int kmerLength = builder.getKmerSize();
// Next pull kmers out of every read and throw them on the graph
for( final GATKSAMRecord read : reads ) {
final byte[] sequence = read.getReadBases();
final byte[] qualities = read.getBaseQualities();
final int[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced
if ( sequence.length > kmerLength + KMER_OVERLAP ) {
int lastGood = -1; // the index of the last good base we've seen
for( int end = 0; end < sequence.length; end++ ) {
if ( qualities[end] < minBaseQualityToUseInAssembly ) {
lastGood = -1; // reset the last good base
} else if ( lastGood == -1 ) {
lastGood = end; // we're at a good base, the last good one is us
} else if ( end - kmerLength >= lastGood ) {
// end - kmerLength (the start) is after the lastGood base, so that kmer is good
final int start = end - kmerLength;
// how many observations of this kmer have we seen? A normal read counts for 1, but
// a reduced read might imply a higher multiplicity for our the edge
int countNumber = 1;
if ( read.isReducedRead() ) {
// compute mean number of reduced read counts in current kmer span
// precise rounding can make a difference with low consensus counts
// TODO -- optimization: should extend arrayMax function to take start stop values
countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, start, end));
}
builder.addKmerPairFromSeqToGraph(sequence, start, countNumber);
}
}
}
}
builder.flushKmersToGraph(false);
// always returns true now, but it's possible that we'd add reads and decide we don't like the graph in some way
return true;
}
/**
* Add the kmers from the reference sequence to the DeBruijnGraph
*
* @param builder the graph to add the reference kmers to. Must be empty
* @param refSequence the reference sequence from which we'll get our kmers
* @return true if we succeeded in creating a good graph from the reference sequence, false otherwise
*/
protected boolean addReferenceKmersToGraph(final DeBruijnGraphBuilder builder, final byte[] refSequence) {
if ( builder == null ) throw new IllegalArgumentException("graph cannot be null");
if ( builder.getGraph().vertexSet().size() != 0 )
throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + builder.getGraph().vertexSet().size() + " vertices in it already: " + builder.getGraph());
if ( refSequence == null ) throw new IllegalArgumentException("refSequence cannot be null");
final int kmerLength = builder.getKmerSize();
if( refSequence.length < kmerLength + KMER_OVERLAP ) {
// not enough reference sequence to build a kmer graph of this length, return null
return false;
}
final int kmersInSequence = refSequence.length - kmerLength + 1;
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
builder.addKmerPairFromSeqToGraph(refSequence, iii, 1);
}
builder.flushKmersToGraph(true);
// we expect that every kmer in the sequence is unique, so that the graph has exactly kmersInSequence vertices
if ( builder.getGraph().vertexSet().size() != kmersInSequence ) {
if( debug ) logger.info("Cycle detected in reference graph for kmer = " + kmerLength + " ...skipping");
return false;
}
return true;
}
@Override
public String toString() {
return "DeBruijnAssembler{" +
"minKmer=" + minKmer +
'}';
}
}

View File

@ -1,150 +0,0 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
/**
* Fast approach to building a DeBruijnGraph
*
* Follows the model:
*
* for each X that has bases for the final graph:
* addKmer pair (single kmer with kmer size + 1 spanning the pair)
*
* flushKmersToGraph
*
* User: depristo
* Date: 4/7/13
* Time: 4:14 PM
*/
public class DeBruijnGraphBuilder {
/** The size of the kmer graph we want to build */
private final int kmerSize;
/** The graph we're going to add kmers to */
private final DeBruijnGraph graph;
/** keeps counts of all kmer pairs added since the last flush */
private final KMerCounter counter;
/**
* Create a new builder that will write out kmers to graph
*
* @param graph a non-null graph that can contain already added kmers
*/
public DeBruijnGraphBuilder(final DeBruijnGraph graph) {
if ( graph == null ) throw new IllegalArgumentException("Graph cannot be null");
this.kmerSize = graph.getKmerSize();
this.graph = graph;
this.counter = new KMerCounter(kmerSize + 1);
}
/**
* The graph we're building
* @return a non-null graph
*/
public DeBruijnGraph getGraph() {
return graph;
}
/**
* The kmer size of our graph
* @return positive integer
*/
public int getKmerSize() {
return kmerSize;
}
/**
* Higher-level interface to #addKmersToGraph that adds a pair of kmers from a larger sequence of bytes to this
* graph. The kmers start at start (first) and start + 1 (second) have have length getKmerSize(). The
* edge between them is added with isRef and multiplicity
*
* @param sequence a sequence of bases from which we want to extract a pair of kmers
* @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 2 - getKmerSize()
* @param multiplicity what's the multiplicity of the edge between these two kmers
*/
public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final int multiplicity ) {
if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null");
if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start);
if ( start + 1 + getKmerSize() > sequence.length ) throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length);
final Kmer kmerPair = new Kmer(sequence, start, getKmerSize() + 1);
addKmerPair(kmerPair, multiplicity);
}
/**
* Add a single kmer pair to this builder
* @param kmerPair a kmer pair is a single kmer that has kmerSize + 1 bp, where 0 -> kmersize and 1 -> kmersize + 1
* will have an edge added to this
* @param multiplicity the desired multiplicity of this edge
*/
public void addKmerPair(final Kmer kmerPair, final int multiplicity) {
if ( kmerPair.length() != kmerSize + 1 ) throw new IllegalArgumentException("kmer pair must be of length kmerSize + 1 = " + kmerSize + 1 + " but got " + kmerPair.length());
counter.addKmer(kmerPair, multiplicity);
}
/**
* Flushes the currently added kmers to the graph
*
* After this function is called the builder is reset to an empty state
*
* This flushing is expensive, so many kmers should be added to the builder before flushing. The most
* efficient workflow is to add all of the kmers of a particular class (all ref bases, or all read bases)
* then and do one flush when completed
*
* @param addRefEdges should the kmers present in the builder be added to the graph with isRef = true for the edges?
*/
public void flushKmersToGraph(final boolean addRefEdges) {
for ( final KMerCounter.CountedKmer countedKmer : counter.getCountedKmers() ) {
final byte[] first = countedKmer.getKmer().subKmer(0, kmerSize).bases();
final byte[] second = countedKmer.getKmer().subKmer(1, kmerSize).bases();
graph.addKmersToGraph(first, second, addRefEdges, countedKmer.getCount());
}
counter.clear();
}
}

View File

@ -77,6 +77,8 @@ import org.broadinstitute.sting.utils.activeregion.ActivityProfileState;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.gvcf.GVCFWriter;
import org.broadinstitute.sting.utils.haplotype.*;
@ -236,22 +238,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@ArgumentCollection
private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
// -----------------------------------------------------------------------------------------------
// arguments to control internal behavior of the debruijn assembler
// -----------------------------------------------------------------------------------------------
@Advanced
@Argument(fullName="useDebruijnAssembler", shortName="useDebruijnAssembler", doc="If specified, we will use the old DeBruijn assembler. Depreciated as of 2.6", required = false)
protected boolean useDebruijnAssembler = false;
@Advanced
@Argument(fullName="minKmerForDebruijnAssembler", shortName="minKmerForDebruijnAssembler", doc="Minimum kmer length to use in the debruijn assembly graph", required = false)
protected int minKmerForDebruijnAssembler = 11;
@Advanced
@Argument(fullName="onlyUseKmerSizeForDebruijnAssembler", shortName="onlyUseKmerSizeForDebruijnAssembler", doc="If specified, we will only build kmer graphs with this kmer size in the debruijn", required = false)
protected int onlyUseKmerSizeForDebruijnAssembler = -1;
// -----------------------------------------------------------------------------------------------
// arguments to control internal behavior of the read threading assembler
// -----------------------------------------------------------------------------------------------
@ -349,7 +335,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// -----------------------------------------------------------------------------------------------
@Advanced
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with < X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 2;
@Advanced
@ -621,9 +607,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// create and setup the assembler
final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
assemblyEngine = useDebruijnAssembler
? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
: new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples);
assemblyEngine = new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples);
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
@ -870,7 +854,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
*/
protected AssemblyResult assembleReads(final ActiveRegion activeRegion, final List<VariantContext> activeAllelesToGenotype) {
// Create the reference haplotype which is the bases from the reference that make up the active region
finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails
finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails
final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING);
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
@ -1097,8 +1081,15 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
}
// TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code.
final List<GATKSAMRecord> downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart);
// handle overlapping read pairs from the same fragment
cleanOverlappingReadPairs(downsampledReads);
activeRegion.clearReads();
activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart));
activeRegion.addAll(downsampledReads);
}
private Set<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
@ -1138,7 +1129,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
return returnMap;
}
/**
* Are we emitting a reference confidence in some form, or not?
* @return true if we are
@ -1146,4 +1136,17 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
private boolean emitReferenceConfidence(){
return emitReferenceConfidence != ReferenceConfidenceMode.NONE;
}
/**
* Clean up reads/bases that overlap within read pairs
*
* @param reads the list of reads to consider
*/
private void cleanOverlappingReadPairs(final List<GATKSAMRecord> reads) {
for ( final List<GATKSAMRecord> perSampleReadList : splitReadsBySample(reads).values() ) {
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create(perSampleReadList);
for ( final List<GATKSAMRecord> overlappingPair : fragmentCollection.getOverlappingPairs() )
FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair);
}
}
}

View File

@ -77,7 +77,7 @@ public abstract class LocalAssemblyEngine {
* If false, we will only write out a region around the reference source
*/
private final static boolean PRINT_FULL_GRAPH_FOR_DEBUGGING = true;
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 10;
private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30;
protected final int numBestHaplotypesPerGraph;
@ -301,9 +301,7 @@ public abstract class LocalAssemblyEngine {
printDebugGraphTransform(seqGraph, new File("sequenceGraph.2.zipped.dot"));
// now go through and prune the graph, removing vertices no longer connected to the reference chain
// IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight
// edges to maintain graph connectivity.
seqGraph.pruneGraph(pruneFactor);
seqGraph.removeSingletonOrphanVertices();
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
printDebugGraphTransform(seqGraph, new File("sequenceGraph.3.pruned.dot"));

View File

@ -71,7 +71,7 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
private final int kmerSize;
/**
* Construct a DeBruijnGraph with kmerSize
* Construct a TestGraph with kmerSize
* @param kmerSize
*/
public BaseGraph(final int kmerSize, final EdgeFactory<V,E> edgeFactory) {
@ -472,28 +472,11 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
}
/**
* Prune all edges from this graph that have multiplicity <= pruneFactor and remove all orphaned singleton vertices as well
*
* @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed
*/
public void pruneGraph( final int pruneFactor ) {
final List<E> edgesToRemove = new ArrayList<>();
for( final E e : edgeSet() ) {
if( e.getPruningMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
edgesToRemove.add(e);
}
}
removeAllEdges(edgesToRemove);
removeSingletonOrphanVertices();
}
/**
* Prune all chains from this graph where all edges in the path have multiplicity <= pruneFactor
* Prune all chains from this graph where any edge in the path has multiplicity < pruneFactor
*
* @see LowWeightChainPruner for more information
*
* @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed
* @param pruneFactor all edges with multiplicity < this factor that aren't ref edges will be removed
*/
public void pruneLowWeightChains( final int pruneFactor ) {
final LowWeightChainPruner<V,E> pruner = new LowWeightChainPruner<>(pruneFactor);
@ -503,7 +486,7 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
/**
* Remove all vertices in the graph that have in and out degree of 0
*/
protected void removeSingletonOrphanVertices() {
public void removeSingletonOrphanVertices() {
// Run through the graph and clean up singular orphaned nodes
final List<V> verticesToRemove = new LinkedList<>();
for( final V v : vertexSet() ) {

View File

@ -96,7 +96,7 @@ public class LowWeightChainPruner<V extends BaseVertex, E extends BaseEdge> {
}
/**
* Traverse the edges in the path and determine if any are either ref edges or have weight above
* Traverse the edges in the path and determine if any are either ref edges or have weight above or equal to
* the pruning factor and should therefore not be pruned away.
*
* @param path the path in question

View File

@ -49,17 +49,16 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import com.google.java.contract.Ensures;
import org.jgrapht.EdgeFactory;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
* A DeBruijn kmer graph
* A Test kmer graph
*
* User: rpoplin
* Date: 2/6/13
*/
public final class DeBruijnGraph extends BaseGraph<DeBruijnVertex, BaseEdge> {
public final class TestGraph extends BaseGraph<DeBruijnVertex, BaseEdge> {
/**
* Edge factory that creates non-reference multiplicity 1 edges
*/
@ -71,33 +70,20 @@ public final class DeBruijnGraph extends BaseGraph<DeBruijnVertex, BaseEdge> {
}
/**
* Create an empty DeBruijnGraph with default kmer size
* Create an empty TestGraph with default kmer size
*/
public DeBruijnGraph() {
public TestGraph() {
this(11);
}
/**
* Create an empty DeBruijnGraph with kmer size
* Create an empty TestGraph with kmer size
* @param kmerSize kmer size, must be >= 1
*/
public DeBruijnGraph(int kmerSize) {
public TestGraph(int kmerSize) {
super(kmerSize, new MyEdgeFactory());
}
/**
* Pull kmers out of the given long sequence and throw them on in the graph
* @param sequence byte array holding the sequence with which to build the assembly graph
* @param KMER_LENGTH the desired kmer length to use
* @param isRef if true the kmers added to the graph will have reference edges linking them
*/
public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) {
if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); }
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef, 1);
}
}
/**
* Add edge to assembly graph connecting the two kmers
@ -129,7 +115,7 @@ public final class DeBruijnGraph extends BaseGraph<DeBruijnVertex, BaseEdge> {
@Ensures({"result != null"})
public SeqGraph convertToSequenceGraph() {
final SeqGraph seqGraph = new SeqGraph(getKmerSize());
final Map<DeBruijnVertex, SeqVertex> vertexMap = new HashMap<DeBruijnVertex, SeqVertex>();
final Map<DeBruijnVertex, SeqVertex> vertexMap = new HashMap<>();
// create all of the equivalent seq graph vertices
for ( final DeBruijnVertex dv : vertexSet() ) {

View File

@ -190,7 +190,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
// look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
// we can recover them by merging some N bases from the chain back into the reference
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(pruneFactor);
// remove all heading and trailing paths
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();

View File

@ -58,6 +58,7 @@ import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman;
import org.jgrapht.EdgeFactory;
import org.jgrapht.alg.CycleDetector;
@ -93,6 +94,9 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
private final static boolean WRITE_GRAPH = false;
private final static boolean DEBUG_NON_UNIQUE_CALC = false;
private final static int MAX_CIGAR_COMPLEXITY = 3;
private final static int MIN_DANGLING_TAIL_LENGTH = 5; // SNP + 3 stabilizing nodes + the LCA
/** for debugging info printing */
private static int counter = 0;
@ -276,13 +280,14 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
* Attempt to attach vertex with out-degree == 0 to the graph
*
* @param vertex the vertex to recover
* @param pruneFactor the prune factor to use in ignoring chain pieces
* @return 1 if we successfully recovered the vertex and 0 otherwise
*/
protected int recoverDanglingChain(final MultiDeBruijnVertex vertex) {
protected int recoverDanglingChain(final MultiDeBruijnVertex vertex, final int pruneFactor) {
if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0");
// generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths
final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex);
final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex, pruneFactor);
// if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path
if ( danglingTailMergeResult == null || ! cigarIsOkayToMerge(danglingTailMergeResult.cigar) )
@ -301,13 +306,14 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
protected boolean cigarIsOkayToMerge(final Cigar cigar) {
final List<CigarElement> elements = cigar.getCigarElements();
final int numElements = elements.size();
// don't allow more than a couple of different ops
if ( elements.size() > 3 )
if ( numElements > MAX_CIGAR_COMPLEXITY )
return false;
// the last element must be an M
if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.M )
if ( elements.get(numElements - 1).getOperator() != CigarOperator.M )
return false;
// TODO -- do we want to check whether the Ms mismatch too much also?
@ -334,7 +340,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
return 0;
final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0);
final int refIndexToMerge = lastRefIndex - matchingSuffix + 1;
final boolean firstElementIsDeletion = elements.get(0).getOperator() == CigarOperator.D;
final int refIndexToMerge = lastRefIndex - matchingSuffix + 1 + (firstElementIsDeletion ? 1 : 0); // need to push down if SW tells us to remove the LCA
addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1));
return 1;
}
@ -344,13 +351,14 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
* provided vertex is the sink) and the reference path.
*
* @param vertex the sink of the dangling tail
* @param pruneFactor the prune factor to use in ignoring chain pieces
* @return a SmithWaterman object which can be null if no proper alignment could be generated
*/
protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex) {
protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor) {
// find the lowest common ancestor path between vertex and the reference sink if available
final List<MultiDeBruijnVertex> altPath = findPathToLowestCommonAncestorOfReference(vertex);
if ( altPath == null || isRefSource(altPath.get(0)) )
final List<MultiDeBruijnVertex> altPath = findPathToLowestCommonAncestorOfReference(vertex, pruneFactor);
if ( altPath == null || isRefSource(altPath.get(0)) || altPath.size() < MIN_DANGLING_TAIL_LENGTH )
return null;
// now get the reference path from the LCA
@ -361,24 +369,32 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
final byte[] altBases = getBasesForPath(altPath);
// run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting)
final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWPairwiseAlignment.OVERHANG_STRATEGY.INDEL);
final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWParameterSet.STANDARD_NGS, SWPairwiseAlignment.OVERHANG_STRATEGY.LEADING_INDEL);
return new DanglingTailMergeResult(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar()));
}
/**
* Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex
* Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex.
* Note that nodes are excluded if their pruning weight is less than the pruning factor.
*
* @param vertex the original vertex
* @param pruneFactor the prune factor to use in ignoring chain pieces
* @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or
* has an ancestor with multiple incoming edges before hitting the reference path
*/
protected List<MultiDeBruijnVertex> findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex) {
protected List<MultiDeBruijnVertex> findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor) {
final LinkedList<MultiDeBruijnVertex> path = new LinkedList<>();
MultiDeBruijnVertex v = vertex;
while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) {
path.addFirst(v);
v = getEdgeSource(incomingEdgeOf(v));
final MultiSampleEdge edge = incomingEdgeOf(v);
// if it has too low a weight, don't use it (or previous vertexes) for the path
if ( edge.getPruningMultiplicity() < pruneFactor )
path.clear();
// otherwise it is safe to use
else
path.addFirst(v);
v = getEdgeSource(edge);
}
path.addFirst(v);
@ -453,7 +469,12 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
return nonUniqueKmers.size() * 4 > uniqueKmers.size();
}
public void recoverDanglingTails() {
/**
* Try to recover dangling tails
*
* @param pruneFactor the prune factor to use in ignoring chain pieces
*/
public void recoverDanglingTails(final int pruneFactor) {
if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
int attempted = 0;
@ -461,7 +482,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
for ( final MultiDeBruijnVertex v : vertexSet() ) {
if ( outDegreeOf(v) == 0 && ! isRefNodeAndRefSink(v) ) {
attempted++;
nRecovered += recoverDanglingChain(v);
nRecovered += recoverDanglingChain(v, pruneFactor);
}
}
@ -740,13 +761,12 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
// the first good base is at lastGood, can be -1 if last base was bad
final int start = lastGood;
// the stop base is end - 1 (if we're not at the end of the sequence)
final int stop = end == sequence.length ? sequence.length : end;
final int len = stop - start + 1;
final int len = end - start;
if ( start != -1 && len >= kmerSize ) {
// if the sequence is long enough to get some value out of, add it to the graph
final String name = read.getReadName() + "_" + start + "_" + end;
addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, stop, reducedReadCounts, false);
addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, end, reducedReadCounts, false);
}
lastGood = -1; // reset the last good base

View File

@ -59,6 +59,9 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
protected static final double INITIAL_CONDITION = Math.pow(2, 1020);
protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION);
// we divide e by 3 because the observed base could have come from any of the non-observed alleles
protected static final double TRISTATE_CORRECTION = 3.0;
private static final int matchToMatch = 0;
private static final int indelToMatch = 1;
private static final int matchToInsertion = 2;
@ -146,7 +149,7 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) );
}
}
}

View File

@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "98f4d78aad745c6e853b81b2e4e207b4");
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "4dd1b38f0389e339ce8a05956956aa8a");
}
}

View File

@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","25902d7a6a0c00c60c2d5845dfaa1a4c");
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","39f559996f8d429839c585bbab68dbde");
}
@Test(enabled = true)

View File

@ -56,8 +56,8 @@ import java.util.List;
public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
private final static String baseCommandIndels = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
private final static String baseCommandIndels = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
// --------------------------------------------------------------------------------------------------------------
//
@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("ef8151aa699da3272c1ae0986d16ca21"));
Arrays.asList("3c8727ee6e2a6f10ab728c4869dd5b92"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -88,7 +88,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("7f88229ccefb74513efb199b61183cb8"));
Arrays.asList("0cbe889e03bab6512680ecaebd52c536"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("1928ad48bcd0ca180e046bc235cfb3f4"));
Arrays.asList("c6f0fa039ca5672469838bc9f52c72d3"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -111,7 +111,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("6663e434a7b549f23bfd52db90e53a1a"));
Arrays.asList("475f8148123792064130faf9f9030fec"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -121,7 +121,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("581c552664e536df6d0f102fb0d10e5a"));
Arrays.asList("a7e4e1bd128424d46cffdd538b220074"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("5596851d19582dd1af3901b7d703ae0a"));
Arrays.asList("8682738c2c66b502cdbf7db466a5c3e2"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("862d82c8aa35f1da4f9e67b5b48dfe52"));
Arrays.asList("d3721bee5edaa31fdd35edd7aa75feb3"));
executeTest("test minIndelFraction 0.0", spec);
}
@ -184,7 +184,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("8d9fc96be07db791737ac18135de4d63"));
Arrays.asList("a5b6d7b32953500d936d3dff512a6254"));
executeTest("test minIndelFraction 0.25", spec);
}

View File

@ -64,8 +64,8 @@ import java.util.Collections;
public class UnifiedGenotyperIntegrationTest extends WalkerTest {
private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam";
private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam";
// --------------------------------------------------------------------------------------------------------------
//
@ -85,7 +85,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("4aa226c00a242047cf427d0919003048"));
Arrays.asList("bc8a4e4ceb46776169b47146805c882a"));
executeTest("test SLOD", spec);
}
@ -101,7 +101,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testCompTrack() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("50937942e3d228614d2531c3be237709"));
Arrays.asList("21185d9a7519356ba672757f5a522971"));
executeTest("test using comp track", spec);
}
@ -175,12 +175,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "3b66f82dbb746875638e076bf51a1583" );
testHeterozosity( 0.01, "2f3051caa785c7c1e2a8b23fa4da90b1" );
}
@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "714c1795334c7c62c046a75479381ae6" );
testHeterozosity( 1.0 / 1850, "228df9e38580d8ffe1134da7449fa35e" );
}
private void testHeterozosity(final double arg, final String md5) {
@ -196,7 +196,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "6f79205f7ed8006470f056f6805db6c8";
private final static String COMPRESSED_OUTPUT_MD5 = "eebec02fdde9937bffaf44902ace6207";
@Test
public void testCompressedOutput() {
@ -217,24 +217,25 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
String md5 = "d408b4661b820ed86272415b8ea08780";
String md5 = "1f3fad09a63269c36e871e7ee04ebfaa";
final String myCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
Arrays.asList(md5));
executeTest("test parallelization (single thread)", spec1);
GenomeAnalysisEngine.resetRandomGenerator();
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1,
myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1,
Arrays.asList(md5));
executeTest("test parallelization (2 threads)", spec2);
GenomeAnalysisEngine.resetRandomGenerator();
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1,
myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1,
Arrays.asList(md5));
executeTest("test parallelization (4 threads)", spec3);
}
@ -252,7 +253,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("31be725b2a7c15e9769391ad940c0587"));
Arrays.asList("9f4e663e3b156b14fd55df3f5f0336a5"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -271,7 +272,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("dcc5cec42730567982def16da4a7f286"));
Arrays.asList("260bb73e2900334d5c3ff8123be0d2d8"));
executeTest(String.format("test calling with BAQ"), spec);
}

View File

@ -53,7 +53,7 @@ import java.util.Arrays;
public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
// --------------------------------------------------------------------------------------------------------------
//
@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("a9466c1e3ce1fc4bac83086b25a6df54"));
Arrays.asList("7f26ca78e550afa28df11d593c90ec9a"));
executeTest("test MultiSample Pilot1", spec);
}
@ -88,22 +88,22 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("aaadb2a355d87344eabb6ac4495a11e4"));
Arrays.asList("02b521fe88a6606a29c12c0885c3debd"));
executeTest("test SingleSample Pilot2", spec);
}
@Test
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("06c85e8eab08b67244cf38fc785aca22"));
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("dd5ad3beaa75319bb2ef1434d2dd9f73"));
executeTest("test Multiple SNP alleles", spec);
}
@Test
public void testBadRead() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1,
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1,
Arrays.asList("d915535c1458733f09f82670092fcab6"));
executeTest("test bad read", spec);
}
@ -111,16 +111,16 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
@Test
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("f3da1ff1e49a831af055ca52d6d07dd7"));
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("a973298b2801b80057bea88507e2858d"));
executeTest("test reverse trim", spec);
}
@Test
public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("20ff311f363c51b7385a76f6f296759c"));
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("8d91d98c4e79897690d3c6918b6ac761"));
executeTest("test mismatched PLs", spec);
}
}

View File

@ -62,7 +62,7 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
@Test
public void testReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("ffde0d5e23523e4bd9e7e18f62d37d0f"));
executeTest("test calling on a ReducedRead BAM", spec);
}
@ -74,13 +74,13 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
@Test
public void testReducedBamINDELs() {
testReducedCalling("INDEL", "4b4902327fb132f9aaab3dd5ace934e1");
testReducedCalling("INDEL", "942930038cf7fc9a80b969461aaa9aa6");
}
private void testReducedCalling(final String model, final String md5) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1,
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1,
Arrays.asList(md5));
executeTest("test calling on a ReducedRead BAM with " + model, spec);
}

View File

@ -1,199 +0,0 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 3/27/12
*/
import net.sf.samtools.*;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
public class DeBruijnAssemblerUnitTest extends BaseTest {
private final static boolean DEBUG = false;
@Test(enabled = !DEBUG)
public void testReferenceCycleGraph() {
String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC";
String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC";
final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true), Collections.<Haplotype>emptyList());
final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true), Collections.<Haplotype>emptyList());
Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation.");
Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation.");
}
private static class MockBuilder extends DeBruijnGraphBuilder {
public final List<Kmer> addedPairs = new LinkedList<Kmer>();
private MockBuilder(final int kmerSize) {
super(new DeBruijnGraph(kmerSize));
}
@Override
public void addKmerPair(Kmer kmerPair, int multiplicity) {
logger.info("addKmerPair" + kmerPair);
addedPairs.add(kmerPair);
}
@Override
public void flushKmersToGraph(boolean addRefEdges) {
// do nothing
}
}
@DataProvider(name = "AddReadKmersToGraph")
public Object[][] makeAddReadKmersToGraphData() {
List<Object[]> tests = new ArrayList<Object[]>();
// this functionality can be adapted to provide input data for whatever you might want in your data
final String bases = "ACGTAACCGGTTAAACCCGGGTTT";
final int readLen = bases.length();
final List<Integer> allBadStarts = new ArrayList<Integer>(readLen);
for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i);
for ( final int kmerSize : Arrays.asList(3, 4, 5) ) {
for ( final int nBadQuals : Arrays.asList(0, 1, 2) ) {
for ( final List<Integer> badStarts : Utils.makePermutations(allBadStarts, nBadQuals, false) ) {
tests.add(new Object[]{bases, kmerSize, badStarts});
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "AddReadKmersToGraph", enabled = ! DEBUG)
public void testAddReadKmersToGraph(final String bases, final int kmerSize, final List<Integer> badQualsSites) {
final int readLen = bases.length();
final DeBruijnAssembler assembler = new DeBruijnAssembler();
final MockBuilder builder = new MockBuilder(kmerSize);
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
final byte[] quals = Utils.dupBytes((byte)20, bases.length());
for ( final int badSite : badQualsSites ) quals[badSite] = 0;
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLen);
read.setReadBases(bases.getBytes());
read.setBaseQualities(quals);
final Set<String> expectedBases = new HashSet<String>();
final Set<Integer> expectedStarts = new LinkedHashSet<Integer>();
for ( int i = 0; i < readLen; i++) {
boolean good = true;
for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing
good &= i + j < readLen && quals[i+j] >= assembler.getMinBaseQualityToUseInAssembly();
}
if ( good ) {
expectedStarts.add(i);
expectedBases.add(bases.substring(i, i + kmerSize + 1));
}
}
assembler.addReadKmersToGraph(builder, Arrays.asList(read));
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
for ( final Kmer addedKmer : builder.addedPairs ) {
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
}
}
@DataProvider(name = "AddGGAKmersToGraph")
public Object[][] makeAddGGAKmersToGraphData() {
List<Object[]> tests = new ArrayList<Object[]>();
// this functionality can be adapted to provide input data for whatever you might want in your data
final String bases = "ACGTAACCGGTTAAACCCGGGTTT";
final int readLen = bases.length();
final List<Integer> allBadStarts = new ArrayList<Integer>(readLen);
for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i);
for ( final int kmerSize : Arrays.asList(3, 4, 5) ) {
tests.add(new Object[]{bases, kmerSize});
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "AddGGAKmersToGraph", enabled = ! DEBUG)
public void testAddGGAKmersToGraph(final String bases, final int kmerSize) {
final int readLen = bases.length();
final DeBruijnAssembler assembler = new DeBruijnAssembler();
final MockBuilder builder = new MockBuilder(kmerSize);
final Set<String> expectedBases = new HashSet<String>();
final Set<Integer> expectedStarts = new LinkedHashSet<Integer>();
for ( int i = 0; i < readLen; i++) {
boolean good = true;
for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing
good &= i + j < readLen;
}
if ( good ) {
expectedStarts.add(i);
expectedBases.add(bases.substring(i, i + kmerSize + 1));
}
}
assembler.addGGAKmersToGraph(builder, Arrays.asList(new Haplotype(bases.getBytes())));
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
for ( final Kmer addedKmer : builder.addedPairs ) {
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
}
}
}

View File

@ -1,124 +0,0 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
* Date: 2/8/13
*/
public class DeBruijnAssemblyGraphUnitTest {
private class GetReferenceBytesTestProvider extends BaseTest.TestDataProvider {
public byte[] refSequence;
public byte[] altSequence;
public int KMER_LENGTH;
public GetReferenceBytesTestProvider(String ref, String alt, int kmer) {
super(GetReferenceBytesTestProvider.class, String.format("Testing reference bytes. kmer = %d, ref = %s, alt = %s", kmer, ref, alt));
refSequence = ref.getBytes();
altSequence = alt.getBytes();
KMER_LENGTH = kmer;
}
public byte[] expectedReferenceBytes() {
return refSequence;
}
public byte[] calculatedReferenceBytes() {
DeBruijnGraph graph = new DeBruijnGraph();
graph.addSequenceToGraph(refSequence, KMER_LENGTH, true);
if( altSequence.length > 0 ) {
graph.addSequenceToGraph(altSequence, KMER_LENGTH, false);
}
return graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true);
}
}
@DataProvider(name = "GetReferenceBytesTestProvider")
public Object[][] GetReferenceBytesTests() {
new GetReferenceBytesTestProvider("GGTTAACC", "", 3);
new GetReferenceBytesTestProvider("GGTTAACC", "", 4);
new GetReferenceBytesTestProvider("GGTTAACC", "", 5);
new GetReferenceBytesTestProvider("GGTTAACC", "", 6);
new GetReferenceBytesTestProvider("GGTTAACC", "", 7);
new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "", 6);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "", 66);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "", 76);
new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 3);
new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 4);
new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 5);
new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 6);
new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 7);
new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76);
new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 3);
new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 4);
new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 5);
new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 6);
new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 7);
new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "AAAAAAAAAAAAA", 6);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 66);
new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 76);
return GetReferenceBytesTestProvider.getTests(GetReferenceBytesTestProvider.class);
}
@Test(dataProvider = "GetReferenceBytesTestProvider", enabled = true)
public void testGetReferenceBytes(GetReferenceBytesTestProvider cfg) {
Assert.assertEquals(cfg.calculatedReferenceBytes(), cfg.expectedReferenceBytes(), "Reference sequences do not match");
}
}

View File

@ -57,14 +57,14 @@ import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCal
public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest {
private void HCTestComplexVariants(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
}
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "12ed9d67139e7a94d67e9e6c06ac6e16");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "df7be117bd3d256c4a5fbde925ecd19b");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"b7a01525c00d02b3373513a668a43c6a");
"b787be740423b950f8529ccc838fabdd");
}
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"a2a42055b068334f415efb07d6bb9acd");
"8e6a2002c59eafb78bdbf1db9660164b");
}
}

View File

@ -63,12 +63,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "2b54e4e948144030a829175bcd295e47"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ba1bb72caa06c1962a202b2012c266cb"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a841d9e94fb832066a04f13bdc62b101"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "6cc95c47368a568fb9e1eb8578f96b0b"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2703f1c0c27b3c977689604b5f78b61f"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b54e36bbb4dc6c3b786349fa267d1f6c"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "55faaae5617857e2b29848367999aa3e"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "e32b7fc4de29ed141dcafc0d789d5ed6"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "ecac86e8ef4856e6dfa306c436e9b545"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "7828256b82df377cc3a26a55dbf68f91"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "e41e0acf172a994e938a150390badd39"});
return tests.toArray(new Object[][]{});

View File

@ -71,19 +71,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
private void HCTest(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCaller: args=" + args, spec);
}
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "baa5a2eedc8f06ce9f8f98411ee09f8a");
HCTest(CEUTRIO_BAM, "", "c0b1b64c6005cd3640ffde5dbc10174b");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "f09e03d41238697b23f95716a12667cb");
HCTest(NA12878_BAM, "", "439ce9024f04aad08eab1526d887e295");
}
@Test(enabled = false) // can't annotate the rsID's yet
@ -94,7 +94,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"130d36448faeb7b8d4bce4be12dacd3a");
"b09437f11db40abd49195110e50692c2");
}
@Test
@ -110,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "7c20aa62633f4ce8ebf12950fbf05ec0");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "c57c463542304fb7b2576e531faca89e");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@ -147,7 +147,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerNearbySmallIntervals() {
HCTestNearbySmallIntervals(NA12878_BAM, "", "0ddc56f0a0fbcfefda79aa20b2ecf603");
HCTestNearbySmallIntervals(NA12878_BAM, "", "75820a4558a559b3e1636fdd1b776ea2");
}
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -157,7 +157,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@ -185,16 +185,16 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("5fe9310addf881bed4fde2354e59ce34"));
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@Test
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("26a9917f6707536636451266de0116c3"));
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("6a9222905c740b9208bf3c67478514eb"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
@ -208,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("c5c63d03e1c4bbe32f06902acd4a10f9"));
Arrays.asList("58a0089e6ebf7cee414adb7a6002d43f"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -217,7 +217,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("f0b2a96040429908cce17327442eec29"));
Arrays.asList("1352cbe1404aefc94eb8e044539a9882"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
}

View File

@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
List<Object[]> tests = new ArrayList<Object[]>();
for ( final int nct : Arrays.asList(1, 2, 4) ) {
tests.add(new Object[]{nct, "e800f6bb3a820da5c6b29f0195480796"});
tests.add(new Object[]{nct, "6f8c3cac54eb1460e2c65fe00978b1c1"});
}
return tests.toArray(new Object[][]{});

View File

@ -87,15 +87,6 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
}
private enum Assembler {DEBRUIJN_ASSEMBLER, READ_THREADING_ASSEMBLER}
private LocalAssemblyEngine createAssembler(final Assembler type) {
switch ( type ) {
case DEBRUIJN_ASSEMBLER: return new DeBruijnAssembler();
case READ_THREADING_ASSEMBLER: return new ReadThreadingAssembler();
default: throw new IllegalStateException("Unexpected " + type);
}
}
@DataProvider(name = "AssembleIntervalsData")
public Object[][] makeAssembleIntervalsData() {
List<Object[]> tests = new ArrayList<Object[]>();
@ -107,12 +98,10 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
final int stepSize = 200;
final int nReadsToUse = 5;
for ( final Assembler assembler : Assembler.values() ) {
for ( int startI = start; startI < end; startI += stepSize) {
final int endI = startI + windowSize;
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI);
tests.add(new Object[]{assembler, refLoc, nReadsToUse});
}
for ( int startI = start; startI < end; startI += stepSize) {
final int endI = startI + windowSize;
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI);
tests.add(new Object[]{new ReadThreadingAssembler(), refLoc, nReadsToUse});
}
return tests.toArray(new Object[][]{});
@ -130,13 +119,11 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
final int variantStepSize = 1;
final int nReadsToUse = 5;
for ( final Assembler assembler : Assembler.values() ) {
for ( int startI = start; startI < end; startI += stepSize) {
final int endI = startI + windowSize;
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI);
for ( int variantStart = windowSize / 2 - 10; variantStart < windowSize / 2 + 10; variantStart += variantStepSize ) {
tests.add(new Object[]{assembler, refLoc, nReadsToUse, variantStart});
}
for ( int startI = start; startI < end; startI += stepSize) {
final int endI = startI + windowSize;
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI);
for ( int variantStart = windowSize / 2 - 10; variantStart < windowSize / 2 + 10; variantStart += variantStepSize ) {
tests.add(new Object[]{new ReadThreadingAssembler(), refLoc, nReadsToUse, variantStart});
}
}
@ -144,7 +131,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
@Test(dataProvider = "AssembleIntervalsData")
public void testAssembleRef(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse) {
public void testAssembleRef(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse) {
final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
final List<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
@ -163,7 +150,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
@Test(dataProvider = "AssembleIntervalsWithVariantData")
public void testAssembleRefAndSNP(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
public void testAssembleRefAndSNP(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
final Allele refBase = Allele.create(refBases[variantSite], true);
final Allele altBase = Allele.create((byte)(refBase.getBases()[0] == 'A' ? 'C' : 'A'), false);
@ -172,7 +159,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
@Test(dataProvider = "AssembleIntervalsWithVariantData")
public void testAssembleRefAndDeletion(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
public void testAssembleRefAndDeletion(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
for ( int deletionLength = 1; deletionLength < 10; deletionLength++ ) {
final Allele refBase = Allele.create(new String(refBases).substring(variantSite, variantSite + deletionLength + 1), true);
@ -183,7 +170,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
@Test(dataProvider = "AssembleIntervalsWithVariantData")
public void testAssembleRefAndInsertion(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
public void testAssembleRefAndInsertion(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) {
final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
for ( int insertionLength = 1; insertionLength < 10; insertionLength++ ) {
final Allele refBase = Allele.create(refBases[variantSite], false);
@ -193,7 +180,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
}
private void testAssemblyWithVariant(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final int nReadsToUse, final VariantContext site) {
private void testAssemblyWithVariant(final ReadThreadingAssembler assembler, final byte[] refBases, final GenomeLoc loc, final int nReadsToUse, final VariantContext site) {
final String preRef = new String(refBases).substring(0, site.getStart());
final String postRef = new String(refBases).substring(site.getEnd() + 1, refBases.length);
final byte[] altBases = (preRef + site.getAlternateAllele(0).getBaseString() + postRef).getBytes();
@ -217,7 +204,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
}
private List<Haplotype> assemble(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final List<GATKSAMRecord> reads) {
private List<Haplotype> assemble(final ReadThreadingAssembler assembler, final byte[] refBases, final GenomeLoc loc, final List<GATKSAMRecord> reads) {
final Haplotype refHaplotype = new Haplotype(refBases, true);
final Cigar c = new Cigar();
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
@ -225,9 +212,8 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
final ActiveRegion activeRegion = new ActiveRegion(loc, null, true, genomeLocParser, 0);
activeRegion.addAll(reads);
final LocalAssemblyEngine engine = createAssembler(assembler);
// logger.warn("Assembling " + activeRegion + " with " + engine);
return engine.runLocalAssembly(activeRegion, refHaplotype, refBases, loc, Collections.<VariantContext>emptyList(), null);
return assembler.runLocalAssembly(activeRegion, refHaplotype, refBases, loc, Collections.<VariantContext>emptyList(), null);
}
@DataProvider(name = "SimpleAssemblyTestData")
@ -239,30 +225,25 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
final int windowSize = 200;
final int end = start + windowSize;
final Map<Assembler, Integer> edgeExcludesByAssembler = new EnumMap<>(Assembler.class);
edgeExcludesByAssembler.put(Assembler.DEBRUIJN_ASSEMBLER, 26);
edgeExcludesByAssembler.put(Assembler.READ_THREADING_ASSEMBLER, 25); // TODO -- decrease to zero when the edge calling problem is fixed
final int excludeVariantsWithinXbp = 25; // TODO -- decrease to zero when the edge calling problem is fixed
final String ref = new String(seq.getSubsequenceAt(contig, start, end).getBases());
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, start, end);
for ( final Assembler assembler : Assembler.values() ) {
final int excludeVariantsWithXbp = edgeExcludesByAssembler.get(assembler);
for ( int snpPos = 0; snpPos < windowSize; snpPos++) {
if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) {
if ( snpPos > excludeVariantsWithinXbp && (windowSize - snpPos) >= excludeVariantsWithinXbp ) {
final byte[] altBases = ref.getBytes();
altBases[snpPos] = altBases[snpPos] == 'A' ? (byte)'C' : (byte)'A';
final String alt = new String(altBases);
tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt});
tests.add(new Object[]{"SNP at " + snpPos, new ReadThreadingAssembler(), refLoc, ref, alt});
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "SimpleAssemblyTestData")
public void testSimpleAssembly(final String name, final Assembler assembler, final GenomeLoc loc, final String ref, final String alt) {
public void testSimpleAssembly(final String name, final ReadThreadingAssembler assembler, final GenomeLoc loc, final String ref, final String alt) {
final byte[] refBases = ref.getBytes();
final byte[] altBases = alt.getBytes();

View File

@ -49,7 +49,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
@ -231,7 +230,7 @@ public class BaseGraphUnitTest extends BaseTest {
final File tmp = File.createTempFile("tmp", "dot");
tmp.deleteOnExit();
new SeqGraph().printGraph(tmp, 10);
new DeBruijnGraph().printGraph(tmp, 10);
new TestGraph().printGraph(tmp, 10);
}
@Test
@ -248,71 +247,6 @@ public class BaseGraphUnitTest extends BaseTest {
Assert.assertEquals(actualSet, expectedSet);
}
@Test(enabled = true)
public void testPruneGraph() {
DeBruijnGraph graph = new DeBruijnGraph();
DeBruijnGraph expectedGraph = new DeBruijnGraph();
DeBruijnVertex v = new DeBruijnVertex("ATGG");
DeBruijnVertex v2 = new DeBruijnVertex("ATGGA");
DeBruijnVertex v3 = new DeBruijnVertex("ATGGT");
DeBruijnVertex v4 = new DeBruijnVertex("ATGGG");
DeBruijnVertex v5 = new DeBruijnVertex("ATGGC");
DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC");
graph.addVertex(v);
graph.addVertex(v2);
graph.addVertex(v3);
graph.addVertex(v4);
graph.addVertex(v5);
graph.addVertex(v6);
graph.addEdge(v, v2, new BaseEdge(false, 1));
graph.addEdge(v2, v3, new BaseEdge(false, 3));
graph.addEdge(v3, v4, new BaseEdge(false, 5));
graph.addEdge(v4, v5, new BaseEdge(false, 3));
graph.addEdge(v5, v6, new BaseEdge(false, 2));
expectedGraph.addVertex(v2);
expectedGraph.addVertex(v3);
expectedGraph.addVertex(v4);
expectedGraph.addVertex(v5);
expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3));
expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5));
expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3));
graph.pruneGraph(2);
Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
graph = new DeBruijnGraph();
expectedGraph = new DeBruijnGraph();
graph.addVertex(v);
graph.addVertex(v2);
graph.addVertex(v3);
graph.addVertex(v4);
graph.addVertex(v5);
graph.addVertex(v6);
graph.addEdge(v, v2, new BaseEdge(true, 1));
graph.addEdge(v2, v3, new BaseEdge(false, 3));
graph.addEdge(v3, v4, new BaseEdge(false, 5));
graph.addEdge(v4, v5, new BaseEdge(false, 3));
expectedGraph.addVertex(v);
expectedGraph.addVertex(v2);
expectedGraph.addVertex(v3);
expectedGraph.addVertex(v4);
expectedGraph.addVertex(v5);
expectedGraph.addEdge(v, v2, new BaseEdge(true, 1));
expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3));
expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5));
expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3));
graph.pruneGraph(2);
Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
}
@Test(enabled = true)
public void testGetBases() {
@ -324,7 +258,7 @@ public class BaseGraphUnitTest extends BaseTest {
vertexes.add(new DeBruijnVertex(testString.substring(i, i + kmerSize)));
}
final String result = new String(new DeBruijnGraph().getBasesForPath(vertexes));
final String result = new String(new TestGraph().getBasesForPath(vertexes));
Assert.assertEquals(result, testString.substring(kmerSize - 1));
}
}

View File

@ -72,7 +72,7 @@ public class SeqGraphUnitTest extends BaseTest {
}
public SeqGraph calcGraph() {
final DeBruijnGraph deBruijnGraph = new DeBruijnGraph();
final TestGraph deBruijnGraph = new TestGraph();
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for (int i = 0; i < kmersInSequence - 1; i++) {
// get the kmers

View File

@ -212,8 +212,8 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion
tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp
tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // several snps
tests.add(new Object[]{"AAAAA", "C", "1M4D1M", true, -1}); // funky SW alignment
tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", true, 1}); // very little data
tests.add(new Object[]{"AAAAA", "C", "1M4D1M", false, -1}); // funky SW alignment
tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", false, 1}); // very little data
tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch
tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex
@ -253,7 +253,13 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
Assert.assertTrue(altSink != null, "We did not find a non-reference sink");
// confirm that the SW alignment agrees with our expectations
final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink);
final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink, 0);
if ( result == null ) {
Assert.assertFalse(cigarIsGood);
return;
}
Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString());
// confirm that the goodness of the cigar agrees with our expectations

View File

@ -67,7 +67,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
for ( final int nct : Arrays.asList(1, 2) ) {
// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
tests.add(new Object[]{ "BOTH", "aad3a398273ec795e363268997247bd8", nt, nct });
tests.add(new Object[]{ "BOTH", "a80925b58735828158491f77ae64998b", nt, nct });
}
return tests.toArray(new Object[][]{});

View File

@ -56,6 +56,7 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@ -68,11 +69,18 @@ public class PairHMMUnitTest extends BaseTest {
private final static boolean ALLOW_READS_LONGER_THAN_HAPLOTYPE = true;
private final static boolean DEBUG = false;
final static boolean EXTENSIVE_TESTING = true;
final PairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation
final PairHMM originalHMM = new Log10PairHMM(false); // the reference implementation
final PairHMM loglessHMM = new LoglessPairHMM();
final N2MemoryPairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation
final N2MemoryPairHMM originalHMM = new Log10PairHMM(false); // the reference implementation
final N2MemoryPairHMM loglessHMM = new LoglessPairHMM();
private List<PairHMM> getHMMs() {
@BeforeClass
public void initialize() {
exactHMM.doNotUseTristateCorrection();
originalHMM.doNotUseTristateCorrection();
loglessHMM.doNotUseTristateCorrection();
}
private List<N2MemoryPairHMM> getHMMs() {
return Arrays.asList(exactHMM, originalHMM, loglessHMM);
}
@ -592,8 +600,13 @@ public class PairHMMUnitTest extends BaseTest {
public Object[][] makeUninitializedHMMs() {
List<Object[]> tests = new ArrayList<Object[]>();
tests.add(new Object[]{new LoglessPairHMM()});
tests.add(new Object[]{new Log10PairHMM(true)});
final LoglessPairHMM myLoglessPairHMM = new LoglessPairHMM();
myLoglessPairHMM.doNotUseTristateCorrection();
tests.add(new Object[]{myLoglessPairHMM});
final Log10PairHMM myLog10PairHMM = new Log10PairHMM(true);
myLog10PairHMM.doNotUseTristateCorrection();
tests.add(new Object[]{myLog10PairHMM});
return tests.toArray(new Object[][]{});
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.fragments;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.picard.util.QualityUtil;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
@ -60,6 +61,11 @@ import java.util.*;
* Time: 10:09 PM
*/
public final class FragmentUtils {
public final static double DEFAULT_PCR_ERROR_RATE = 1e-4;
public final static int DEFAULT_PCR_ERROR_QUAL = QualityUtil.getPhredScoreFromErrorProbability(DEFAULT_PCR_ERROR_RATE);
public final static int HALF_OF_DEFAULT_PCR_ERROR_QUAL = DEFAULT_PCR_ERROR_QUAL / 2;
protected final static byte MIN_QUAL_BAD_OVERLAP = 16;
private FragmentUtils() {} // private constructor
@ -189,6 +195,70 @@ public final class FragmentUtils {
return create(reads, reads.size(), SamRecordGetter);
}
public static void adjustQualsOfOverlappingPairedFragments( final List<GATKSAMRecord> overlappingPair ) {
if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); }
final GATKSAMRecord firstRead = overlappingPair.get(0);
final GATKSAMRecord secondRead = overlappingPair.get(1);
if ( secondRead.getSoftStart() < firstRead.getSoftStart() ) {
adjustQualsOfOverlappingPairedFragments(secondRead, firstRead);
} else {
adjustQualsOfOverlappingPairedFragments(firstRead, secondRead);
}
}
/**
* Merge two overlapping reads from the same fragment into a single super read, if possible
*
* firstRead and secondRead must be part of the same fragment (though this isn't checked). Looks
* at the bases and alignment, and tries its best to create a meaningful synthetic single super read
* that represents the entire sequenced fragment.
*
* Assumes that firstRead starts before secondRead (according to their soft clipped starts)
*
* @param clippedFirstRead the left most read
* @param clippedSecondRead the right most read
*
* @return a strandless merged read of first and second, or null if the algorithm cannot create a meaningful one
*/
public static void adjustQualsOfOverlappingPairedFragments(final GATKSAMRecord clippedFirstRead, final GATKSAMRecord clippedSecondRead) {
if ( clippedFirstRead == null ) throw new IllegalArgumentException("clippedFirstRead cannot be null");
if ( clippedSecondRead == null ) throw new IllegalArgumentException("clippedSecondRead cannot be null");
if ( ! clippedFirstRead.getReadName().equals(clippedSecondRead.getReadName()) ) throw new IllegalArgumentException("attempting to merge two reads with different names " + clippedFirstRead + " and " + clippedSecondRead);
// don't adjust fragments that do not overlap
if ( clippedFirstRead.getAlignmentEnd() < clippedSecondRead.getAlignmentStart() || clippedFirstRead.getReferenceIndex() != clippedSecondRead.getReferenceIndex() )
return;
final Pair<Integer, Boolean> pair = ReadUtils.getReadCoordinateForReferenceCoordinate(clippedFirstRead, clippedSecondRead.getAlignmentStart());
final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() );
final int numOverlappingBases = Math.min(clippedFirstRead.getReadLength() - firstReadStop, clippedSecondRead.getReadLength());
final byte[] firstReadBases = clippedFirstRead.getReadBases();
final byte[] firstReadQuals = clippedFirstRead.getBaseQualities();
final byte[] secondReadBases = clippedSecondRead.getReadBases();
final byte[] secondReadQuals = clippedSecondRead.getBaseQualities();
for ( int i = 0; i < numOverlappingBases; i++ ) {
final int firstReadIndex = firstReadStop + i;
final byte firstReadBase = firstReadBases[firstReadIndex];
final byte secondReadBase = secondReadBases[i];
if ( firstReadBase == secondReadBase ) {
firstReadQuals[firstReadIndex] = (byte) Math.min(firstReadQuals[firstReadIndex], HALF_OF_DEFAULT_PCR_ERROR_QUAL);
secondReadQuals[i] = (byte) Math.min(secondReadQuals[i], HALF_OF_DEFAULT_PCR_ERROR_QUAL);
} else {
// TODO -- use the proper statistical treatment of the quals from DiploidSNPGenotypeLikelihoods.java
firstReadQuals[firstReadIndex] = 0;
secondReadQuals[i] = 0;
}
}
clippedFirstRead.setBaseQualities(firstReadQuals);
clippedSecondRead.setBaseQualities(secondReadQuals);
}
public static List<GATKSAMRecord> mergeOverlappingPairedFragments( final List<GATKSAMRecord> overlappingPair ) {
if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); }

View File

@ -32,6 +32,8 @@ import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays;
import static java.lang.Math.log10;
/**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
*
@ -51,6 +53,9 @@ public final class Log10PairHMM extends N2MemoryPairHMM {
private static final int matchToDeletion = 4;
private static final int deletionToDeletion = 5;
// we divide e by 3 because the observed base could have come from any of the non-observed alleles
protected final static double log10_3 = log10(3.0);
/**
* Create an uninitialized PairHMM
*
@ -148,7 +153,7 @@ public final class Log10PairHMM extends N2MemoryPairHMM {
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
QualityUtils.qualToProbLog10(qual) : (QualityUtils.qualToErrorProbLog10(qual) - (doNotUseTristateCorrection ? 0.0 : log10_3)) );
}
}
}

View File

@ -44,6 +44,10 @@ abstract class N2MemoryPairHMM extends PairHMM {
protected double[][] insertionMatrix = null;
protected double[][] deletionMatrix = null;
// only used for debugging purposes
protected boolean doNotUseTristateCorrection = false;
protected void doNotUseTristateCorrection() { doNotUseTristateCorrection = true; }
/**
* Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths
*

View File

@ -69,10 +69,20 @@ public class SWPairwiseAlignment implements SmithWaterman {
* Add softclips for the overhangs
*/
SOFTCLIP,
/*
* Treat the overhangs as proper insertions/deletions
*/
INDEL,
/*
* Treat the overhangs as proper insertions/deletions for leading (but not trailing) overhangs.
* This is useful e.g. when we want to merge dangling tails in an assembly graph: because we don't
* expect the dangling tail to reach the end of the reference path we are okay ignoring trailing
* deletions - but leading indels are still very much relevant.
*/
LEADING_INDEL,
/*
* Just ignore the overhangs
*/
@ -125,10 +135,11 @@ public class SWPairwiseAlignment implements SmithWaterman {
*
* @param seq1 the first sequence we want to align
* @param seq2 the second sequence we want to align
* @param parameters the SW parameters to use
* @param strategy the overhang strategy to use
*/
public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final OVERHANG_STRATEGY strategy) {
this(SWParameterSet.ORIGINAL_DEFAULT.parameters);
public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final SWParameterSet parameters, final OVERHANG_STRATEGY strategy) {
this(parameters.parameters);
overhang_strategy = strategy;
align(seq1, seq2);
}
@ -226,7 +237,7 @@ public class SWPairwiseAlignment implements SmithWaterman {
final int[] gap_size_h = new int[n+1];
// we need to initialize the SW matrix with gap penalties if we want to keep track of indels at the edges of alignments
if ( overhang_strategy == OVERHANG_STRATEGY.INDEL ) {
if ( overhang_strategy == OVERHANG_STRATEGY.INDEL || overhang_strategy == OVERHANG_STRATEGY.LEADING_INDEL ) {
// initialize the first row
sw[1] = parameters.w_open;
double currentValue = parameters.w_open;
@ -371,7 +382,7 @@ public class SWPairwiseAlignment implements SmithWaterman {
p1 = refLength;
p2 = altLength;
} else {
// look for largest score. we use >= combined with the traversal direction
// look for the largest score on the rightmost column. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1, data_offset = altLength+1+altLength ; i < refLength+1 ; i++, data_offset += (altLength+1) ) {
// data_offset is the offset of [i][m]
@ -380,18 +391,21 @@ public class SWPairwiseAlignment implements SmithWaterman {
}
}
for ( int j = 1, data_offset = refLength*(altLength+1)+1 ; j < altLength+1 ; j++, data_offset++ ) {
// data_offset is the offset of [n][j]
if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(refLength-j) < Math.abs(p1 - p2)) {
p1 = refLength;
p2 = j ;
maxscore = sw[data_offset];
segment_length = altLength - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
// now look for a larger score on the bottom-most row
if ( overhang_strategy != OVERHANG_STRATEGY.LEADING_INDEL ) {
for ( int j = 1, data_offset = refLength*(altLength+1)+1 ; j < altLength+1 ; j++, data_offset++ ) {
// data_offset is the offset of [n][j]
if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(refLength-j) < Math.abs(p1 - p2)) {
p1 = refLength;
p2 = j ;
maxscore = sw[data_offset];
segment_length = altLength - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
}
}
List<CigarElement> lce = new ArrayList<CigarElement>(5);
final List<CigarElement> lce = new ArrayList<CigarElement>(5);
if ( segment_length > 0 && overhang_strategy == OVERHANG_STRATEGY.SOFTCLIP ) {
lce.add(makeElement(State.CLIP, segment_length));
@ -452,7 +466,7 @@ public class SWPairwiseAlignment implements SmithWaterman {
} else if ( overhang_strategy == OVERHANG_STRATEGY.IGNORE ) {
lce.add(makeElement(state, segment_length + p2));
alignment_offset = p1 - p2;
} else { // overhang_strategy == OVERHANG_STRATEGY.INDEL
} else { // overhang_strategy == OVERHANG_STRATEGY.INDEL || overhang_strategy == OVERHANG_STRATEGY.LEADING_INDEL
// take care of the actual alignment
lce.add(makeElement(state, segment_length));

View File

@ -224,7 +224,7 @@ public class FragmentUtilsUnitTest extends BaseTest {
}
@Test(enabled = !DEBUG, dataProvider = "MergeFragmentsTest")
public void testMergingTwoReads(final String name, final GATKSAMRecord read1, GATKSAMRecord read2, final GATKSAMRecord expectedMerged) {
public void testMergingTwoReads(final String name, final GATKSAMRecord read1, final GATKSAMRecord read2, final GATKSAMRecord expectedMerged) {
final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2);
if ( expectedMerged == null ) {
@ -349,4 +349,42 @@ public class FragmentUtilsUnitTest extends BaseTest {
read.setReadGroup(new GATKSAMReadGroupRecord("foo"));
return read;
}
private static final byte highQuality = 30;
private static final byte overlappingQuality = 20;
@DataProvider(name = "AdjustFragmentsTest")
public Object[][] createAdjustFragmentsTest() throws Exception {
List<Object[]> tests = new ArrayList<Object[]>();
final String leftFlank = "CCC";
final String rightFlank = "AAA";
final String allOverlappingBases = "ACGTACGTGGAACCTTAG";
for ( int overlapSize = 1; overlapSize < allOverlappingBases.length(); overlapSize++ ) {
final String overlappingBases = allOverlappingBases.substring(0, overlapSize);
final byte[] overlappingBaseQuals = new byte[overlapSize];
for ( int i = 0; i < overlapSize; i++ ) overlappingBaseQuals[i] = highQuality;
final GATKSAMRecord read1 = makeOverlappingRead(leftFlank, highQuality, overlappingBases, overlappingBaseQuals, "", highQuality, 1);
final GATKSAMRecord read2 = makeOverlappingRead("", highQuality, overlappingBases, overlappingBaseQuals, rightFlank, highQuality, leftFlank.length() + 1);
tests.add(new Object[]{read1, read2, overlapSize});
}
return tests.toArray(new Object[][]{});
}
@Test(enabled = !DEBUG, dataProvider = "AdjustFragmentsTest")
public void testAdjustingTwoReads(final GATKSAMRecord read1, final GATKSAMRecord read2, final int overlapSize) {
FragmentUtils.adjustQualsOfOverlappingPairedFragments(read1, read2);
for ( int i = 0; i < read1.getReadLength() - overlapSize; i++ )
Assert.assertEquals(read1.getBaseQualities()[i], highQuality);
for ( int i = read1.getReadLength() - overlapSize; i < read1.getReadLength(); i++ )
Assert.assertEquals(read1.getBaseQualities()[i], overlappingQuality);
for ( int i = 0; i < overlapSize; i++ )
Assert.assertEquals(read2.getBaseQualities()[i], overlappingQuality);
for ( int i = overlapSize; i < read2.getReadLength(); i++ )
Assert.assertEquals(read2.getBaseQualities()[i], highQuality);
}
}