Merge branch 'master' of github.com:broadinstitute/gsa-unstable
This commit is contained in:
commit
a8ea57df9e
|
|
@ -0,0 +1,3 @@
|
|||
The Genome Analysis Toolkit
|
||||
============
|
||||
See http://www.broadinstitute.org/gatk/
|
||||
23
build.xml
23
build.xml
|
|
@ -887,6 +887,27 @@
|
|||
<fail message="No executable defined. Call a more specific packaging/release target, or define an executable manually" if="no.executable.defined" />
|
||||
</target>
|
||||
|
||||
<target name="require.bcel"> <!-- Guard target: aborts the build unless both a bcel-*.jar and an ant-apache-bcel-*.jar are present in ${user.home}/.ant/lib -->
|
||||
<fileset id="bcel.jar" dir="${user.home}/.ant/lib">
|
||||
<include name="bcel-*.jar" />
|
||||
</fileset>
|
||||
<pathconvert refid="bcel.jar" property="bcel.jar.installed" setonempty="false" /> <!-- setonempty="false": bcel.jar.installed stays unset when the fileset matched nothing -->
|
||||
|
||||
<fileset id="ant.bcel.jar" dir="${user.home}/.ant/lib">
|
||||
<include name="ant-apache-bcel-*.jar" />
|
||||
</fileset>
|
||||
<pathconvert refid="ant.bcel.jar" property="ant.bcel.jar.installed" setonempty="false" /> <!-- same trick for the ant-apache-bcel jar: property is only set when a matching jar exists -->
|
||||
|
||||
<condition property="bcel.installed"> <!-- bcel.installed is set only when BOTH jar properties above are set -->
|
||||
<and>
|
||||
<isset property="bcel.jar.installed" />
|
||||
<isset property="ant.bcel.jar.installed" />
|
||||
</and>
|
||||
</condition> <!-- the fail below stops the build with install instructions when bcel.installed is unset -->
|
||||
<fail unless="bcel.installed"
|
||||
message="Required bcel libraries for GATK packaging not installed in ${user.home}/.ant/lib/${line.separator}The bcel jar can be found in the lib directory of a GATK clone after compiling, and the ant-apache-bcel jar can be downloaded from here: http://repo1.maven.org/maven2/ant/ant-apache-bcel/1.6.5/ant-apache-bcel-1.6.5.jar${line.separator}Please copy these two jar files to ${user.home}/.ant/lib/" />
|
||||
</target>
|
||||
|
||||
<!-- Unzip all classes from their current locations and assemble them in a staging directory -->
|
||||
<target name="stage" description="stage files for distribution">
|
||||
<mkdir dir="${staging.dir}"/>
|
||||
|
|
@ -910,7 +931,7 @@
|
|||
|
||||
|
||||
<!-- Build a package consisting of all supporting files. Don't call this target directly. Call one of the specific packaging targets below -->
|
||||
<target name="package" depends="require.clean,dist,stage,require.executable" description="bundle up an executable for distribution">
|
||||
<target name="package" depends="require.clean,require.bcel,dist,stage,require.executable" description="bundle up an executable for distribution">
|
||||
<mkdir dir="${package.output.dir}" />
|
||||
<xslt destdir="${package.output.dir}" style="${package.xml.dir}/CreatePackager.xsl" useImplicitFileset="false">
|
||||
<flattenmapper/>
|
||||
|
|
|
|||
|
|
@ -1,22 +0,0 @@
|
|||
Copyright (c) 2012 The Broad Institute
|
||||
|
||||
Permission is hereby granted, free of charge, to any person
|
||||
obtaining a copy of this software and associated documentation
|
||||
files (the "Software"), to deal in the Software without
|
||||
restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
Binary file not shown.
|
|
@ -85,9 +85,6 @@ public class StandardCallerArgumentCollection {
|
|||
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
|
||||
public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
|
||||
|
||||
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
|
||||
public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
|
||||
|
||||
/**
|
||||
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
|
||||
* confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
|
||||
|
|
@ -199,7 +196,6 @@ public class StandardCallerArgumentCollection {
|
|||
this.heterozygosity = SCAC.heterozygosity;
|
||||
this.INDEL_HETEROZYGOSITY = SCAC.INDEL_HETEROZYGOSITY;
|
||||
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
|
||||
this.OutputMode = SCAC.OutputMode;
|
||||
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
|
||||
|
|
|
|||
|
|
@ -47,13 +47,11 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -67,41 +65,19 @@ import java.util.*;
|
|||
* <p>The base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
|
||||
*/
|
||||
public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
@Override
|
||||
public List<String> getKeyNames() { return Arrays.asList("BaseQRankSum"); }
|
||||
|
||||
@Override
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); }
|
||||
|
||||
protected void fillQualsFromPileup(final List<Allele> allAlleles, final int refLoc,
|
||||
final ReadBackedPileup pileup,
|
||||
final PerReadAlleleLikelihoodMap alleleLikelihoodMap,
|
||||
final List<Double> refQuals, final List<Double> altQuals){
|
||||
|
||||
if (alleleLikelihoodMap == null) {
|
||||
// use fast SNP-based version if we don't have per-read allele likelihoods
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( isUsableBase(p) ) {
|
||||
if ( allAlleles.get(0).equals(Allele.create(p.getBase(),true)) ) {
|
||||
refQuals.add((double)p.getQual());
|
||||
} else if ( allAlleles.contains(Allele.create(p.getBase()))) {
|
||||
altQuals.add((double)p.getQual());
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (Map<Allele,Double> el : alleleLikelihoodMap.getLikelihoodMapValues()) {
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el);
|
||||
if (! a.isInformative())
|
||||
continue; // read is non-informative
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele()));
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele()));
|
||||
|
||||
|
||||
}
|
||||
@Override
|
||||
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {
|
||||
return (double)read.getBaseQualities()[ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, refLoc, ReadUtils.ClippingTail.RIGHT_TAIL)];
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected Double getElementForPileupElement(final PileupElement p) {
|
||||
return (double)p.getQual();
|
||||
}
|
||||
}
|
||||
|
|
@ -46,14 +46,11 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -69,31 +66,14 @@ import java.util.*;
|
|||
* @since 6/28/12
|
||||
*/
|
||||
public class ClippingRankSumTest extends RankSumTest {
|
||||
|
||||
@Override
|
||||
public List<String> getKeyNames() { return Arrays.asList("ClippingRankSum"); }
|
||||
|
||||
@Override
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ClippingRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases")); }
|
||||
|
||||
|
||||
protected void fillQualsFromPileup(final List<Allele> allAlleles,
|
||||
final int refLoc,
|
||||
final ReadBackedPileup pileup,
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap, final List<Double> refQuals, final List<Double> altQuals) {
|
||||
// todo - only support non-pileup case for now, e.g. active-region based version
|
||||
if (pileup != null || likelihoodMap == null)
|
||||
return;
|
||||
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (! a.isInformative())
|
||||
continue; // read is non-informative
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey()));
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey()));
|
||||
|
||||
}
|
||||
@Override
|
||||
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {
|
||||
return (double)AlignmentUtils.getNumHardClippedBases(read);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -66,10 +66,7 @@ import org.broadinstitute.variant.variantcontext.Genotype;
|
|||
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -135,20 +132,24 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
}
|
||||
|
||||
private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc, final GenotypeBuilder gb) {
|
||||
final HashMap<Allele, Integer> alleleCounts = new HashMap<Allele, Integer>();
|
||||
final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
|
||||
|
||||
// make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
|
||||
if ( ! perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) )
|
||||
throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet());
|
||||
|
||||
final HashMap<Allele, Integer> alleleCounts = new HashMap<>();
|
||||
for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); }
|
||||
|
||||
for ( final Allele allele : vc.getAlleles() ) {
|
||||
alleleCounts.put(allele, 0);
|
||||
}
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
|
||||
if (! a.isInformative() ) continue; // read is non-informative
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (! a.isInformative() )
|
||||
continue; // read is non-informative
|
||||
if (!vc.getAlleles().contains(a.getMostLikelyAllele()))
|
||||
continue; // sanity check - shouldn't be needed
|
||||
alleleCounts.put(a.getMostLikelyAllele(), alleleCounts.get(a.getMostLikelyAllele()) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1));
|
||||
final int prevCount = alleleCounts.get(a.getMostLikelyAllele());
|
||||
final int incCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1;
|
||||
alleleCounts.put(a.getMostLikelyAllele(), prevCount + incCount);
|
||||
}
|
||||
|
||||
final int[] counts = new int[alleleCounts.size()];
|
||||
counts[0] = alleleCounts.get(vc.getReference());
|
||||
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,126 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
|
||||
import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* The depth of coverage (DP) per sample, computed from per-read allele likelihoods for the HaplotypeCaller (HC)
|
||||
*
|
||||
* the depth for the HC is the number of reads that are informative for one of the alleles at this site (a reduced
|
||||
* read contributes its reduced count instead of 1). It's not perfect (as we cannot
|
||||
* differentiate between reads that align over the event but aren't informative vs. those that aren't even
|
||||
* close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP).
|
||||
*/
|
||||
public class DepthPerSampleHC extends GenotypeAnnotation {
|
||||
/**
 * Computes the depth for one sample's genotype and stores it on the builder via {@code gb.DP(...)}.
 *
 * In this implementation {@code tracker}, {@code walker} and {@code ref} are not read, and
 * {@code stratifiedContext} is only null-checked.
 *
 * @param g                   the genotype being annotated; nothing is done if it is null or not called
 * @param vc                  supplies the alleles to consider and the start position used for reduced-read counts
 * @param gb                  the builder that receives the DP value
 * @param alleleLikelihoodMap per-read allele likelihoods; required whenever the genotype is called
 * @throws IllegalStateException if {@code alleleLikelihoodMap} is null while {@code stratifiedContext} is not,
 *                               or if the map's allele set does not contain every allele of {@code vc}
 */
public void annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
final ReferenceContext ref,
|
||||
final AlignmentContext stratifiedContext,
|
||||
final VariantContext vc,
|
||||
final Genotype g,
|
||||
final GenotypeBuilder gb,
|
||||
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
|
||||
if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) // nothing to annotate: no called genotype, or no data source at all
|
||||
return;
|
||||
|
||||
if (alleleLikelihoodMap == null ) // reached only when a pileup context was supplied without a likelihood map
|
||||
throw new IllegalStateException("DepthPerSampleHC can only be used with likelihood based annotations in the HaplotypeCaller");
|
||||
|
||||
// the depth for the HC is the sum of the informative alleles at this site. It's not perfect (as we cannot
|
||||
// differentiate between reads that align over the event but aren't informative vs. those that aren't even
|
||||
// close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP).
|
||||
int dp = 0;
|
||||
|
||||
if ( alleleLikelihoodMap.isEmpty() ) {
|
||||
// there are no reads; note that gb.DP is NOT called on this path, so DP is left unset rather than written as 0
|
||||
} else {
|
||||
final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
|
||||
|
||||
// make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
|
||||
if ( ! alleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) // NOTE(review): containsAll is a plain subset test, although the message below says "strict subset"
|
||||
throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + alleleLikelihoodMap.getAllelesSet());
|
||||
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); // best allele for this read, restricted to the VC's alleles
|
||||
if ( a.isInformative() ) {
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
// a reduced read counts as its reduced count at the read offset matching vc.getStart(); any other read counts as 1
final int incCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1;
|
||||
dp += incCount;
|
||||
}
|
||||
}
|
||||
|
||||
gb.DP(dp); // only written when the likelihood map was non-empty
|
||||
}
|
||||
}
|
||||
|
||||
/** @return a single-element list holding {@code VCFConstants.DEPTH_KEY} */
public List<String> getKeyNames() {
|
||||
return Collections.singletonList(VCFConstants.DEPTH_KEY);
|
||||
}
|
||||
|
||||
/** @return a single-element list holding the standard FORMAT header line looked up for the key returned by {@link #getKeyNames()} */
public List<VCFFormatHeaderLine> getDescriptions() {
|
||||
return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0)));
|
||||
}
|
||||
}
|
||||
|
|
@ -300,7 +300,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
for (PileupElement p : sample.getValue().getBasePileup()) {
|
||||
|
||||
if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions
|
||||
if ( ! isUsableBase(p) ) // ignore deletions and bad MQ
|
||||
continue;
|
||||
|
||||
if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider )
|
||||
|
|
@ -313,6 +313,20 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
return table;
|
||||
}
|
||||
|
||||
/**
|
||||
* Can the base in this pileup element be used in comparative tests? A base is rejected when it is a deletion, when its mapping quality is 0 or QualityUtils.MAPPING_QUALITY_UNAVAILABLE, or when its base quality is below QualityUtils.MIN_USABLE_Q_SCORE.
|
||||
*
|
||||
* @param p the pileup element to consider
|
||||
*
|
||||
* @return true if none of the rejection conditions above holds for this element, false otherwise
|
||||
*/
|
||||
private static boolean isUsableBase(final PileupElement p) {
|
||||
return !( p.isDeletion() || // deletions carry no base to compare
|
||||
p.getMappingQual() == 0 || // mapping quality of zero
|
||||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || // mapping quality equal to the "unavailable" sentinel constant
|
||||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // base quality below the minimum usable score
|
||||
}
|
||||
|
||||
private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) {
|
||||
|
||||
final boolean matchesRef = allele.equals(ref, true);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* U-based z-approximation from the Mann-Whitney Rank Sum Test contrasting the likelihoods of reads to their
|
||||
* most likely haplotypes. This is effectively testing for a differentiate quality in the modeling of the alt
|
||||
* allele than the reference allele.
|
||||
*/
|
||||
public class LikelihoodRankSumTest extends RankSumTest {
|
||||
@Override
|
||||
public List<String> getKeyNames() { return Arrays.asList("LikelihoodRankSum"); }
|
||||
|
||||
@Override
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("LikelihoodRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref haplotype likelihoods")); }
|
||||
|
||||
@Override
|
||||
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc, final MostLikelyAllele mostLikelyAllele) {
|
||||
if ( ! mostLikelyAllele.isInformative() ) throw new IllegalStateException("Should never have seen non-informative read " + read + " MostLikelyAllele " + mostLikelyAllele);
|
||||
return mostLikelyAllele.getLog10LikelihoodOfMostLikely();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Double getElementForRead(GATKSAMRecord read, int refLoc) {
|
||||
throw new IllegalStateException("This method should never have been called as getElementForRead(read,refloc,mostLikelyAllele) was overloaded");
|
||||
}
|
||||
}
|
||||
|
|
@ -47,14 +47,10 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -68,40 +64,19 @@ import java.util.*;
|
|||
* <p>The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
|
||||
*/
|
||||
public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
|
||||
@Override
|
||||
public List<String> getKeyNames() { return Arrays.asList("MQRankSum"); }
|
||||
|
||||
@Override
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities")); }
|
||||
|
||||
protected void fillQualsFromPileup(final List<Allele> allAlleles,
|
||||
final int refLoc,
|
||||
final ReadBackedPileup pileup,
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap,
|
||||
final List<Double> refQuals, final List<Double> altQuals) {
|
||||
|
||||
if (pileup != null && likelihoodMap == null) {
|
||||
// old UG snp-only path through the annotations
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( isUsableBase(p) ) {
|
||||
if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) {
|
||||
refQuals.add((double)p.getMappingQual());
|
||||
} else if ( allAlleles.contains(Allele.create(p.getBase()))) {
|
||||
altQuals.add((double)p.getMappingQual());
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
// BUGBUG: There needs to be a comparable isUsableBase check here
|
||||
if (! a.isInformative())
|
||||
continue; // read is non-informative
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add((double)el.getKey().getMappingQuality());
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add((double)el.getKey().getMappingQuality());
|
||||
}
|
||||
@Override
|
||||
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {
|
||||
return (double)read.getMappingQuality();
|
||||
}
|
||||
|
||||
}
|
||||
@Override
|
||||
protected Double getElementForPileupElement(final PileupElement p) {
|
||||
return (double)p.getRead().getMappingQuality();
|
||||
}
|
||||
}
|
||||
|
|
@ -56,6 +56,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota
|
|||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
|
||||
|
|
@ -77,55 +78,41 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn
|
|||
final Map<String, AlignmentContext> stratifiedContexts,
|
||||
final VariantContext vc,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap ) {
|
||||
int totalSize = 0, index = 0;
|
||||
int qualities[];
|
||||
if (stratifiedContexts != null) {
|
||||
|
||||
final List<Integer> qualities = new ArrayList<>();
|
||||
if ( stratifiedContexts != null ) {
|
||||
if ( stratifiedContexts.size() == 0 )
|
||||
return null;
|
||||
|
||||
for ( AlignmentContext context : stratifiedContexts.values() )
|
||||
totalSize += context.size();
|
||||
|
||||
qualities = new int[totalSize];
|
||||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
AlignmentContext context = sample.getValue();
|
||||
for (PileupElement p : context.getBasePileup() )
|
||||
index = fillMappingQualitiesFromPileupAndUpdateIndex(p.getRead(), index, qualities);
|
||||
for ( final Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
final AlignmentContext context = sample.getValue();
|
||||
for ( final PileupElement p : context.getBasePileup() )
|
||||
fillMappingQualitiesFromPileup(p.getRead().getMappingQuality(), p.getRepresentativeCount(), qualities);
|
||||
}
|
||||
}
|
||||
else if (perReadAlleleLikelihoodMap != null) {
|
||||
if ( perReadAlleleLikelihoodMap.size() == 0 )
|
||||
return null;
|
||||
|
||||
for ( PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() )
|
||||
totalSize += perReadLikelihoods.size();
|
||||
|
||||
qualities = new int[totalSize];
|
||||
for ( PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) {
|
||||
for (GATKSAMRecord read : perReadLikelihoods.getStoredElements())
|
||||
index = fillMappingQualitiesFromPileupAndUpdateIndex(read, index, qualities);
|
||||
|
||||
|
||||
}
|
||||
for ( final PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) {
|
||||
for ( final GATKSAMRecord read : perReadLikelihoods.getStoredElements() )
|
||||
fillMappingQualitiesFromPileup(read.getMappingQuality(), (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1), qualities);
|
||||
}
|
||||
}
|
||||
else
|
||||
return null;
|
||||
|
||||
|
||||
|
||||
double rms = MathUtils.rms(qualities);
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%.2f", rms));
|
||||
return map;
|
||||
final double rms = MathUtils.rms(qualities);
|
||||
return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.2f", rms));
|
||||
}
|
||||
|
||||
private static int fillMappingQualitiesFromPileupAndUpdateIndex(final GATKSAMRecord read, final int inputIdx, final int[] qualities) {
|
||||
int outputIdx = inputIdx;
|
||||
if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE )
|
||||
qualities[outputIdx++] = read.getMappingQuality();
|
||||
|
||||
return outputIdx;
|
||||
private static void fillMappingQualitiesFromPileup(final int mq, final int representativeCount, final List<Integer> qualities) {
|
||||
if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) {
|
||||
if ( representativeCount == 1 )
|
||||
qualities.add(mq);
|
||||
else
|
||||
qualities.addAll(Collections.nCopies(representativeCount, mq));
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); }
|
||||
|
|
|
|||
|
|
@ -53,9 +53,11 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.MannWhitneyU;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
|
@ -87,31 +89,33 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
if (genotypes == null || genotypes.size() == 0)
|
||||
return null;
|
||||
|
||||
final ArrayList<Double> refQuals = new ArrayList<Double>();
|
||||
final ArrayList<Double> altQuals = new ArrayList<Double>();
|
||||
final ArrayList<Double> refQuals = new ArrayList<>();
|
||||
final ArrayList<Double> altQuals = new ArrayList<>();
|
||||
|
||||
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
|
||||
PerReadAlleleLikelihoodMap indelLikelihoodMap = null;
|
||||
ReadBackedPileup pileup = null;
|
||||
|
||||
boolean usePileup = true;
|
||||
|
||||
if (stratifiedContexts != null) { // the old UG SNP-only path through the annotations
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context != null )
|
||||
pileup = context.getBasePileup();
|
||||
if ( stratifiedPerReadAlleleLikelihoodMap != null ) {
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName());
|
||||
if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) {
|
||||
fillQualsFromLikelihoodMap(vc.getAlleles(), vc.getStart(), likelihoodMap, refQuals, altQuals);
|
||||
usePileup = false;
|
||||
}
|
||||
}
|
||||
if (stratifiedPerReadAlleleLikelihoodMap != null )
|
||||
indelLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName());
|
||||
|
||||
if (indelLikelihoodMap != null && indelLikelihoodMap.isEmpty())
|
||||
indelLikelihoodMap = null;
|
||||
// treat an empty likelihood map as a null reference - will simplify contract with fillQualsFromPileup
|
||||
if (indelLikelihoodMap == null && pileup == null)
|
||||
continue;
|
||||
|
||||
fillQualsFromPileup(vc.getAlleles(), vc.getStart(), pileup, indelLikelihoodMap, refQuals, altQuals );
|
||||
// the old UG SNP-only path through the annotations
|
||||
if ( usePileup && stratifiedContexts != null ) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context != null ) {
|
||||
final ReadBackedPileup pileup = context.getBasePileup();
|
||||
if ( pileup != null )
|
||||
fillQualsFromPileup(vc.getAlleles(), pileup, refQuals, altQuals);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (refQuals.isEmpty() && altQuals.isEmpty())
|
||||
|
||||
if ( refQuals.isEmpty() && altQuals.isEmpty() )
|
||||
return null;
|
||||
|
||||
final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering);
|
||||
|
|
@ -136,18 +140,89 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
// we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
|
||||
final Pair<Double, Double> testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1);
|
||||
|
||||
final Map<String, Object> map = new HashMap<String, Object>();
|
||||
final Map<String, Object> map = new HashMap<>();
|
||||
if (!Double.isNaN(testResults.first))
|
||||
map.put(getKeyNames().get(0), String.format("%.3f", testResults.first));
|
||||
return map;
|
||||
}
|
||||
|
||||
protected abstract void fillQualsFromPileup(final List<Allele> alleles,
|
||||
final int refLoc,
|
||||
final ReadBackedPileup readBackedPileup,
|
||||
final PerReadAlleleLikelihoodMap alleleLikelihoodMap,
|
||||
final List<Double> refQuals,
|
||||
final List<Double> altQuals);
|
||||
private void fillQualsFromPileup(final List<Allele> alleles,
|
||||
final ReadBackedPileup pileup,
|
||||
final List<Double> refQuals,
|
||||
final List<Double> altQuals) {
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( isUsableBase(p) ) {
|
||||
final Double value = getElementForPileupElement(p);
|
||||
if ( value == null )
|
||||
continue;
|
||||
|
||||
if ( alleles.get(0).equals(Allele.create(p.getBase(), true)) )
|
||||
refQuals.add(value);
|
||||
else if ( alleles.contains(Allele.create(p.getBase())) )
|
||||
altQuals.add(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void fillQualsFromLikelihoodMap(final List<Allele> alleles,
|
||||
final int refLoc,
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap,
|
||||
final List<Double> refQuals,
|
||||
final List<Double> altQuals) {
|
||||
for ( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) {
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if ( ! a.isInformative() )
|
||||
continue; // read is non-informative
|
||||
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
if ( isUsableRead(read, refLoc) ) {
|
||||
final Double value = getElementForRead(read, refLoc, a);
|
||||
if ( value == null )
|
||||
continue;
|
||||
|
||||
if ( a.getMostLikelyAllele().isReference() )
|
||||
refQuals.add(value);
|
||||
else if ( alleles.contains(a.getMostLikelyAllele()) )
|
||||
altQuals.add(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the element for the given read at the given reference position
|
||||
*
|
||||
* @param read the read
|
||||
* @param refLoc the reference position
|
||||
* @param mostLikelyAllele the most likely allele for this read
|
||||
* @return a Double representing the element to be used in the rank sum test, or null if it should not be used
|
||||
*/
|
||||
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc, final MostLikelyAllele mostLikelyAllele) {
|
||||
return getElementForRead(read, refLoc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the element for the given read at the given reference position
|
||||
*
|
||||
* @param read the read
|
||||
* @param refLoc the reference position
|
||||
* @return a Double representing the element to be used in the rank sum test, or null if it should not be used
|
||||
*/
|
||||
protected abstract Double getElementForRead(final GATKSAMRecord read, final int refLoc);
|
||||
|
||||
// TODO -- until the ReadPosRankSumTest stops treating these differently, we need to have separate methods for GATKSAMRecords and PileupElements. Yuck.
|
||||
|
||||
/**
|
||||
* Get the element for the given pileup element
|
||||
*
|
||||
* By default this function returns null, indicating that the test doesn't support the old style of pileup calculations
|
||||
*
|
||||
* @param p the pileup element
|
||||
* @return a Double representing the element to be used in the rank sum test, or null if it should not be used
|
||||
*/
|
||||
protected Double getElementForPileupElement(final PileupElement p) {
|
||||
// does not work in pileup mode
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Can the base in this pileup element be used in comparative tests between ref / alt bases?
|
||||
|
|
@ -157,30 +232,33 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
* @param p the pileup element to consider
|
||||
* @return true if this base is part of a meaningful read for comparison, false otherwise
|
||||
*/
|
||||
public static boolean isUsableBase(final PileupElement p) {
|
||||
return isUsableBase(p, false);
|
||||
protected boolean isUsableBase(final PileupElement p) {
|
||||
return !(p.isDeletion() ||
|
||||
p.getMappingQual() == 0 ||
|
||||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
|
||||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE || // need the unBAQed quality score here
|
||||
p.getRead().isReducedRead() );
|
||||
}
|
||||
|
||||
/**
|
||||
* Can the base in this pileup element be used in comparative tests between ref / alt bases?
|
||||
* Can the read be used in comparative tests between ref / alt bases?
|
||||
*
|
||||
* @param p the pileup element to consider
|
||||
* @param allowDeletions if true, allow p to be a deletion base
|
||||
* @return true if this base is part of a meaningful read for comparison, false otherwise
|
||||
* @param read the read to consider
|
||||
* @param refLoc the reference location
|
||||
* @return true if this read is meaningful for comparison, false otherwise
|
||||
*/
|
||||
public static boolean isUsableBase(final PileupElement p, final boolean allowDeletions) {
|
||||
return !((! allowDeletions && p.isDeletion()) ||
|
||||
p.getMappingQual() == 0 ||
|
||||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
|
||||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
|
||||
protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) {
|
||||
return !( read.getMappingQuality() == 0 ||
|
||||
read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
|
||||
read.isReducedRead() );
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if
|
||||
* engine randomization is turned off, and if so does not dither.
|
||||
* @param walker
|
||||
* @param toolkit
|
||||
* @param headerLines
|
||||
* @param walker the walker
|
||||
* @param toolkit the GATK engine
|
||||
* @param headerLines the header lines
|
||||
*/
|
||||
public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
|
||||
useDithering = ! toolkit.getArguments().disableDithering;
|
||||
|
|
|
|||
|
|
@ -51,17 +51,13 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -75,63 +71,44 @@ import java.util.*;
|
|||
*/
|
||||
public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
|
||||
public List<String> getKeyNames() {
|
||||
return Arrays.asList("ReadPosRankSum");
|
||||
}
|
||||
@Override
|
||||
public List<String> getKeyNames() { return Arrays.asList("ReadPosRankSum"); }
|
||||
|
||||
@Override
|
||||
public List<VCFInfoHeaderLine> getDescriptions() {
|
||||
return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"));
|
||||
}
|
||||
|
||||
protected void fillQualsFromPileup(final List<Allele> allAlleles,
|
||||
final int refLoc,
|
||||
final ReadBackedPileup pileup,
|
||||
final PerReadAlleleLikelihoodMap alleleLikelihoodMap,
|
||||
final List<Double> refQuals, final List<Double> altQuals) {
|
||||
@Override
|
||||
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {
|
||||
final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getSoftStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true );
|
||||
if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED )
|
||||
return null;
|
||||
|
||||
if (alleleLikelihoodMap == null) {
|
||||
// use old UG SNP-based version if we don't have per-read allele likelihoods
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( isUsableBase(p) && p.getRead().getCigar() != null ) {
|
||||
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0);
|
||||
|
||||
readPos = getFinalReadPosition(p.getRead(),readPos);
|
||||
|
||||
if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) {
|
||||
refQuals.add((double)readPos);
|
||||
} else if ( allAlleles.contains(Allele.create(p.getBase()))) {
|
||||
altQuals.add((double)readPos);
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (! a.isInformative() )
|
||||
continue; // read is non-informative
|
||||
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
if ( read.getSoftStart() + read.getCigar().getReadLength() <= refLoc ) { // make sure the read actually covers the requested ref loc
|
||||
continue;
|
||||
}
|
||||
final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getSoftStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true );
|
||||
if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED || read.getCigar() == null )
|
||||
continue;
|
||||
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, 0, 0 );
|
||||
final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
|
||||
if (readPos > numAlignedBases / 2)
|
||||
readPos = numAlignedBases - (readPos + 1);
|
||||
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add((double)readPos);
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add((double)readPos);
|
||||
}
|
||||
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, 0, 0 );
|
||||
final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
|
||||
if (readPos > numAlignedBases / 2)
|
||||
readPos = numAlignedBases - (readPos + 1);
|
||||
return (double)readPos;
|
||||
}
|
||||
|
||||
int getFinalReadPosition(GATKSAMRecord read, int initialReadPosition) {
|
||||
@Override
|
||||
protected Double getElementForPileupElement(final PileupElement p) {
|
||||
final int offset = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0);
|
||||
return (double)getFinalReadPosition(p.getRead(), offset);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean isUsableBase(final PileupElement p) {
|
||||
return super.isUsableBase(p) && p.getRead().getCigar() != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) {
|
||||
return super.isUsableRead(read, refLoc) && read.getSoftStart() + read.getCigar().getReadLength() > refLoc;
|
||||
}
|
||||
|
||||
private int getFinalReadPosition(final GATKSAMRecord read, final int initialReadPosition) {
|
||||
final int numAlignedBases = getNumAlignedBases(read);
|
||||
|
||||
int readPos = initialReadPosition;
|
||||
|
|
@ -141,7 +118,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
|
|||
return readPos;
|
||||
|
||||
}
|
||||
int getNumClippedBasesAtStart(SAMRecord read) {
|
||||
|
||||
private int getNumClippedBasesAtStart(final SAMRecord read) {
|
||||
// compute total number of clipped bases (soft or hard clipped)
|
||||
// check for hard clips (never consider these bases):
|
||||
final Cigar c = read.getCigar();
|
||||
|
|
@ -151,8 +129,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
|
|||
if (first.getOperator() == CigarOperator.H) {
|
||||
numStartClippedBases = first.getLength();
|
||||
}
|
||||
byte[] unclippedReadBases = read.getReadBases();
|
||||
byte[] unclippedReadQuals = read.getBaseQualities();
|
||||
final byte[] unclippedReadBases = read.getReadBases();
|
||||
final byte[] unclippedReadQuals = read.getBaseQualities();
|
||||
|
||||
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
|
||||
// and may leave a string of Q2 bases still hanging off the reads.
|
||||
|
|
@ -167,11 +145,11 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
|
|||
return numStartClippedBases;
|
||||
}
|
||||
|
||||
int getNumAlignedBases(SAMRecord read) {
|
||||
private int getNumAlignedBases(final GATKSAMRecord read) {
|
||||
return read.getReadLength() - getNumClippedBasesAtStart(read) - getNumClippedBasesAtEnd(read);
|
||||
}
|
||||
|
||||
int getNumClippedBasesAtEnd(SAMRecord read) {
|
||||
private int getNumClippedBasesAtEnd(final GATKSAMRecord read) {
|
||||
// compute total number of clipped bases (soft or hard clipped)
|
||||
// check for hard clips (never consider these bases):
|
||||
final Cigar c = read.getCigar();
|
||||
|
|
@ -181,8 +159,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
|
|||
if (last.getOperator() == CigarOperator.H) {
|
||||
numEndClippedBases = last.getLength();
|
||||
}
|
||||
byte[] unclippedReadBases = read.getReadBases();
|
||||
byte[] unclippedReadQuals = read.getBaseQualities();
|
||||
final byte[] unclippedReadBases = read.getReadBases();
|
||||
final byte[] unclippedReadQuals = read.getBaseQualities();
|
||||
|
||||
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
|
||||
// and may leave a string of Q2 bases still hanging off the reads.
|
||||
|
|
@ -193,11 +171,6 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
|
|||
break;
|
||||
}
|
||||
|
||||
|
||||
return numEndClippedBases;
|
||||
}
|
||||
|
||||
int getOffsetFromClippedReadStart(SAMRecord read, int offset) {
|
||||
return offset - getNumClippedBasesAtStart(read);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,587 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Tool to analyze and evaluate base recalibration tables.
|
||||
* <p/>
|
||||
* For now it generates a plot report to assess the quality of a recalibration.
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
*
|
||||
* The tool can take up to three different sets of recalibration tables.
|
||||
* The resulting plots will be overlaid on top of each other to make
|
||||
* comparisons easy.
|
||||
*
|
||||
* <table style="text-align: left">
|
||||
* <thead>
|
||||
* <tr><th>Set</th><th>Argument</th><th>Label</th><th>Color</th><th>Description</th></tr>
|
||||
* </thead>
|
||||
* <tbody>
|
||||
* <tr><td>Original</td><td>-before</td><td>BEFORE</td><td style="color: #ff34b3">Maroon1</td>
|
||||
* <td>First pass recalibration
|
||||
* tables obtained from applying {@link BaseRecalibration}
|
||||
* on the original alignment.</td></tr>
|
||||
* <tr><td>Recalibrated</td><td>-after</td><td>AFTER</td><td style="color: #0000ff">Blue</td>
|
||||
* <td>Second pass recalibration tables
|
||||
* results from the application of {@link BaseRecalibration}
|
||||
* on the alignment recalibrated using the first pass tables</td></tr>
|
||||
* <tr><td>Input</td><td>-BQSR</td><td>BQSR</td><td style="color: #000000">Black</td>
|
||||
* <td>Any recalibration table without a specific role</td></tr>
|
||||
* </tbody>
|
||||
* </table>
|
||||
* <br/>
|
||||
*
|
||||
* You need to specify one set at least. Multiple sets need to have the same values for the following parameters:
|
||||
* <br/><br/>
|
||||
* <i>covariate (order is not important), no_standard_covs, run_without_dbsnp, solid_recal_mode,
|
||||
* solid_nocall_strategy, mismatches_context_size, mismatches_default_quality, deletions_default_quality,
|
||||
* insertions_default_quality, maximum_cycle_value, low_quality_tail, default_platform, force_platform,
|
||||
* quantizing_levels</i> and <i>binary_tag_name</i>
|
||||
* <h3>Output</h3>
|
||||
*
|
||||
* Currently this tool generates two outputs:
|
||||
*
|
||||
* <dl>
|
||||
* <dt style="font-weight: normal">-plots <i>my-report.pdf</i></dt>
|
||||
* <dd>A pdf document that encloses plots to assess the quality of the recalibration.</dd>
|
||||
* <dt style="font-weight: normal">-csv <i>my-report.csv</i></dt>
|
||||
* <dd>A csv file that contains a table with all the data required to generate those plots.</dd>
|
||||
* </dl>
|
||||
*
|
||||
* You need to specify at least one of them.
|
||||
*
|
||||
* <h3>Other Arguments</h3>
|
||||
*
|
||||
* <h4>-ignoreLMT, --ignoreLastModificationTimes</h4>
|
||||
*
|
||||
* When set, no warning message will be displayed if the -before recalibration table file is newer than the -after one.
|
||||
*
|
||||
* <h3>Examples</h3>
|
||||
*
|
||||
*
|
||||
* <h4>Plot a single recalibration table</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T AnalyzeCovariates \
|
||||
* -R myreference.fasta \
|
||||
* -BQSR myrecal.table \
|
||||
* -plots BQSR.pdf
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Plot before (first pass) and after (second pass) recalibration table to compare them</h4>
|
||||
*
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T AnalyzeCovariates \
|
||||
* -R myreference.fasta \
|
||||
* -before recal2.table \
|
||||
* -after recal3.table \
|
||||
* -plots recalQC.pdf
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Plot up to three recalibration tables for comparison</h4>
|
||||
*
|
||||
* <pre>
|
||||
*
|
||||
* # You can ignore the before/after semantics completely if you like (if you do, add -ignoreLMT
|
||||
* # to avoid a possible warning), but all tables should have been generated using the same parameters.
|
||||
*
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T AnalyzeCovariates \
|
||||
* -R myreference.fasta \
|
||||
* -ignoreLMT \
|
||||
* -BQSR recal1.table \ # you can discard any two
|
||||
* -before recal2.table \
|
||||
* -after recal3.table \
|
||||
* -plots myrecals.pdf
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Full BQSR quality assessment pipeline</h4>
|
||||
*
|
||||
* <pre>
|
||||
* # Generate the first pass recalibration table file.
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T BaseRecalibrator \
|
||||
* -R myreference.fasta \
|
||||
* -I myinput.bam \
|
||||
* -knownSites bundle/my-trusted-snps.vcf \ # optional but recommendable
|
||||
* -knownSites bundle/my-trusted-indels.vcf \ # optional but recommendable
|
||||
* ... other options \
|
||||
* -o firstpass.table
|
||||
*
|
||||
* # Generate the second pass recalibration table file.
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T BaseRecalibrator \
|
||||
* -BQSR firstpass.table \
|
||||
* -R myreference.fasta \
|
||||
* -I myinput.bam \
|
||||
* -knownSites bundle/my-trusted-snps.vcf \
|
||||
* -knownSites bundle/my-trusted-indels.vcf \
|
||||
* ... other options \
|
||||
* -o secondpass.table
|
||||
*
|
||||
* # Finally generate the plots report and also keep a copy of the csv (optional).
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T AnalyzeCovariates \
|
||||
* -R myreference.fasta \
|
||||
* -before firstpass.table \
|
||||
* -after secondpass.table \
|
||||
* -csv BQSR.csv \ # optional
|
||||
* -plots BQSR.pdf
|
||||
* </pre>
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
* @version 6/16/2013
|
||||
* @since 2.6
|
||||
*/
|
||||
@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class})
|
||||
public final class AnalyzeCovariates extends RodWalker<AnalyzeCovariates.None,AnalyzeCovariates.None> {
|
||||
|
||||
|
||||
// Constants on option short names that are used in some error/warning messages:
|
||||
|
||||
static final String CSV_ARG_SHORT_NAME = "csv";
|
||||
static final String PDF_ARG_SHORT_NAME = "plots";
|
||||
static final String BEFORE_ARG_SHORT_NAME = "before";
|
||||
static final String AFTER_ARG_SHORT_NAME = "after";
|
||||
|
||||
/**
|
||||
* File containing the recalibration tables from the first pass.
|
||||
*/
|
||||
@Input(shortName=BEFORE_ARG_SHORT_NAME,fullName="beforeReportFile", doc = "file containing the BQSR first-pass report file",required = false)
|
||||
protected File beforeFile = null;
|
||||
|
||||
/**
|
||||
* File containing the recalibration tables from the second pass.
|
||||
*/
|
||||
@Input(shortName=AFTER_ARG_SHORT_NAME, fullName="afterReportFile", doc = "file containing the BQSR second-pass report file",required = false)
|
||||
protected File afterFile = null;
|
||||
|
||||
/**
|
||||
* If true, it won't show a warning if the last-modification time of the before and after input files suggest that they have been reversed.
|
||||
*/
|
||||
@Argument(shortName="ignoreLMT", fullName="ignoreLastModificationTimes", doc= "do not emit warning messages related to suspicious last modification time order of inputs", required = false)
|
||||
protected boolean ignoreLastModificationTime = false;
|
||||
|
||||
/**
|
||||
* Output report file name.
|
||||
*/
|
||||
@Output(shortName=PDF_ARG_SHORT_NAME, fullName="plotsReportFile" ,doc = "location of the output report", required = false)
|
||||
protected File pdfFile = null;
|
||||
|
||||
/**
|
||||
* Output csv file name.
|
||||
*/
|
||||
@Output(shortName=CSV_ARG_SHORT_NAME,fullName="intermediateCsvFile" ,doc = "location of the csv intermediate file", required = false)
|
||||
protected File csvFile = null;
|
||||
|
||||
/**
|
||||
* Convenience reference to the RECAL_BQSR_FILE argument value.
|
||||
* <p/>
|
||||
* This field value is resolved by {@link #initialize()}.
|
||||
*/
|
||||
protected File bqsrFile = null;
|
||||
|
||||
/**
|
||||
* Checks inputs and argument values.
|
||||
* <p/>
|
||||
* Notice that this routine will not validate the content of files. It may have some minor side effects as
|
||||
* the output of warning messages back to the user.
|
||||
*
|
||||
* @throw IllegalStateException there is some required argument value that has not been loaded yet.
|
||||
* @throw UserException if there is some error caused by or under the end user's control.
|
||||
*/
|
||||
private void checkArgumentsValues() {
|
||||
checkInputReportFile("BQSR",bqsrFile);
|
||||
checkInputReportFile("before",beforeFile);
|
||||
checkInputReportFile("after",afterFile);
|
||||
if (bqsrFile == null && beforeFile == null && afterFile == null) {
|
||||
throw new UserException("you must provide at least one recalibration report file "
|
||||
+ "(arguments -BQSR, -" + BEFORE_ARG_SHORT_NAME + " or -" + AFTER_ARG_SHORT_NAME);
|
||||
}
|
||||
|
||||
checkOutputFile(PDF_ARG_SHORT_NAME,pdfFile);
|
||||
checkOutputFile(CSV_ARG_SHORT_NAME, csvFile);
|
||||
checkInputReportFileLMT(beforeFile,afterFile);
|
||||
checkOutputRequested();
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether the last-modification-time of the inputs is consistent with their relative roles.
|
||||
*
|
||||
* This routine does not thrown an exception but may output a warning message if inconsistencies are spotted.
|
||||
*
|
||||
* @param beforeFile the before report file.
|
||||
* @param afterFile the after report file.
|
||||
*/
|
||||
private void checkInputReportFileLMT(final File beforeFile, final File afterFile) {
|
||||
|
||||
if (ignoreLastModificationTime || beforeFile == null || afterFile == null) {
|
||||
return; // nothing to do here
|
||||
} else if (beforeFile.lastModified() > afterFile.lastModified()) {
|
||||
Utils.warnUser("Last modification timestamp for 'Before' and 'After'"
|
||||
+ "recalibration reports are in the wrong order. Perhaps, have they been swapped?");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks that at least one output was requested.
|
||||
*
|
||||
* @throw UserException if no output was requested.
|
||||
*/
|
||||
private void checkOutputRequested() {
|
||||
if (pdfFile == null && csvFile == null) {
|
||||
throw new UserException("you need to request at least one output:"
|
||||
+ " the intermediate csv file (-" + CSV_ARG_SHORT_NAME + " FILE)"
|
||||
+ " or the final plot file (-" + PDF_ARG_SHORT_NAME + " FILE).");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks the value provided to input file arguments.
|
||||
*
|
||||
* @throw UserException if there is any problem cause by or under the end user's control
|
||||
*
|
||||
* @param name command line argument short name.
|
||||
* @param value the argument value.
|
||||
*/
|
||||
private void checkInputReportFile(final String name,final File value) {
|
||||
if (value == null) {
|
||||
return;
|
||||
} else if (!value.exists()) {
|
||||
throw new UserException.BadArgumentValue(name, "input report '" +
|
||||
value + "' does not exist or is unreachable");
|
||||
} else if (!value.isFile()) {
|
||||
throw new UserException.BadArgumentValue(name, "input report '" +
|
||||
value + "' is not a regular file");
|
||||
} else if (!value.canRead()) {
|
||||
throw new UserException.BadArgumentValue(name, "input report '" +
|
||||
value + "' cannot be read");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks the value provided for output arguments.
|
||||
*
|
||||
* @throw UserException if there is any problem cause by or under the end user's control
|
||||
*
|
||||
* @param name command line argument short name.
|
||||
* @param value the argument value.
|
||||
*/
|
||||
private void checkOutputFile(final String name, final File value) {
|
||||
if (value == null) {
|
||||
return;
|
||||
}
|
||||
if (value.exists() && !value.isFile()) {
|
||||
throw new UserException.BadArgumentValue(name, "the output file location '"
|
||||
+ value + "' exists as not a file");
|
||||
}
|
||||
final File parent = value.getParentFile();
|
||||
if (parent == null) {
|
||||
return;
|
||||
}
|
||||
if (!parent.exists()) {
|
||||
throw new UserException.BadArgumentValue(name, "the output file parent directory '"
|
||||
+ parent + "' does not exists or is unreachable");
|
||||
} else if (!parent.isDirectory()) {
|
||||
throw new UserException.BadArgumentValue(name, "the output file parent directory '"
|
||||
+ parent + "' is not a directory");
|
||||
} else if (!parent.canWrite()) {
|
||||
throw new UserException.BadArgumentValue(name, "the output file parent directory '"
|
||||
+ parent + "' cannot be written");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates the plots using the external R script.
|
||||
*
|
||||
* <p/>
|
||||
* If <code>plotsFile</code> is <code>null</code>, it does not perform any plotting.
|
||||
*
|
||||
* @param csvFile the intermediary csv file.
|
||||
* @param plotsFile the output plot location.
|
||||
*/
|
||||
private void generatePlots(final File csvFile, final Map<String,File> reportFiles, final File plotsFile) {
|
||||
|
||||
if (plotsFile == null) {
|
||||
return;
|
||||
}
|
||||
logger.info("Generating plots file '" + plotsFile + "'");
|
||||
final File exampleReportFile = reportFiles.values().iterator().next();
|
||||
RecalUtils.generatePlots(csvFile,exampleReportFile,plotsFile);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
super.initialize();
|
||||
bqsrFile = getToolkit().getArguments().BQSR_RECAL_FILE;
|
||||
checkArgumentsValues();
|
||||
final Map<String, File> reportFiles = buildReportFileMap();
|
||||
final Map<String, RecalibrationReport> reports = buildReportMap(reportFiles);
|
||||
checkReportConsistency(reports);
|
||||
final File csvFile = resolveCsvFile();
|
||||
generateCsvFile(csvFile,reports);
|
||||
final File plotFile = resolvePlotFile();
|
||||
generatePlots(csvFile, reportFiles, plotFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the plot output file
|
||||
* @return might be <code>null</code> if the user has not indicated and output file.
|
||||
*/
|
||||
private File resolvePlotFile() {
|
||||
return pdfFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates the intermediary Csv file.
|
||||
*
|
||||
* @param csvFile where to write the file.
|
||||
* @param reports the reports to be included.
|
||||
*/
|
||||
private void generateCsvFile(final File csvFile, final Map<String, RecalibrationReport> reports) {
|
||||
try {
|
||||
logger.info("Generating csv file '" + csvFile + "'");
|
||||
RecalUtils.generateCsv(csvFile, reports);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException(
|
||||
String.format("There is a problem creating the intermediary Csv file '%s': %s",
|
||||
csvFile,e.getMessage()),e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether multiple input recalibration report files argument values are consistent (equal).
|
||||
*
|
||||
* @param reports map with report to verify.
|
||||
*
|
||||
* @throw UserException if there is any inconsistency.
|
||||
*/
|
||||
private void checkReportConsistency(final Map<String, RecalibrationReport> reports) {
|
||||
final Map.Entry<String,RecalibrationReport>[] reportEntries =
|
||||
reports.entrySet().toArray((Map.Entry<String,RecalibrationReport>[]) new Map.Entry[reports.size()]);
|
||||
|
||||
final Map.Entry<String,RecalibrationReport> exampleEntry = reportEntries[0];
|
||||
|
||||
for (int i = 1; i < reportEntries.length; i++) {
|
||||
final Map<String,? extends CharSequence> diffs = exampleEntry.getValue().getRAC().compareReportArguments(
|
||||
reportEntries[i].getValue().getRAC(),exampleEntry.getKey(),reportEntries[i].getKey());
|
||||
if (diffs.size() != 0) {
|
||||
throw new UserException.IncompatibleRecalibrationTableParameters("There are differences in relevant arguments of"
|
||||
+ " two or more input recalibration reports. Please make sure"
|
||||
+ " they have been created using the same recalibration parameters."
|
||||
+ " " + Utils.join("// ", reportDifferencesStringArray(diffs)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates a map with all input recalibration files indexed by their "role".
|
||||
* <p/>
|
||||
* The key is the role and the value the corresponding report file.
|
||||
* <p/>
|
||||
* Roles: "Before" (recalibration), "After" (recalibration), "BQSR" (the tool standard argument recalibration file)
|
||||
*
|
||||
* @return never <code>null</code>
|
||||
*/
|
||||
private Map<String, File> buildReportFileMap() {
|
||||
final Map<String,File> reports = new LinkedHashMap<>(3);
|
||||
if (bqsrFile != null) {
|
||||
reports.put("BQSR",bqsrFile);
|
||||
}
|
||||
if (beforeFile != null) {
|
||||
reports.put("Before",beforeFile);
|
||||
}
|
||||
if (afterFile != null) {
|
||||
reports.put("After",afterFile);
|
||||
}
|
||||
return reports;
|
||||
}
|
||||
|
||||
/**
|
||||
* Transforms a recalibration file map into a report object map.
|
||||
*
|
||||
* @param reportFileMap the file map to transforms.
|
||||
* @return never <code>null</code>, a new map with the same size as
|
||||
* <code>reportFileMap</code> and the same key set.
|
||||
*/
|
||||
@Requires("reportFileMap != null")
|
||||
private Map<String, RecalibrationReport> buildReportMap(final Map<String, File> reportFileMap) {
|
||||
final Map<String,RecalibrationReport> reports = new LinkedHashMap<>(reportFileMap.size());
|
||||
for (final Map.Entry<String,File> e : reportFileMap.entrySet()) {
|
||||
reports.put(e.getKey(),new RecalibrationReport(e.getValue()));
|
||||
}
|
||||
return reports;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a flatter String array representation of recalibration argument differences.
|
||||
* @param diffs the differences to represent.
|
||||
*
|
||||
* @return never <code>null</code>, an array of the same length as the size of the input <code>diffs</code>.
|
||||
*/
|
||||
@Requires("diffs != null")
|
||||
private String[] reportDifferencesStringArray(final Map<String, ? extends CharSequence> diffs) {
|
||||
final String[] result = new String[diffs.size()];
|
||||
int i = 0;
|
||||
for (final Map.Entry<String, ? extends CharSequence> e : diffs.entrySet()) {
|
||||
result[i++] = capitalize(e.getKey()) + ": " + e.getValue();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the input string capitalizing the first letter.
|
||||
*
|
||||
* @param str the string to capitalize
|
||||
* @return never <code>null</code>.
|
||||
*/
|
||||
@Requires("str != null")
|
||||
private String capitalize(final String str) {
|
||||
if (str.isEmpty()) {
|
||||
return str;
|
||||
} else {
|
||||
return Character.toUpperCase(str.charAt(0)) + str.substring(1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the csv file to use.
|
||||
* <p/>
|
||||
* This is the the one specified by the user if any or a temporary file
|
||||
* that will be deleted as soon as the VM exists by default.
|
||||
*
|
||||
* @return never <code>null</code>.
|
||||
*/
|
||||
private File resolveCsvFile() {
|
||||
if (csvFile != null) {
|
||||
return csvFile;
|
||||
} else {
|
||||
try {
|
||||
final File result = File.createTempFile("AnalyzeCovariates", ".csv");
|
||||
result.deleteOnExit();
|
||||
return result;
|
||||
} catch (IOException e) {
|
||||
throw new UserException("Could not create temporary Csv file",e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Always return true, forcing the immediate termination of the travesal.
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public boolean isDone() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public None reduceInit() {
|
||||
return new None();
|
||||
}
|
||||
|
||||
/**
|
||||
* Is not supposed to ever be called, thus it always results in an exception.
|
||||
*
|
||||
* @throws IllegalStateException always.
|
||||
*/
|
||||
@Override
|
||||
public None reduce(None value, None sum) {
|
||||
throw new IllegalStateException("AnalyzeCovariates reduce method is not supposed to be invoked ever");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Is not supposed to ever be called, thus it always results in an exception.
|
||||
*
|
||||
* @throws IllegalStateException always.
|
||||
*/
|
||||
@Override
|
||||
public None map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
throw new IllegalStateException("AnalyzeCovariates map method is not supposed to be invoked ever");
|
||||
}
|
||||
|
||||
/**
|
||||
* Dummy map and reduce types for the {@link AnalyzeCovariates} tool that in fact does not do any traversal.
|
||||
*/
|
||||
protected static class None {
|
||||
private None() {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -92,18 +92,6 @@ public class BQSRGatherer extends Gatherer {
|
|||
|
||||
generalReport.calculateQuantizedQualities();
|
||||
|
||||
RecalibrationArgumentCollection RAC = generalReport.getRAC();
|
||||
if ( RAC.RECAL_PDF_FILE != null ) {
|
||||
RAC.RECAL_TABLE_FILE = output;
|
||||
if ( RAC.existingRecalibrationReport != null ) {
|
||||
final RecalibrationReport originalReport = new RecalibrationReport(RAC.existingRecalibrationReport);
|
||||
RecalUtils.generateRecalibrationPlot(RAC, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates());
|
||||
}
|
||||
else {
|
||||
RecalUtils.generateRecalibrationPlot(RAC, generalReport.getRecalibrationTables(), generalReport.getCovariates());
|
||||
}
|
||||
}
|
||||
|
||||
generalReport.output(outputFile);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,6 +61,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
|
@ -124,7 +125,7 @@ import java.util.List;
|
|||
* -R resources/Homo_sapiens_assembly18.fasta \
|
||||
* -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
|
||||
* -knownSites another/optional/setOfSitesToMask.vcf \
|
||||
* -o recal_data.grp
|
||||
* -o recal_data.table
|
||||
* </pre>
|
||||
*/
|
||||
|
||||
|
|
@ -366,9 +367,7 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
|
|||
}
|
||||
|
||||
protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) {
|
||||
final byte[] readBases = read.getReadBases();
|
||||
final int[] indel = new int[readBases.length];
|
||||
Arrays.fill(indel, 0);
|
||||
final int[] indel = new int[read.getReadBases().length];
|
||||
int readPos = 0;
|
||||
for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
|
||||
final int elementLength = ce.getLength();
|
||||
|
|
@ -383,21 +382,19 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
|
|||
}
|
||||
case D:
|
||||
{
|
||||
final int index = ( read.getReadNegativeStrandFlag() ? readPos : ( readPos > 0 ? readPos - 1 : readPos ) );
|
||||
indel[index] = ( mode.equals(EventType.BASE_DELETION) ? 1 : 0 );
|
||||
final int index = ( read.getReadNegativeStrandFlag() ? readPos : readPos - 1 );
|
||||
updateIndel(indel, index, mode, EventType.BASE_DELETION);
|
||||
break;
|
||||
}
|
||||
case I:
|
||||
{
|
||||
final boolean forwardStrandRead = !read.getReadNegativeStrandFlag();
|
||||
if( forwardStrandRead ) {
|
||||
indel[(readPos > 0 ? readPos - 1 : readPos)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
|
||||
}
|
||||
for (int iii = 0; iii < elementLength; iii++) {
|
||||
readPos++;
|
||||
updateIndel(indel, readPos - 1, mode, EventType.BASE_INSERTION);
|
||||
}
|
||||
readPos += elementLength;
|
||||
if( !forwardStrandRead ) {
|
||||
indel[(readPos < indel.length ? readPos : readPos - 1)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
|
||||
updateIndel(indel, readPos, mode, EventType.BASE_INSERTION);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
@ -412,6 +409,12 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
|
|||
return indel;
|
||||
}
|
||||
|
||||
private static void updateIndel(final int[] indel, final int index, final EventType mode, final EventType requiredMode) {
|
||||
if ( mode == requiredMode && index >= 0 && index < indel.length )
|
||||
// protect ourselves from events at the start or end of the read (1D3M or 3M1D)
|
||||
indel[index] = 1;
|
||||
}
|
||||
|
||||
protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) {
|
||||
if(errorArray.length != baqArray.length ) {
|
||||
throw new ReviewedStingException("Array length mismatch detected. Malformed read?");
|
||||
|
|
@ -514,28 +517,13 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
|
|||
generateReport();
|
||||
logger.info("...done!");
|
||||
|
||||
if ( RAC.RECAL_PDF_FILE != null ) {
|
||||
logger.info("Generating recalibration plots...");
|
||||
generatePlots();
|
||||
}
|
||||
|
||||
logger.info("Processed: " + result + " reads");
|
||||
logger.info("BaseRecalibrator was able to recalibrate " + result + " reads");
|
||||
}
|
||||
|
||||
private RecalibrationTables getRecalibrationTable() {
|
||||
return recalibrationEngine.getFinalRecalibrationTables();
|
||||
}
|
||||
|
||||
private void generatePlots() {
|
||||
File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE;
|
||||
if (recalFile != null) {
|
||||
RecalibrationReport report = new RecalibrationReport(recalFile);
|
||||
RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), getRecalibrationTable(), requestedCovariates);
|
||||
}
|
||||
else
|
||||
RecalUtils.generateRecalibrationPlot(RAC, getRecalibrationTable(), requestedCovariates);
|
||||
}
|
||||
|
||||
/**
|
||||
* go through the quality score table and use the # observations and the empirical quality score
|
||||
* to build a quality score histogram for quantization. Then use the QuantizeQual algorithm to
|
||||
|
|
|
|||
|
|
@ -46,15 +46,17 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -65,7 +67,7 @@ import java.util.List;
|
|||
* This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
|
||||
*/
|
||||
|
||||
public class RecalibrationArgumentCollection {
|
||||
public class RecalibrationArgumentCollection implements Cloneable {
|
||||
|
||||
/**
|
||||
* This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
|
||||
|
|
@ -87,21 +89,6 @@ public class RecalibrationArgumentCollection {
|
|||
public File RECAL_TABLE_FILE = null;
|
||||
public PrintStream RECAL_TABLE;
|
||||
|
||||
/**
|
||||
* If not provided, then no plots will be generated (useful for queue scatter/gathering).
|
||||
* However, we *highly* recommend that users generate these plots whenever possible for QC checking.
|
||||
*/
|
||||
@Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false, defaultToStdout = false)
|
||||
public File RECAL_PDF_FILE = null;
|
||||
|
||||
/**
|
||||
* If not provided, then a temporary file is created and then deleted upon completion.
|
||||
* For advanced users only.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false)
|
||||
public File RECAL_CSV_FILE = null;
|
||||
|
||||
/**
|
||||
* Note that the --list argument requires a fully resolved and correct command-line to work.
|
||||
*/
|
||||
|
|
@ -282,11 +269,147 @@ public class RecalibrationArgumentCollection {
|
|||
argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
|
||||
argumentsTable.addRowID("recalibration_report", true);
|
||||
argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath());
|
||||
argumentsTable.addRowID("plot_pdf_file", true);
|
||||
argumentsTable.set("plot_pdf_file", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RECAL_PDF_FILE == null ? "null" : RECAL_PDF_FILE.getAbsolutePath());
|
||||
argumentsTable.addRowID("binary_tag_name", true);
|
||||
argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME);
|
||||
return argumentsTable;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map with the arguments that differ between this an
|
||||
* another {@link RecalibrationArgumentCollection} instance.
|
||||
* <p/>
|
||||
* The key is the name of that argument in the report file. The value is a message
|
||||
* that explains the difference to the end user.
|
||||
* <p/>
|
||||
* Thus, a empty map indicates that there is no differences between both argument collection that
|
||||
* is relevant to report comparison.
|
||||
* <p/>
|
||||
* This method should not throw any exception.
|
||||
*
|
||||
* @param other the argument-collection to compare against.
|
||||
* @param thisRole the name used to refer to this RAC report that makes sense to the end user.
|
||||
* @param otherRole the name used to refer to the other RAC report that makes sense to the end user.
|
||||
*
|
||||
* @return never <code>null</code>, but a zero-size collection if there are no differences.
|
||||
*/
|
||||
@Requires("other != null && thisRole != null && otherRole != null && !thisRole.equalsIgnoreCase(otherRole)")
|
||||
Map<String,? extends CharSequence> compareReportArguments(final RecalibrationArgumentCollection other,final String thisRole, final String otherRole) {
|
||||
final Map<String,String> result = new LinkedHashMap<>(15);
|
||||
compareRequestedCovariates(result, other, thisRole, otherRole);
|
||||
compareSimpleReportArgument(result,"no_standard_covs", DO_NOT_USE_STANDARD_COVARIATES, other.DO_NOT_USE_STANDARD_COVARIATES, thisRole, otherRole);
|
||||
compareSimpleReportArgument(result,"run_without_dbsnp",RUN_WITHOUT_DBSNP,other.RUN_WITHOUT_DBSNP,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"solid_recal_mode", SOLID_RECAL_MODE, other.SOLID_RECAL_MODE,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"solid_nocall_strategy", SOLID_NOCALL_STRATEGY, other.SOLID_NOCALL_STRATEGY,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"mismatches_context_size", MISMATCHES_CONTEXT_SIZE,other.MISMATCHES_CONTEXT_SIZE,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"mismatches_default_quality", MISMATCHES_DEFAULT_QUALITY, other.MISMATCHES_DEFAULT_QUALITY,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"deletions_default_quality", DELETIONS_DEFAULT_QUALITY, other.DELETIONS_DEFAULT_QUALITY,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"insertions_default_quality", INSERTIONS_DEFAULT_QUALITY, other.INSERTIONS_DEFAULT_QUALITY,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"maximum_cycle_value", MAXIMUM_CYCLE_VALUE, other.MAXIMUM_CYCLE_VALUE,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"low_quality_tail", LOW_QUAL_TAIL, other.LOW_QUAL_TAIL,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"default_platform", DEFAULT_PLATFORM, other.DEFAULT_PLATFORM,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"force_platform", FORCE_PLATFORM, other.FORCE_PLATFORM,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"quantizing_levels", QUANTIZING_LEVELS, other.QUANTIZING_LEVELS,thisRole,otherRole);
|
||||
compareSimpleReportArgument(result,"binary_tag_name", BINARY_TAG_NAME, other.BINARY_TAG_NAME,thisRole,otherRole);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Compares the covariate report lists.
|
||||
*
|
||||
* @param diffs map where to annotate the difference.
|
||||
* @param other the argument collection to compare against.
|
||||
* @param thisRole the name for this argument collection that makes sense to the user.
|
||||
* @param otherRole the name for the other argument collection that makes sense to the end user.
|
||||
*
|
||||
* @return <code>true</code> if a difference was found.
|
||||
*/
|
||||
@Requires("diffs != null && other != null && thisRole != null && otherRole != null")
|
||||
private boolean compareRequestedCovariates(final Map<String,String> diffs,
|
||||
final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) {
|
||||
|
||||
final Set<String> beforeNames = new HashSet<>(this.COVARIATES.length);
|
||||
final Set<String> afterNames = new HashSet<>(other.COVARIATES.length);
|
||||
Utils.addAll(beforeNames, this.COVARIATES);
|
||||
Utils.addAll(afterNames,other.COVARIATES);
|
||||
final Set<String> intersect = new HashSet<>(Math.min(beforeNames.size(),afterNames.size()));
|
||||
intersect.addAll(beforeNames);
|
||||
intersect.retainAll(afterNames);
|
||||
|
||||
String diffMessage = null;
|
||||
if (intersect.size() == 0) { // In practice this is not possible due to required covariates but...
|
||||
diffMessage = String.format("There are no common covariates between '%s' and '%s'"
|
||||
+ " recalibrator reports. Covariates in '%s': {%s}. Covariates in '%s': {%s}.",thisRole,otherRole,
|
||||
thisRole,Utils.join(", ",this.COVARIATES),
|
||||
otherRole,Utils.join(",",other.COVARIATES));
|
||||
} else if (intersect.size() != beforeNames.size() || intersect.size() != afterNames.size()) {
|
||||
beforeNames.removeAll(intersect);
|
||||
afterNames.removeAll(intersect);
|
||||
diffMessage = String.format("There are differences in the set of covariates requested in the"
|
||||
+ " '%s' and '%s' recalibrator reports. "
|
||||
+ " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.",thisRole,otherRole,
|
||||
thisRole,Utils.join(", ",beforeNames),
|
||||
otherRole,Utils.join(", ",afterNames));
|
||||
}
|
||||
if (diffMessage != null) {
|
||||
diffs.put("covariate",diffMessage);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Annotates a map with any difference encountered in a simple value report argument that differs between this an
|
||||
* another {@link RecalibrationArgumentCollection} instance.
|
||||
* <p/>
|
||||
* The key of the new entry would be the name of that argument in the report file. The value is a message
|
||||
* that explains the difference to the end user.
|
||||
* <p/>
|
||||
*
|
||||
* <p/>
|
||||
* This method should not return any exception.
|
||||
*
|
||||
* @param diffs where to annotate the differences.
|
||||
* @param name the name of the report argument to compare.
|
||||
* @param thisValue this argument collection value for that argument.
|
||||
* @param otherValue the other collection value for that argument.
|
||||
* @param thisRole the name used to refer to this RAC report that makes sense to the end user.
|
||||
* @param otherRole the name used to refer to the other RAC report that makes sense to the end user.
|
||||
*
|
||||
* @type T the argument Object value type.
|
||||
*
|
||||
* @return <code>true</code> if a difference has been spotted, thus <code>diff</code> has been modified.
|
||||
*/
|
||||
private <T> boolean compareSimpleReportArgument(final Map<String,String> diffs,
|
||||
final String name, final T thisValue, final T otherValue, final String thisRole, final String otherRole) {
|
||||
if (thisValue == null && otherValue == null) {
|
||||
return false;
|
||||
} else if (thisValue != null && thisValue.equals(otherValue)) {
|
||||
return false;
|
||||
} else {
|
||||
diffs.put(name,
|
||||
String.format("differences between '%s' {%s} and '%s' {%s}.",
|
||||
thisRole,thisValue == null ? "" : thisValue,
|
||||
otherRole,otherValue == null ? "" : otherValue));
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a shallow copy of this argument collection.
|
||||
*
|
||||
* @return never <code>null</code>.
|
||||
*/
|
||||
@Override
|
||||
public RecalibrationArgumentCollection clone() {
|
||||
try {
|
||||
return (RecalibrationArgumentCollection) super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new StingException("Unreachable code clone not supported thrown when the class "
|
||||
+ this.getClass().getName() + " is cloneable ",e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -207,7 +207,7 @@ public class HeaderElement {
|
|||
public void removeInsertionToTheRight() {
|
||||
this.insertionsToTheRight--;
|
||||
if (insertionsToTheRight < 0)
|
||||
throw new ReviewedStingException("Removed too many insertions, header is now negative!");
|
||||
throw new ReviewedStingException("Removed too many insertions, header is now negative at position " + location);
|
||||
}
|
||||
|
||||
public boolean hasInsertionToTheRight() {
|
||||
|
|
|
|||
|
|
@ -273,8 +273,9 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
|
||||
int nCompressedReads = 0;
|
||||
|
||||
Object2LongOpenHashMap<String> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
|
||||
private static int READ_NAME_HASH_DEFAULT_SIZE = 1000;
|
||||
Long nextReadNumber = 1L; // The next number to use for the compressed read name.
|
||||
Object2LongOpenHashMap<String> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
|
||||
|
||||
ObjectSortedSet<GenomeLoc> intervalList;
|
||||
|
||||
|
|
@ -313,28 +314,37 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
knownSnpPositions = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
GenomeAnalysisEngine toolkit = getToolkit();
|
||||
readNameHash = new Object2LongOpenHashMap<String>(100000); // prepare the read name hash to keep track of what reads have had their read names compressed
|
||||
this.resetReadNameHash(); // prepare the read name hash to keep track of what reads have had their read names compressed
|
||||
intervalList = new ObjectAVLTreeSet<GenomeLoc>(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
|
||||
|
||||
if (toolkit.getIntervals() != null)
|
||||
intervalList.addAll(toolkit.getIntervals());
|
||||
|
||||
final boolean preSorted = true;
|
||||
final boolean indexOnTheFly = true;
|
||||
final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
|
||||
if (nwayout) {
|
||||
SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME);
|
||||
writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, preSorted, indexOnTheFly, NO_PG_TAG, programRecord, true);
|
||||
writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, false, indexOnTheFly, NO_PG_TAG, programRecord, true);
|
||||
}
|
||||
else {
|
||||
writerToUse = out;
|
||||
out.setPresorted(false);
|
||||
if (!NO_PG_TAG) {
|
||||
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME);
|
||||
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Initializer for {@link #readNameHash}. */
|
||||
private void resetReadNameHash() {
|
||||
// If the hash grows large, subsequent clear operations can be very expensive, so trim the hash down if it grows beyond its default.
|
||||
if (readNameHash == null || readNameHash.size() > READ_NAME_HASH_DEFAULT_SIZE) {
|
||||
readNameHash = new Object2LongOpenHashMap<String>(READ_NAME_HASH_DEFAULT_SIZE);
|
||||
} else {
|
||||
readNameHash.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes in a read and prepares it for the SlidingWindow machinery by performing the
|
||||
* following optional clipping operations:
|
||||
|
|
@ -471,7 +481,7 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
// stash.compress(), the readNameHash can be cleared after the for() loop above.
|
||||
// The advantage of clearing the hash is that otherwise it holds all reads that have been encountered,
|
||||
// which can use a lot of memory and cause RR to slow to a crawl and/or run out of memory.
|
||||
readNameHash.clear();
|
||||
this.resetReadNameHash();
|
||||
|
||||
}
|
||||
} else
|
||||
|
|
|
|||
|
|
@ -878,7 +878,7 @@ public class SlidingWindow {
|
|||
int stop = region.getStop() - windowHeaderStart;
|
||||
|
||||
// make sure the bitset is complete given the region (it might not be in multi-sample mode)
|
||||
if ( region.getStop() > markedSites.getStartLocation() + markedSites.getVariantSiteBitSet().length )
|
||||
if ( region.getStop() > markedSites.getStartLocation() + markedSites.getVariantSiteBitSet().length - 1 )
|
||||
markSites(region.getStop());
|
||||
|
||||
CloseVariantRegionResult closeVariantRegionResult = closeVariantRegion(start, stop, knownSnpPositions);
|
||||
|
|
@ -1199,7 +1199,7 @@ public class SlidingWindow {
|
|||
}
|
||||
|
||||
// Special case for leading insertions before the beginning of the sliding read
|
||||
if ( ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == headerStart || headerStart < 0) ) {
|
||||
if ( (readStart == headerStart || headerStart < 0) && ReadUtils.readStartsWithInsertion(read.getCigar(), false) != null ) {
|
||||
// create a new first element to the window header with no bases added
|
||||
header.addFirst(new HeaderElement(readStart - 1));
|
||||
// this allows the first element (I) to look at locationIndex - 1 when we update the header and do the right thing
|
||||
|
|
|
|||
|
|
@ -63,6 +63,10 @@ abstract class AbstractStratification {
|
|||
private Map<CallableStatus, Integer> statusTally = null;
|
||||
protected ThresHolder thresholds;
|
||||
|
||||
public AbstractStratification(ThresHolder thresholds) {
|
||||
this.thresholds = thresholds;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the average "good" coverage of this sample. Good means "passes the base and
|
||||
* mapping quality requirements.
|
||||
|
|
@ -116,11 +120,11 @@ abstract class AbstractStratification {
|
|||
*
|
||||
* @return the callable status(es) for the whole object
|
||||
*/
|
||||
public abstract Iterable<CallableStatus> callableStatuses();
|
||||
public abstract List<CallableStatus> callableStatuses();
|
||||
|
||||
|
||||
/**
|
||||
* Tally up all the callable status of all the loci in this sample.
|
||||
* Tally up all the callable status of all elements of the stratification.
|
||||
*
|
||||
* @return a map of callable status and counts
|
||||
*/
|
||||
|
|
@ -136,10 +140,10 @@ abstract class AbstractStratification {
|
|||
return statusTally;
|
||||
}
|
||||
|
||||
public static List<CallableStatus> queryStatus(List<Metric> statList, AbstractStratification stratification) {
|
||||
public List<CallableStatus> queryStatus(List<Metric> statList) {
|
||||
List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
for (Metric stat : statList) {
|
||||
final CallableStatus status = stat.status(stratification);
|
||||
final CallableStatus status = stat.status(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ import org.broadinstitute.sting.commandline.Output;
|
|||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
|
@ -65,6 +66,7 @@ import org.broadinstitute.variant.variantcontext.*;
|
|||
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -109,9 +111,13 @@ import java.util.*;
|
|||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
|
||||
@By(value = DataSource.READS)
|
||||
@PartitionBy(PartitionType.INTERVAL)
|
||||
@Downsample(by = DownsampleType.NONE)
|
||||
public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||
|
||||
private static final String AVG_INTERVAL_DP_KEY = "IDP";
|
||||
private static final String LOW_COVERAGE_LOCI = "LL";
|
||||
private static final String ZERO_COVERAGE_LOCI = "ZL";
|
||||
|
||||
|
||||
@Output(doc = "File to which interval statistics should be written")
|
||||
private VariantContextWriter vcfWriter = null;
|
||||
|
|
@ -119,13 +125,12 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
@ArgumentCollection
|
||||
private ThresHolder thresholds = new ThresHolder();
|
||||
|
||||
private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics
|
||||
private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics
|
||||
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
|
||||
private Set<String> samples = null; // all the samples being processed
|
||||
private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
|
||||
private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
|
||||
|
||||
private static final int INITIAL_HASH_SIZE = 500000;
|
||||
private static final int INITIAL_HASH_SIZE = 50; // enough room for potential overlapping intervals plus recently finished intervals
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
|
|
@ -134,7 +139,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty())
|
||||
throw new UserException("This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead.");
|
||||
|
||||
intervalMap = new HashMap<GenomeLoc, IntervalStratification>(INITIAL_HASH_SIZE);
|
||||
intervalMap = new LinkedHashMap<GenomeLoc, IntervalStratification>(INITIAL_HASH_SIZE);
|
||||
intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator());
|
||||
|
||||
// get all of the unique sample names for the VCF Header
|
||||
|
|
@ -146,13 +151,13 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
public Long map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||
GenomeLoc refLocus = ref.getLocus();
|
||||
|
||||
// process and remove any intervals in the map that are don't overlap the current locus anymore
|
||||
// and add all new intervals that may overlap this reference locus
|
||||
outputFinishedIntervals(refLocus, ref.getBase());
|
||||
addNewOverlappingIntervals(refLocus);
|
||||
outputFinishedIntervals(refLocus, ref.getBase());
|
||||
|
||||
// at this point, all intervals in intervalMap overlap with this locus, so update all of them
|
||||
for (IntervalStratification intervalStratification : intervalMap.values())
|
||||
|
|
@ -184,7 +189,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
* @param result number of loci processed by the walker
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(Long result) {
|
||||
public void onTraversalDone(final Long result) {
|
||||
for (GenomeLoc interval : intervalMap.keySet())
|
||||
outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);
|
||||
|
||||
|
|
@ -194,6 +199,10 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
intervalListIterator.next();
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
|
||||
if (thresholds.missingTargets != null) {
|
||||
thresholds.missingTargets.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -203,24 +212,21 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
* @param refBase the reference allele
|
||||
*/
|
||||
private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
|
||||
GenomeLoc interval = intervalListIterator.peek();
|
||||
|
||||
// output empty statistics for uncovered intervals
|
||||
while (interval != null && interval.isBefore(refLocus)) {
|
||||
final IntervalStratification stats = intervalMap.get(interval);
|
||||
outputStatsToVCF(stats != null ? stats : createIntervalStatistic(interval), UNCOVERED_ALLELE);
|
||||
if (stats != null) intervalMap.remove(interval);
|
||||
intervalListIterator.next();
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
|
||||
// remove any potential leftover interval in intervalMap (this will only happen when we have overlapping intervals)
|
||||
// output any intervals that were finished
|
||||
final List<GenomeLoc> toRemove = new LinkedList<GenomeLoc>();
|
||||
for (GenomeLoc key : intervalMap.keySet()) {
|
||||
if (key.isBefore(refLocus)) {
|
||||
outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true));
|
||||
intervalMap.remove(key);
|
||||
final IntervalStratification intervalStats = intervalMap.get(key);
|
||||
outputStatsToVCF(intervalStats, Allele.create(refBase, true));
|
||||
if (hasMissingLoci(intervalStats)) {
|
||||
outputMissingInterval(intervalStats);
|
||||
}
|
||||
toRemove.add(key);
|
||||
}
|
||||
}
|
||||
for (GenomeLoc key : toRemove) {
|
||||
intervalMap.remove(key);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -228,7 +234,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
*
|
||||
* @param refLocus the current reference locus
|
||||
*/
|
||||
private void addNewOverlappingIntervals(GenomeLoc refLocus) {
|
||||
private void addNewOverlappingIntervals(final GenomeLoc refLocus) {
|
||||
GenomeLoc interval = intervalListIterator.peek();
|
||||
while (interval != null && !interval.isPast(refLocus)) {
|
||||
intervalMap.put(interval, createIntervalStatistic(interval));
|
||||
|
|
@ -243,14 +249,24 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
* @param stats The statistics of the interval
|
||||
* @param refAllele the reference allele
|
||||
*/
|
||||
private void outputStatsToVCF(IntervalStratification stats, Allele refAllele) {
|
||||
private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
|
||||
GenomeLoc interval = stats.getInterval();
|
||||
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
final Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
for (String sample : samples) {
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(sample);
|
||||
|
||||
SampleStratification sampleStat = stats.getSampleStatistics(sample);
|
||||
gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size()));
|
||||
gb.attribute(LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci());
|
||||
gb.attribute(ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci());
|
||||
gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));
|
||||
|
||||
genotypes.add(gb.make());
|
||||
}
|
||||
alleles.add(refAllele);
|
||||
alleles.add(SYMBOLIC_ALLELE);
|
||||
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles);
|
||||
|
|
@ -262,21 +278,56 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
|
||||
|
||||
vcb = vcb.attributes(attributes);
|
||||
for (String sample : samples) {
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(sample);
|
||||
|
||||
SampleStratification sampleStat = stats.getSampleStatistics(sample);
|
||||
gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size()));
|
||||
|
||||
gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));
|
||||
|
||||
genotypes.add(gb.make());
|
||||
}
|
||||
vcb = vcb.genotypes(genotypes);
|
||||
|
||||
vcfWriter.add(vcb.make());
|
||||
}
|
||||
|
||||
private boolean hasMissingStatuses(AbstractStratification stats) {
|
||||
return !stats.callableStatuses().isEmpty();
|
||||
}
|
||||
|
||||
private boolean hasMissingLoci(final IntervalStratification stats) {
|
||||
return thresholds.missingTargets != null && hasMissingStatuses(stats);
|
||||
}
|
||||
|
||||
private void outputMissingInterval(final IntervalStratification stats) {
|
||||
final GenomeLoc interval = stats.getInterval();
|
||||
final boolean missing[] = new boolean[interval.size()];
|
||||
Arrays.fill(missing, true);
|
||||
for (AbstractStratification sample : stats.getElements()) {
|
||||
if (hasMissingStatuses(sample)) {
|
||||
int pos = 0;
|
||||
for (AbstractStratification locus : sample.getElements()) {
|
||||
if (locus.callableStatuses().isEmpty()) {
|
||||
missing[pos] = false;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
}
|
||||
int start = -1;
|
||||
boolean insideMissing = false;
|
||||
for (int i = 0; i < missing.length; i++) {
|
||||
if (missing[i] && !insideMissing) {
|
||||
start = interval.getStart() + i;
|
||||
insideMissing = true;
|
||||
} else if (!missing[i] && insideMissing) {
|
||||
final int stop = interval.getStart() + i - 1;
|
||||
outputMissingInterval(interval.getContig(), start, stop);
|
||||
insideMissing = false;
|
||||
}
|
||||
}
|
||||
if (insideMissing) {
|
||||
outputMissingInterval(interval.getContig(), start, interval.getStop());
|
||||
}
|
||||
}
|
||||
|
||||
private void outputMissingInterval(final String contig, final int start, final int stop) {
|
||||
final PrintStream out = thresholds.missingTargets;
|
||||
out.println(String.format("%s:%d-%d", contig, start, stop));
|
||||
}
|
||||
|
||||
/**
|
||||
* Function that processes a set of statuses into strings
|
||||
*
|
||||
|
|
@ -345,6 +396,8 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
// FORMAT fields for each genotype
|
||||
headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
|
||||
headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average sample depth across the interval. Sum of the sample specific depth in all loci divided by interval size."));
|
||||
headerLines.add(new VCFFormatHeaderLine(LOW_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with low coverage (below the minimum coverage) but not zero."));
|
||||
headerLines.add(new VCFFormatHeaderLine(ZERO_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with zero coverage."));
|
||||
|
||||
// FILTER fields
|
||||
for (CallableStatus stat : CallableStatus.values())
|
||||
|
|
|
|||
|
|
@ -56,11 +56,11 @@ import java.util.*;
|
|||
final class IntervalStratification extends AbstractStratification {
|
||||
private final Map<String, AbstractStratification> samples;
|
||||
private final GenomeLoc interval;
|
||||
private final ThresHolder thresholds;
|
||||
private List<CallableStatus> callableStatuses;
|
||||
|
||||
public IntervalStratification(Set<String> samples, GenomeLoc interval, ThresHolder thresholds) {
|
||||
super(thresholds);
|
||||
this.interval = interval;
|
||||
this.thresholds = thresholds;
|
||||
this.samples = new HashMap<String, AbstractStratification>(samples.size());
|
||||
for (String sample : samples)
|
||||
this.samples.put(sample, new SampleStratification(interval, thresholds));
|
||||
|
|
@ -114,7 +114,13 @@ final class IntervalStratification extends AbstractStratification {
|
|||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public Iterable<CallableStatus> callableStatuses() {
|
||||
public List<CallableStatus> callableStatuses() {
|
||||
if (callableStatuses == null)
|
||||
callableStatuses = calculateStatus();
|
||||
return callableStatuses;
|
||||
}
|
||||
|
||||
private List<CallableStatus> calculateStatus() {
|
||||
final List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
|
||||
// check if any of the votes pass the threshold
|
||||
|
|
@ -125,7 +131,7 @@ final class IntervalStratification extends AbstractStratification {
|
|||
}
|
||||
}
|
||||
|
||||
output.addAll(queryStatus(thresholds.intervalMetricList, this));
|
||||
output.addAll(queryStatus(thresholds.intervalMetricList));
|
||||
|
||||
return output;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,22 +46,20 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
final class LocusStratification extends AbstractStratification {
|
||||
private long coverage;
|
||||
private long rawCoverage;
|
||||
private final List<Metric> locusStatisticsList;
|
||||
|
||||
public LocusStratification(ThresHolder thresholds) {
|
||||
this(0,0,thresholds);
|
||||
}
|
||||
|
||||
protected LocusStratification(int coverage, int rawCoverage, ThresHolder thresholds) {
|
||||
super(thresholds);
|
||||
this.coverage = coverage;
|
||||
this.rawCoverage = rawCoverage;
|
||||
this.locusStatisticsList = thresholds.locusMetricList;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -79,14 +77,7 @@ final class LocusStratification extends AbstractStratification {
|
|||
* @return a set of all statuses that apply
|
||||
*/
|
||||
public List<CallableStatus> callableStatuses() {
|
||||
List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
for (Metric stats : locusStatisticsList) {
|
||||
CallableStatus status = stats.status(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
}
|
||||
return output;
|
||||
return queryStatus(thresholds.locusMetricList);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -58,6 +58,6 @@ final class PluginUtils {
|
|||
final Map<CallableStatus, Integer> totals = sampleStratification.getStatusTally();
|
||||
final int size = sampleStratification.getIntervalSize();
|
||||
final int statusCount = totals.containsKey(CALL) ? totals.get(CALL) : 0;
|
||||
return ( (double) statusCount / size) >= threshold ? CALL: null;
|
||||
return ( (double) statusCount / size) > threshold ? CALL: null;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,15 +61,14 @@ import java.util.List;
|
|||
final class SampleStratification extends AbstractStratification {
|
||||
private final GenomeLoc interval;
|
||||
private final ArrayList<AbstractStratification> loci;
|
||||
private final ThresHolder thresholds;
|
||||
|
||||
private int nReads = -1;
|
||||
private int nBadMates = -1;
|
||||
|
||||
public SampleStratification(final GenomeLoc interval, final ThresHolder thresholds) {
|
||||
super(thresholds);
|
||||
this.interval = interval;
|
||||
this.loci = new ArrayList<AbstractStratification>(interval.size());
|
||||
this.thresholds = thresholds;
|
||||
nReads = 0;
|
||||
nBadMates = 0;
|
||||
|
||||
|
|
@ -118,10 +117,10 @@ final class SampleStratification extends AbstractStratification {
|
|||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public Iterable<CallableStatus> callableStatuses() {
|
||||
public List<CallableStatus> callableStatuses() {
|
||||
final List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
|
||||
// get the tally of all the locus callable statuses
|
||||
// get the sample statuses of all the Loci Metrics
|
||||
for (Metric locusStat : thresholds.locusMetricList) {
|
||||
final CallableStatus status = ((LocusMetric) locusStat).sampleStatus(this);
|
||||
if (status != null) {
|
||||
|
|
@ -130,12 +129,7 @@ final class SampleStratification extends AbstractStratification {
|
|||
}
|
||||
|
||||
// get the sample specific statistics statuses
|
||||
for (Metric sampleStat : thresholds.sampleMetricList) {
|
||||
final CallableStatus status = sampleStat.status(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
}
|
||||
output.addAll(queryStatus(thresholds.sampleMetricList));
|
||||
|
||||
// special case, if there are no reads, then there is no sense reporting coverage gaps.
|
||||
if (output.contains(CallableStatus.NO_READS) && output.contains(CallableStatus.COVERAGE_GAPS))
|
||||
|
|
@ -159,4 +153,17 @@ final class SampleStratification extends AbstractStratification {
|
|||
read.setTemporaryAttribute("seen", true);
|
||||
}
|
||||
}
|
||||
|
||||
public int getNLowCoveredLoci() {
|
||||
return getCallableStatusCount(CallableStatus.LOW_COVERAGE);
|
||||
}
|
||||
|
||||
public int getNUncoveredLoci() {
|
||||
return getCallableStatusCount(CallableStatus.COVERAGE_GAPS);
|
||||
}
|
||||
|
||||
private int getCallableStatusCount(CallableStatus status) {
|
||||
final Integer x = getStatusTally().get(status);
|
||||
return x == null ? 0 : x;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,7 +47,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -114,6 +116,9 @@ final class ThresHolder {
|
|||
@Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false)
|
||||
public double qualityStatusThreshold = 0.50;
|
||||
|
||||
@Output(fullName = "missing_intervals", shortName = "missing", defaultToStdout = false, doc ="Produces a file with the intervals that don't pass filters", required = false)
|
||||
public PrintStream missingTargets = null;
|
||||
|
||||
public final List<Metric> locusMetricList = new LinkedList<Metric>();
|
||||
public final List<Metric> sampleMetricList = new LinkedList<Metric>();
|
||||
public final List<Metric> intervalMetricList = new LinkedList<Metric>();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,110 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.missing;
|
||||
|
||||
/**
|
||||
* Short one line description of the walker.
|
||||
* <p/>
|
||||
* <p>
|
||||
* [Long description of the walker]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <p/>
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* [Description of the Input]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* [Description of the Output]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T [walker name]
|
||||
* </pre>
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 5/1/13
|
||||
*/
|
||||
final class Metrics {
|
||||
private double gccontent;
|
||||
private double baseQual;
|
||||
private double mapQual;
|
||||
private int reads;
|
||||
private int refs;
|
||||
|
||||
void reads(int reads) {this.reads = reads;}
|
||||
void refs(int refs) {this.refs = refs;}
|
||||
|
||||
void gccontent(double gccontent) {this.gccontent = gccontent;}
|
||||
void baseQual(double baseQual) {this.baseQual = baseQual;}
|
||||
void mapQual(double mapQual) {this.mapQual = mapQual;}
|
||||
|
||||
double gccontent() {return refs > 0 ? gccontent/refs : 0.0;}
|
||||
double baseQual() {return reads > 0 ? baseQual/reads : 0.0;}
|
||||
double mapQual() {return reads > 0 ? mapQual/reads : 0.0;}
|
||||
|
||||
/**
|
||||
* Combines two metrics
|
||||
*
|
||||
* @param value the other metric to combine
|
||||
* @return itself, for simple reduce
|
||||
*/
|
||||
public Metrics combine(Metrics value) {
|
||||
this.gccontent += value.gccontent;
|
||||
this.baseQual += value.baseQual;
|
||||
this.mapQual += value.mapQual;
|
||||
this.reads += value.reads;
|
||||
this.refs += value.refs;
|
||||
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,228 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.missing;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.NanoSchedulable;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Walks along reference and calculates a few metrics for each interval.
|
||||
*
|
||||
* Metrics:
|
||||
* <ul>
|
||||
* <li>Average Base Quality</li>
|
||||
* <li>Average Mapping Quality</li>
|
||||
* <li>GC Content</li>
|
||||
* <li>Position in the target</li>
|
||||
* <li>Coding Sequence / Intron</li>
|
||||
* <li>Length of the uncovered area</li>
|
||||
* </ul>
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* A reference file (for GC content), the input bam file (for base and mapping quality calculation), the missing intervals (in the -L), the baits/targets used to sequence (in the -targets) and a bed file with the coding sequence intervals of the genome (in the -cds)
|
||||
* </p>
|
||||
*
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* GC content calculations per interval.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Example</h3>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -T QualifyMissingIntervals \
|
||||
* -R ref.fasta \
|
||||
* -I input.bam \
|
||||
* -o output.grp \
|
||||
* -L input.intervals \
|
||||
* -cds cds.intervals \
|
||||
* -targets targets.intervals
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
|
||||
@By(DataSource.REFERENCE)
|
||||
public final class QualifyMissingIntervals extends LocusWalker<Metrics, Metrics> implements NanoSchedulable {
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
||||
@Argument(shortName = "targets", required = true)
|
||||
public File targetsFile;
|
||||
|
||||
@Argument(shortName = "cds", required = false)
|
||||
public File cdsFile = null;
|
||||
|
||||
GATKReport simpleReport;
|
||||
GenomeLocSortedSet target;
|
||||
GenomeLocSortedSet cds;
|
||||
|
||||
public boolean isReduceByInterval() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void initialize() {
|
||||
simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "IN", "GC", "BQ", "MQ", "TP", "CD", "LN");
|
||||
final GenomeLocParser parser = getToolkit().getGenomeLocParser();
|
||||
target = new GenomeLocSortedSet(parser);
|
||||
cds = new GenomeLocSortedSet(parser);
|
||||
parseFile(targetsFile, target, parser);
|
||||
if (cdsFile != null)
|
||||
parseFile(cdsFile, cds, parser);
|
||||
}
|
||||
|
||||
public Metrics reduceInit() {
|
||||
return new Metrics();
|
||||
}
|
||||
|
||||
public Metrics map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if (tracker == null)
|
||||
return null;
|
||||
|
||||
final Metrics metrics = new Metrics();
|
||||
final byte baseIndex = ref.getBase();
|
||||
final ReadBackedPileup pileup = context.getBasePileup();
|
||||
final int nBases = pileup.getNumberOfElements();
|
||||
|
||||
double baseQual = 0.0;
|
||||
for (byte qual : pileup.getQuals()) {
|
||||
baseQual += qual;
|
||||
}
|
||||
double mapQual = 0.0;
|
||||
for (byte qual : pileup.getMappingQuals()) {
|
||||
mapQual += qual;
|
||||
}
|
||||
|
||||
metrics.baseQual(baseQual);
|
||||
metrics.mapQual(mapQual);
|
||||
metrics.gccontent(baseIndex == 'C' || baseIndex == 'G' ? 1.0 : 0.0);
|
||||
metrics.reads(nBases);
|
||||
metrics.refs(1);
|
||||
|
||||
return metrics;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Metrics reduce(Metrics value, Metrics sum) {
|
||||
return sum.combine(value);
|
||||
}
|
||||
|
||||
public void onTraversalDone(List<Pair<GenomeLoc, Metrics>> results) {
|
||||
for (Pair<GenomeLoc, Metrics> r : results) {
|
||||
GenomeLoc interval = r.getFirst();
|
||||
Metrics metrics = r.getSecond();
|
||||
simpleReport.addRow(
|
||||
interval.toString(),
|
||||
metrics.gccontent(),
|
||||
metrics.baseQual(),
|
||||
metrics.mapQual(),
|
||||
getPositionInTarget(interval),
|
||||
cds.overlaps(interval),
|
||||
interval.size()
|
||||
);
|
||||
}
|
||||
simpleReport.print(out);
|
||||
out.close();
|
||||
}
|
||||
|
||||
private static GenomeLoc parseInterval(String s, GenomeLocParser parser) {
|
||||
if (s.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
String[] first = s.split(":");
|
||||
if (first.length == 2) {
|
||||
String[] second = first[1].split("\\-");
|
||||
return parser.createGenomeLoc(first[0], Integer.decode(second[0]), Integer.decode(second[1]));
|
||||
} else {
|
||||
throw new UserException.BadInput("Interval doesn't parse correctly: " + s);
|
||||
}
|
||||
}
|
||||
|
||||
private void parseFile(File file, GenomeLocSortedSet set, GenomeLocParser parser) {
|
||||
try {
|
||||
for (String s : new XReadLines(file) ) {
|
||||
GenomeLoc interval = parseInterval(s, parser);
|
||||
if (interval != null)
|
||||
set.add(interval, true);
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private int getPositionInTarget(GenomeLoc interval) {
|
||||
final List<GenomeLoc> hits = target.getOverlapping(interval);
|
||||
int result = 0;
|
||||
for (GenomeLoc hit : hits) {
|
||||
result = interval.getStart() - hit.getStart(); // if there are multiple hits, we'll get the last one.
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -76,7 +76,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
private List<Allele> alleleList = new ArrayList<Allele>();
|
||||
|
||||
|
||||
protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
protected IndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC,
|
||||
final Logger logger) {
|
||||
super(UAC, logger);
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
|
||||
|
|
@ -85,10 +86,11 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
|
||||
}
|
||||
|
||||
protected static List<Allele> computeConsensusAlleles(ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
AlignmentContextUtils.ReadOrientation contextType,
|
||||
GenomeLocParser locParser, UnifiedArgumentCollection UAC) {
|
||||
protected static List<Allele> computeConsensusAlleles(final ReferenceContext ref,
|
||||
final Map<String, AlignmentContext> contexts,
|
||||
final AlignmentContextUtils.ReadOrientation contextType,
|
||||
final GenomeLocParser locParser,
|
||||
final UnifiedArgumentCollection UAC) {
|
||||
ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
|
||||
return counter.computeConsensusAlleles(ref, contexts, contextType);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -147,9 +147,17 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
// if we only want variants, then we don't need to calculate genotype likelihoods
|
||||
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
|
||||
return builder.make();
|
||||
// if user requires all PLs at all sites, add all possible alt alleles
|
||||
else if (UAC.annotateAllSitesWithPLs) {
|
||||
for ( final byte base : BaseUtils.BASES ) {
|
||||
if ( base != refBase )
|
||||
alleles.add(Allele.create(base));
|
||||
}
|
||||
}
|
||||
|
||||
// otherwise, choose any alternate allele (it doesn't really matter)
|
||||
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
|
||||
else
|
||||
// otherwise, choose any alternate allele (it doesn't really matter)
|
||||
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -52,6 +52,9 @@ import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
|||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
public class UnifiedArgumentCollection extends StandardCallerArgumentCollection {
|
||||
|
||||
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false)
|
||||
|
|
@ -82,7 +85,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
* The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
|
||||
*/
|
||||
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false)
|
||||
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL;
|
||||
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
|
||||
|
||||
/**
|
||||
* The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base
|
||||
|
|
@ -95,6 +98,18 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
/**
|
||||
* Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites.
|
||||
* This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any).
|
||||
* WARNINGS:
|
||||
* - This feature will inflate VCF file size considerably.
|
||||
* - All SNP ALT alleles will be emitted with corresponding 10 PL values.
|
||||
* - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false)
|
||||
public boolean annotateAllSitesWithPLs = false;
|
||||
|
||||
// indel-related arguments
|
||||
/**
|
||||
* A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
|
||||
|
|
@ -199,6 +214,9 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
|
||||
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
|
||||
|
||||
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
|
||||
public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
|
||||
|
||||
/**
|
||||
* Create a new UAC with defaults for all UAC arguments
|
||||
*/
|
||||
|
|
@ -247,7 +265,9 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES;
|
||||
this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO;
|
||||
this.pairHMM = uac.pairHMM;
|
||||
this.OutputMode = uac.OutputMode;
|
||||
|
||||
this.annotateAllSitesWithPLs = uac.annotateAllSitesWithPLs;
|
||||
// todo- arguments to remove
|
||||
this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -83,6 +83,9 @@ public class UnifiedGenotyperEngine {
|
|||
public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3;
|
||||
public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4;
|
||||
|
||||
private static final int SNP_MODEL = 0;
|
||||
private static final int INDEL_MODEL = 1;
|
||||
|
||||
public enum OUTPUT_MODE {
|
||||
/** produces calls only at variant sites */
|
||||
EMIT_VARIANTS_ONLY,
|
||||
|
|
@ -165,6 +168,13 @@ public class UnifiedGenotyperEngine {
|
|||
filter.add(LOW_QUAL_FILTER_NAME);
|
||||
|
||||
determineGLModelsToUse();
|
||||
|
||||
// do argument checking
|
||||
if (UAC.annotateAllSitesWithPLs) {
|
||||
if (!modelsToUse.contains(GenotypeLikelihoodsCalculationModel.Model.SNP))
|
||||
throw new IllegalArgumentException("Invalid genotype likelihood model specification: Only diploid SNP model can be used in conjunction with option allSitePLs");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -436,7 +446,8 @@ public class UnifiedGenotyperEngine {
|
|||
bestGuessIsRef = false;
|
||||
}
|
||||
// if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
|
||||
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ||
|
||||
UAC.annotateAllSitesWithPLs) {
|
||||
myAlleles.add(alternateAllele);
|
||||
alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
|
||||
}
|
||||
|
|
@ -446,7 +457,7 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
|
||||
final double phredScaledConfidence =
|
||||
Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
|
||||
Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || UAC.annotateAllSitesWithPLs
|
||||
? -10 * AFresult.getLog10PosteriorOfAFEq0()
|
||||
: -10 * AFresult.getLog10PosteriorOfAFGT0());
|
||||
|
||||
|
|
@ -540,11 +551,6 @@ public class UnifiedGenotyperEngine {
|
|||
builder.attributes(attributes);
|
||||
VariantContext vcCall = builder.make();
|
||||
|
||||
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
|
||||
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
|
||||
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync
|
||||
vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall);
|
||||
|
||||
if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine
|
||||
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
|
||||
final ReadBackedPileup pileup = rawContext.getBasePileup();
|
||||
|
|
@ -553,6 +559,11 @@ public class UnifiedGenotyperEngine {
|
|||
vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
|
||||
}
|
||||
|
||||
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
|
||||
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
|
||||
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync
|
||||
vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall);
|
||||
|
||||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
|
||||
}
|
||||
|
||||
|
|
@ -693,13 +704,13 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
private void determineGLModelsToUse() {
|
||||
|
||||
String modelPrefix = "";
|
||||
if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY )
|
||||
modelPrefix = GPSTRING;
|
||||
|
||||
if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) {
|
||||
modelPrefix += UAC.GLmodel.name().toUpperCase().replaceAll("BOTH","");
|
||||
// GGA mode => must initialize both the SNP and indel models
|
||||
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ||
|
||||
UAC.GLmodel.name().toUpperCase().contains("BOTH") ) {
|
||||
modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP"));
|
||||
modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL"));
|
||||
}
|
||||
|
|
@ -712,31 +723,24 @@ public class UnifiedGenotyperEngine {
|
|||
private List<GenotypeLikelihoodsCalculationModel.Model> getGLModelsToUse(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext refContext,
|
||||
final AlignmentContext rawContext) {
|
||||
|
||||
if ( UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES )
|
||||
return modelsToUse;
|
||||
|
||||
if ( modelsToUse.size() != 2 )
|
||||
throw new IllegalStateException("GGA mode assumes that we have initialized both the SNP and indel models but found " + modelsToUse);
|
||||
|
||||
// if we're genotyping given alleles then we need to choose the model corresponding to the variant type requested
|
||||
final List<GenotypeLikelihoodsCalculationModel.Model> GGAmodel = new ArrayList<GenotypeLikelihoodsCalculationModel.Model>(1);
|
||||
final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
|
||||
if ( vcInput == null )
|
||||
return GGAmodel; // no work to be done
|
||||
|
||||
if ( vcInput.isSNP() ) {
|
||||
// use the SNP model unless the user chose INDEL mode only
|
||||
if ( modelsToUse.size() == 2 || modelsToUse.get(0).name().endsWith("SNP") )
|
||||
GGAmodel.add(modelsToUse.get(0));
|
||||
if ( vcInput == null ) {
|
||||
return Collections.emptyList(); // no work to be done
|
||||
} else if ( vcInput.isSNP() ) {
|
||||
return Collections.singletonList(modelsToUse.get(SNP_MODEL));
|
||||
} else if ( vcInput.isIndel() || vcInput.isMixed() ) {
|
||||
return Collections.singletonList(modelsToUse.get(INDEL_MODEL));
|
||||
} else {
|
||||
return Collections.emptyList(); // No support for other types yet
|
||||
}
|
||||
else if ( vcInput.isIndel() || vcInput.isMixed() ) {
|
||||
// use the INDEL model unless the user chose SNP mode only
|
||||
if ( modelsToUse.size() == 2 )
|
||||
GGAmodel.add(modelsToUse.get(1));
|
||||
else if ( modelsToUse.get(0).name().endsWith("INDEL") )
|
||||
GGAmodel.add(modelsToUse.get(0));
|
||||
}
|
||||
// No support for other types yet
|
||||
|
||||
return GGAmodel;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -106,7 +106,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
|||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles()));
|
||||
builder.alleles(alleles);
|
||||
builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
|
||||
builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL));
|
||||
return builder.make();
|
||||
} else {
|
||||
return vc;
|
||||
|
|
@ -352,6 +352,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
|||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy) {
|
||||
return GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
|
||||
return allelesToUse.size() == 1
|
||||
? GATKVariantContextUtils.subsetToRefOnly(vc, ploidy)
|
||||
: GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse,
|
||||
assignGenotypes ? GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN : GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,89 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
|
||||
|
||||
/**
|
||||
* Result of assembling, with the resulting graph and status
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 7/1/13
|
||||
* Time: 5:35 PM
|
||||
*/
|
||||
public class AssemblyResult {
|
||||
private final Status status;
|
||||
private final SeqGraph graph;
|
||||
|
||||
/**
|
||||
* Create a new assembly result
|
||||
* @param status the status, cannot be null
|
||||
* @param graph the resulting graph of the assembly, can only be null if result is failed
|
||||
*/
|
||||
public AssemblyResult(final Status status, final SeqGraph graph) {
|
||||
if ( status == null ) throw new IllegalArgumentException("status cannot be null");
|
||||
if ( status != Status.FAILED && graph == null ) throw new IllegalArgumentException("graph is null but status is " + status);
|
||||
|
||||
this.status = status;
|
||||
this.graph = graph;
|
||||
}
|
||||
|
||||
public Status getStatus() { return status; }
|
||||
public SeqGraph getGraph() { return graph; }
|
||||
|
||||
/**
|
||||
* Status of the assembly result
|
||||
*/
|
||||
public enum Status {
|
||||
/** Something went wrong, and we couldn't produce a meaningful graph */
|
||||
FAILED,
|
||||
/** Assembly succeeded, but graph degenerated into just the reference sequence */
|
||||
JUST_ASSEMBLED_REFERENCE,
|
||||
/** Assembly succeeded, and the graph has some meaningful structure */
|
||||
ASSEMBLED_SOME_VARIATION
|
||||
}
|
||||
}
|
||||
|
|
@ -49,7 +49,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
|
|
@ -77,6 +76,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
private final static int NUM_PATHS_PER_GRAPH = 25;
|
||||
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
|
||||
private static final int GRAPH_KMER_STEP = 6;
|
||||
private static final int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
|
||||
|
||||
private final int minKmer;
|
||||
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
|
||||
|
|
@ -92,8 +92,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<SeqGraph> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype) {
|
||||
final List<SeqGraph> graphs = new LinkedList<SeqGraph>();
|
||||
protected List<AssemblyResult> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
|
||||
final List<AssemblyResult> results = new LinkedList<>();
|
||||
|
||||
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
|
||||
if( maxKmer < minKmer) {
|
||||
|
|
@ -106,7 +106,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
continue;
|
||||
|
||||
if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
|
||||
DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype);
|
||||
DeBruijnGraph graph = createGraphFromSequences(reads, kmer, refHaplotype, activeAlleleHaplotypes);
|
||||
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
|
||||
// do a series of steps to clean up the raw assembly graph to make it analysis-ready
|
||||
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor);
|
||||
|
|
@ -117,23 +117,18 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
" future subsystem will actually go and error correct the reads");
|
||||
}
|
||||
|
||||
final SeqGraph seqGraph = cleanupSeqGraph(graph.convertToSequenceGraph());
|
||||
results.add(cleanupSeqGraph(graph.convertToSequenceGraph()));
|
||||
|
||||
if ( seqGraph != null ) { // if the graph contains interesting variation from the reference
|
||||
graphs.add(seqGraph);
|
||||
|
||||
if ( debugGraphTransformations ) // we only want to use one graph size
|
||||
break;
|
||||
}
|
||||
if ( debugGraphTransformations ) // we only want to use one graph size
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return graphs;
|
||||
return results;
|
||||
}
|
||||
|
||||
@Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"})
|
||||
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int kmerLength, final Haplotype refHaplotype ) {
|
||||
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int kmerLength, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes ) {
|
||||
final DeBruijnGraph graph = new DeBruijnGraph(kmerLength);
|
||||
final DeBruijnGraphBuilder builder = new DeBruijnGraphBuilder(graph);
|
||||
|
||||
|
|
@ -142,6 +137,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
// something went wrong, so abort right now with a null graph
|
||||
return null;
|
||||
|
||||
// add the artificial GGA haplotypes to the graph
|
||||
if ( ! addGGAKmersToGraph(builder, activeAlleleHaplotypes) )
|
||||
// something went wrong, so abort right now with a null graph
|
||||
return null;
|
||||
|
||||
// now go through the graph already seeded with the reference sequence and add the read kmers to it
|
||||
if ( ! addReadKmersToGraph(builder, reads) )
|
||||
// some problem was detected adding the reads to the graph, return null to indicate we failed
|
||||
|
|
@ -151,6 +151,28 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
return graph;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the high-quality kmers from the artificial GGA haplotypes to the graph
|
||||
*
|
||||
* @param builder a debruijn graph builder to add the read kmers to
|
||||
* @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode
|
||||
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
|
||||
*/
|
||||
protected boolean addGGAKmersToGraph(final DeBruijnGraphBuilder builder, final List<Haplotype> activeAlleleHaplotypes) {
|
||||
|
||||
final int kmerLength = builder.getKmerSize();
|
||||
|
||||
for( final Haplotype haplotype : activeAlleleHaplotypes ) {
|
||||
final int end = haplotype.length() - kmerLength;
|
||||
for( int start = 0; start < end; start++ ) {
|
||||
builder.addKmerPairFromSeqToGraph( haplotype.getBases(), start, GGA_MODE_ARTIFICIAL_COUNTS );
|
||||
}
|
||||
}
|
||||
|
||||
// always returns true now, but it's possible that we'd add kmers and decide we don't like the graph in some way
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the high-quality kmers from the reads to the graph
|
||||
*
|
||||
|
|
|
|||
|
|
@ -49,6 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
|
|
@ -71,7 +72,7 @@ public class GenotypingEngine {
|
|||
|
||||
private final boolean DEBUG;
|
||||
private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS;
|
||||
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
private final static List<Allele> noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied
|
||||
private final VariantAnnotatorEngine annotationEngine;
|
||||
private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
|
||||
|
||||
|
|
@ -146,6 +147,7 @@ public class GenotypingEngine {
|
|||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final RefMetaDataTracker tracker,
|
||||
final List<VariantContext> activeAllelesToGenotype ) {
|
||||
// sanity check input arguments
|
||||
if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine);
|
||||
|
|
@ -162,8 +164,8 @@ public class GenotypingEngine {
|
|||
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype);
|
||||
|
||||
// Walk along each position in the key set and create each event to be outputted
|
||||
final Set<Haplotype> calledHaplotypes = new HashSet<Haplotype>();
|
||||
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
|
||||
final Set<Haplotype> calledHaplotypes = new HashSet<>();
|
||||
final List<VariantContext> returnCalls = new ArrayList<>();
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
|
||||
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
|
||||
|
|
@ -183,7 +185,7 @@ public class GenotypingEngine {
|
|||
if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) {
|
||||
throw new ReviewedStingException("Record size mismatch! Something went wrong in the merging of alleles.");
|
||||
}
|
||||
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<VariantContext, Allele>();
|
||||
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
|
||||
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
|
||||
for(int iii = 0; iii < mergedVC.getAlternateAlleles().size(); iii++) {
|
||||
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
|
||||
|
|
@ -204,13 +206,12 @@ public class GenotypingEngine {
|
|||
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) );
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );
|
||||
|
||||
VariantContext annotatedCall = call;
|
||||
if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call);
|
||||
|
||||
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
|
||||
}
|
||||
|
||||
annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, annotatedCall);
|
||||
|
||||
// maintain the set of all called haplotypes
|
||||
for ( final Allele calledAllele : call.getAlleles() )
|
||||
calledHaplotypes.addAll(alleleMapper.get(calledAllele));
|
||||
|
|
@ -244,7 +245,7 @@ public class GenotypingEngine {
|
|||
|
||||
if ( in_GGA_mode ) startPosKeySet.clear();
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes );
|
||||
//cleanUpSymbolicUnassembledEvents( haplotypes ); // We don't make symbolic alleles so this isn't needed currently
|
||||
if ( !in_GGA_mode ) {
|
||||
// run the event merger if we're not in GGA mode
|
||||
final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc);
|
||||
|
|
@ -267,7 +268,7 @@ public class GenotypingEngine {
|
|||
* @return the list of the sources of vcs in the same order
|
||||
*/
|
||||
private List<String> makePriorityList(final List<VariantContext> vcs) {
|
||||
final List<String> priorityList = new LinkedList<String>();
|
||||
final List<String> priorityList = new LinkedList<>();
|
||||
for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource());
|
||||
return priorityList;
|
||||
}
|
||||
|
|
@ -276,7 +277,7 @@ public class GenotypingEngine {
|
|||
final int loc,
|
||||
final List<VariantContext> activeAllelesToGenotype) {
|
||||
// the overlapping events to merge into a common reference view
|
||||
final List<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
|
||||
final List<VariantContext> eventsAtThisLoc = new ArrayList<>();
|
||||
|
||||
if( activeAllelesToGenotype.isEmpty() ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
|
|
@ -292,7 +293,7 @@ public class GenotypingEngine {
|
|||
if( compVC.getStart() == loc ) {
|
||||
int alleleCount = 0;
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
List<Allele> alleleSet = new ArrayList<Allele>(2);
|
||||
List<Allele> alleleSet = new ArrayList<>(2);
|
||||
alleleSet.add(compVC.getReference());
|
||||
alleleSet.add(compAltAllele);
|
||||
final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount;
|
||||
|
|
@ -348,7 +349,7 @@ public class GenotypingEngine {
|
|||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final VariantContext call ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new LinkedHashMap<>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call);
|
||||
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perSampleReadMap.entrySet() ) {
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
|
|
@ -384,7 +385,7 @@ public class GenotypingEngine {
|
|||
// TODO - split into input haplotypes and output haplotypes as not to share I/O arguments
|
||||
@Requires("haplotypes != null")
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
|
||||
final List<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
final List<Haplotype> haplotypesToRemove = new ArrayList<>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
for( final VariantContext vc : h.getEventMap().getVariantContexts() ) {
|
||||
if( vc.isSymbolic() ) {
|
||||
|
|
@ -407,7 +408,7 @@ public class GenotypingEngine {
|
|||
final Map<Allele, List<Haplotype>> alleleMapper,
|
||||
final double downsamplingFraction ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<>();
|
||||
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
for( final Map.Entry<Allele, List<Haplotype>> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele
|
||||
|
|
@ -430,7 +431,7 @@ public class GenotypingEngine {
|
|||
}
|
||||
|
||||
protected static Map<Allele, List<Haplotype>> createAlleleMapper( final Map<VariantContext, Allele> mergeMap, final Map<Event, List<Haplotype>> eventMap ) {
|
||||
final Map<Allele, List<Haplotype>> alleleMapper = new LinkedHashMap<Allele, List<Haplotype>>();
|
||||
final Map<Allele, List<Haplotype>> alleleMapper = new LinkedHashMap<>();
|
||||
for( final Map.Entry<VariantContext, Allele> entry : mergeMap.entrySet() ) {
|
||||
alleleMapper.put(entry.getValue(), eventMap.get(new Event(entry.getKey())));
|
||||
}
|
||||
|
|
@ -441,100 +442,33 @@ public class GenotypingEngine {
|
|||
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
|
||||
protected static Map<Event, List<Haplotype>> createEventMapper( final int loc, final List<VariantContext> eventsAtThisLoc, final List<Haplotype> haplotypes ) {
|
||||
|
||||
final Map<Event, List<Haplotype>> eventMapper = new LinkedHashMap<Event, List<Haplotype>>(eventsAtThisLoc.size()+1);
|
||||
VariantContext refVC = eventsAtThisLoc.get(0); // the genome loc is the only safe thing to pull out of this VC because ref/alt pairs might change reference basis
|
||||
eventMapper.put(new Event(null), new ArrayList<Haplotype>());
|
||||
final Map<Event, List<Haplotype>> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1);
|
||||
final Event refEvent = new Event(null);
|
||||
eventMapper.put(refEvent, new ArrayList<Haplotype>());
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
eventMapper.put(new Event(vc), new ArrayList<Haplotype>());
|
||||
}
|
||||
|
||||
final List<Haplotype> undeterminedHaplotypes = new ArrayList<Haplotype>(haplotypes.size());
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() ) {
|
||||
final List<Allele> alleles = new ArrayList<Allele>(2);
|
||||
alleles.add(h.getArtificialRefAllele());
|
||||
alleles.add(h.getArtificialAltAllele());
|
||||
final Event artificialVC = new Event( (new VariantContextBuilder()).source("artificialHaplotype")
|
||||
.alleles(alleles)
|
||||
.loc(refVC.getChr(), refVC.getStart(), refVC.getStart() + h.getArtificialRefAllele().length() - 1).make() );
|
||||
if( eventMapper.containsKey(artificialVC) ) {
|
||||
eventMapper.get(artificialVC).add(h);
|
||||
}
|
||||
} else if( h.getEventMap().get(loc) == null ) { // no event at this location so let's investigate later
|
||||
undeterminedHaplotypes.add(h);
|
||||
if( h.getEventMap().get(loc) == null ) {
|
||||
eventMapper.get(refEvent).add(h);
|
||||
} else {
|
||||
boolean haplotypeIsDetermined = false;
|
||||
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
|
||||
if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
|
||||
eventMapper.get(new Event(vcAtThisLoc)).add(h);
|
||||
haplotypeIsDetermined = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( !haplotypeIsDetermined )
|
||||
undeterminedHaplotypes.add(h);
|
||||
}
|
||||
}
|
||||
|
||||
for( final Haplotype h : undeterminedHaplotypes ) {
|
||||
Event matchingEvent = new Event(null);
|
||||
for( final Map.Entry<Event, List<Haplotype>> eventToTest : eventMapper.entrySet() ) {
|
||||
// don't test against the reference allele
|
||||
if( eventToTest.getKey().equals(new Event(null)) )
|
||||
continue;
|
||||
|
||||
// only try to disambiguate for alleles that have had haplotypes previously assigned above
|
||||
if( eventToTest.getValue().isEmpty() )
|
||||
continue;
|
||||
|
||||
final Haplotype artificialHaplotype = eventToTest.getValue().get(0);
|
||||
if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) {
|
||||
matchingEvent = eventToTest.getKey();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
eventMapper.get(matchingEvent).add(h);
|
||||
}
|
||||
|
||||
return eventMapper;
|
||||
}
|
||||
|
||||
protected static boolean isSubSetOf(final Map<Integer, VariantContext> subset, final Map<Integer, VariantContext> superset, final boolean resolveSupersetToSubset) {
|
||||
|
||||
for ( final Map.Entry<Integer, VariantContext> fromSubset : subset.entrySet() ) {
|
||||
final VariantContext fromSuperset = superset.get(fromSubset.getKey());
|
||||
if ( fromSuperset == null )
|
||||
return false;
|
||||
|
||||
List<Allele> supersetAlleles = fromSuperset.getAlternateAlleles();
|
||||
if ( resolveSupersetToSubset )
|
||||
supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles);
|
||||
|
||||
if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static List<Allele> resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List<Allele> currentAlleles) {
|
||||
if ( targetReference.length() <= actualReference.length() )
|
||||
return currentAlleles;
|
||||
|
||||
final List<Allele> newAlleles = new ArrayList<Allele>(currentAlleles.size());
|
||||
final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length());
|
||||
for ( final Allele a : currentAlleles ) {
|
||||
newAlleles.add(Allele.extend(a, extraBases));
|
||||
}
|
||||
return newAlleles;
|
||||
}
|
||||
|
||||
@Ensures({"result.size() == haplotypeAllelesForSample.size()"})
|
||||
protected static List<Allele> findEventAllelesInSample( final List<Allele> eventAlleles, final List<Allele> haplotypeAlleles, final List<Allele> haplotypeAllelesForSample, final List<List<Haplotype>> alleleMapper, final List<Haplotype> haplotypes ) {
|
||||
if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; }
|
||||
final List<Allele> eventAllelesForSample = new ArrayList<Allele>();
|
||||
final List<Allele> eventAllelesForSample = new ArrayList<>();
|
||||
for( final Allele a : haplotypeAllelesForSample ) {
|
||||
final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a));
|
||||
for( int iii = 0; iii < alleleMapper.size(); iii++ ) {
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
|
|
@ -56,7 +57,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -76,16 +77,13 @@ import org.broadinstitute.sting.utils.activeregion.ActivityProfileState;
|
|||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.gvcf.GVCFWriter;
|
||||
import org.broadinstitute.sting.utils.haplotype.*;
|
||||
import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
|
|
@ -219,7 +217,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||
protected List<String> annotationsToUse = new ArrayList<String>(Arrays.asList(new String[]{"ClippingRankSumTest"}));
|
||||
protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"}));
|
||||
|
||||
/**
|
||||
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
|
||||
|
|
@ -262,6 +260,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
@Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false)
|
||||
protected List<Integer> kmerSizes = Arrays.asList(10, 25);
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false)
|
||||
protected boolean dontIncreaseKmerSizesForCycles = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="numPruningSamples", shortName="numPruningSamples", doc="The number of samples that must pass the minPuning factor in order for the path to be kept", required = false)
|
||||
protected int numPruningSamples = 1;
|
||||
|
||||
/**
|
||||
* Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype
|
||||
* considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the
|
||||
|
|
@ -287,6 +293,61 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// general advanced arguments to control haplotype caller behavior
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="emitRefConfidence", shortName="ERC", doc="Emit experimental reference confidence scores", required = false)
|
||||
protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE;
|
||||
|
||||
public enum ReferenceConfidenceMode {
|
||||
NONE,
|
||||
BP_RESOLUTION,
|
||||
GVCF
|
||||
}
|
||||
|
||||
/**
|
||||
* The GQ partition intervals
|
||||
*
|
||||
* Should be a non-empty list of boundaries. For example, suppose this variable is
|
||||
*
|
||||
* [A, B, C]
|
||||
*
|
||||
* We would partition our hom-ref sites into the following bands:
|
||||
*
|
||||
* X < A
|
||||
* A <= X < B
|
||||
* B <= X < C
|
||||
* X >= C
|
||||
*
|
||||
* The default bands give the following GQ blocks:
|
||||
*
|
||||
* [0, 0]
|
||||
* (0, 10]
|
||||
* (10, 20]
|
||||
* (20, 30]
|
||||
* (30, 40]
|
||||
* (40, 50]
|
||||
* (50, 99]
|
||||
*
|
||||
* Note that in the GATK GQ values are capped at 99.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false)
|
||||
protected List<Integer> GVCFGQBands = Arrays.asList(1, 10, 20, 30, 40, 50);
|
||||
|
||||
/**
|
||||
* This parameter determines the maximum size of an indel considered as potentially segregating in the
|
||||
* reference model. It is used to eliminate reads from being indel informative at a site, and determines
|
||||
* by that mechanism the certainty in the reference base. Conceptually, setting this parameter to
|
||||
* X means that each informative read is consistent with any indel of size < X being present at a specific
|
||||
* position in the genome, given its alignment to the reference.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="indelSizeToEliminateInRefModel", shortName="ERCIS", doc="The size of an indel to check for in the reference model", required = false)
|
||||
protected int indelSizeToEliminateInRefModel = 10;
|
||||
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
// general advanced arguments to control haplotype caller behavior
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
|
||||
protected int MIN_PRUNE_FACTOR = 2;
|
||||
|
|
@ -328,7 +389,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false)
|
||||
protected int phredScaledGlobalReadMismappingRate = 60;
|
||||
protected int phredScaledGlobalReadMismappingRate = 45;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false)
|
||||
|
|
@ -384,10 +445,28 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
@Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
|
||||
protected boolean dontUseSoftClippedBases = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false)
|
||||
protected boolean captureAssemblyFailureBAM = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false)
|
||||
protected boolean allowCyclesInKmerGraphToGeneratePaths = false;
|
||||
|
||||
// Parameters to control read error correction
|
||||
@Hidden
|
||||
@Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
|
||||
protected boolean errorCorrectReads = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="kmerLengthForReadErrorCorrection", shortName="kmerLengthForReadErrorCorrection", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
|
||||
protected int kmerLengthForReadErrorCorrection = 25;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false)
|
||||
protected int minObservationsForKmerToBeSolid = 20;
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
// done with Haplotype caller parameters
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
|
|
@ -418,7 +497,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
private final static int PADDING_AROUND_OTHERS_FOR_CALLING = 150;
|
||||
|
||||
// the maximum extent into the full active region extension that we're willing to go in genotyping our events
|
||||
private final static int MAX_GENOTYPING_ACTIVE_REGION_EXTENSION = 25;
|
||||
private final static int MAX_DISCOVERY_ACTIVE_REGION_EXTENSION = 25;
|
||||
private final static int MAX_GGA_ACTIVE_REGION_EXTENSION = 100;
|
||||
|
||||
private ActiveRegionTrimmer trimmer = null;
|
||||
|
||||
|
|
@ -428,17 +508,17 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// bases with quality less than or equal to this value are trimmed off the tails of the reads
|
||||
private static final byte MIN_TAIL_QUALITY = 20;
|
||||
|
||||
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
|
||||
// the minimum length of a read we'd consider using for genotyping
|
||||
private final static int MIN_READ_LENGTH = 10;
|
||||
|
||||
private List<String> samplesList = new ArrayList<String>();
|
||||
private final static double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private final static double LOG_ONE_THIRD = -Math.log10(3.0);
|
||||
private final List<VariantContext> allelesToGenotype = new ArrayList<VariantContext>();
|
||||
private List<String> samplesList = new ArrayList<>();
|
||||
|
||||
private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
|
||||
private final static Allele FAKE_ALT_ALLELE = Allele.create("<FAKE_ALT>", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
|
||||
|
||||
ReferenceConfidenceModel referenceConfidenceModel = null;
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
|
|
@ -457,6 +537,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
final int nSamples = samples.size();
|
||||
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
|
||||
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
|
||||
// HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine // TODO -- why is this?
|
||||
UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
|
||||
? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
|
||||
|
|
@ -501,6 +584,19 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// where the filters are used. For example, in emitting all sites the lowQual field is used
|
||||
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
|
||||
|
||||
referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel);
|
||||
if ( emitReferenceConfidence() ) {
|
||||
if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently");
|
||||
headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines());
|
||||
if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) {
|
||||
try {
|
||||
vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands);
|
||||
} catch ( IllegalArgumentException e ) {
|
||||
throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vcfWriter.writeHeader(new VCFHeader(headerInfo, samples));
|
||||
|
||||
try {
|
||||
|
|
@ -514,7 +610,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH);
|
||||
assemblyEngine = useDebruijnAssembler
|
||||
? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler)
|
||||
: new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes);
|
||||
: new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples);
|
||||
|
||||
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
|
||||
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
|
||||
|
|
@ -545,11 +641,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, variantMerger );
|
||||
|
||||
if ( bamWriter != null )
|
||||
if ( bamWriter != null ) {
|
||||
// we currently do not support multi-threaded BAM writing, so exception out
|
||||
if ( getToolkit().getTotalNumberOfThreads() > 1 )
|
||||
throw new UserException.BadArgumentValue("bamout", "Currently cannot emit a BAM file from the HaplotypeCaller in multi-threaded mode.");
|
||||
haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader());
|
||||
}
|
||||
|
||||
trimmer = new ActiveRegionTrimmer(DEBUG, PADDING_AROUND_SNPS_FOR_CALLING, PADDING_AROUND_OTHERS_FOR_CALLING,
|
||||
MAX_GENOTYPING_ACTIVE_REGION_EXTENSION, getToolkit().getGenomeLocParser());
|
||||
UAC.GenotypingMode.equals(GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ? MAX_GGA_ACTIVE_REGION_EXTENSION : MAX_DISCOVERY_ACTIVE_REGION_EXTENSION,
|
||||
getToolkit().getGenomeLocParser());
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -566,7 +667,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
@Override
|
||||
public EnumSet<ActiveRegionReadState> desiredReadStates() {
|
||||
if ( includeUnmappedReads ) {
|
||||
throw new UserException.BadArgumentValue("includeUmappedReads", "is not yet functional");
|
||||
throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional");
|
||||
// return EnumSet.of(
|
||||
// ActiveRegionReadState.PRIMARY,
|
||||
// ActiveRegionReadState.NONPRIMARY,
|
||||
|
|
@ -588,7 +689,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
final VariantContext vcFromAllelesRod = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), false, logger, UG_engine.getUAC().alleles);
|
||||
if( vcFromAllelesRod != null ) {
|
||||
allelesToGenotype.add(vcFromAllelesRod); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object
|
||||
return new ActivityProfileState(ref.getLocus(), 1.0);
|
||||
}
|
||||
}
|
||||
|
|
@ -601,38 +701,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// if we don't have any data, just abort early
|
||||
return new ActivityProfileState(ref.getLocus(), 0.0);
|
||||
|
||||
final List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
||||
final List<Allele> noCall = Collections.singletonList(Allele.NO_CALL); // used to noCall all genotypes until the exact model is applied
|
||||
final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
|
||||
final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size());
|
||||
final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage();
|
||||
for( final Map.Entry<String, AlignmentContext> sample : splitContexts.entrySet() ) {
|
||||
final double[] genotypeLikelihoods = new double[3]; // ref versus non-ref (any event)
|
||||
Arrays.fill(genotypeLikelihoods, 0.0);
|
||||
|
||||
for( final PileupElement p : sample.getValue().getBasePileup() ) {
|
||||
final byte qual = p.getQual();
|
||||
if( p.isDeletion() || qual > (byte) 18) {
|
||||
int AA = 0; final int AB = 1; int BB = 2;
|
||||
if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
|
||||
AA = 2;
|
||||
BB = 0;
|
||||
if( p.isNextToSoftClip() ) {
|
||||
averageHQSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28));
|
||||
}
|
||||
}
|
||||
genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual);
|
||||
genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
|
||||
genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
|
||||
}
|
||||
}
|
||||
final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), (byte) 18, averageHQSoftClips).genotypeLikelihoods;
|
||||
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
|
||||
}
|
||||
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add( FAKE_REF_ALLELE );
|
||||
alleles.add( FAKE_ALT_ALLELE );
|
||||
final List<Allele> alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE);
|
||||
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
|
||||
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
|
||||
|
||||
|
|
@ -652,63 +730,84 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
|
||||
return NO_CALLS;
|
||||
|
||||
if( !originalActiveRegion.isActive() ) { return NO_CALLS; } // Not active so nothing to do!
|
||||
if( !originalActiveRegion.isActive() ) {
|
||||
// Not active so nothing to do!
|
||||
return referenceModelForNoVariation(originalActiveRegion, true);
|
||||
}
|
||||
|
||||
final List<VariantContext> activeAllelesToGenotype = new ArrayList<>();
|
||||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
for( final VariantContext vc : allelesToGenotype ) {
|
||||
if( originalActiveRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) {
|
||||
for ( final VariantContext vc : metaDataTracker.getValues(UG_engine.getUAC().alleles) ) {
|
||||
if ( vc.isNotFiltered() ) {
|
||||
activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode
|
||||
}
|
||||
}
|
||||
allelesToGenotype.removeAll( activeAllelesToGenotype );
|
||||
// No alleles found in this region so nothing to do!
|
||||
if ( activeAllelesToGenotype.isEmpty() ) { return NO_CALLS; }
|
||||
if ( activeAllelesToGenotype.isEmpty() ) { return referenceModelForNoVariation(originalActiveRegion, true); }
|
||||
} else {
|
||||
if( originalActiveRegion.size() == 0 ) { return NO_CALLS; } // No reads here so nothing to do!
|
||||
// No reads here so nothing to do!
|
||||
if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); }
|
||||
}
|
||||
|
||||
// run the local assembler, getting back a collection of information on how we should proceed
|
||||
final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
|
||||
|
||||
// abort early if something is out of the acceptable range
|
||||
if( ! assemblyResult.isVariationPresent() ) { return NO_CALLS; } // only the reference haplotype remains so nothing else to do!
|
||||
if( ! assemblyResult.isVariationPresent() ) {
|
||||
return referenceModelForNoVariation(originalActiveRegion, false);
|
||||
} // only the reference haplotype remains so nothing else to do!
|
||||
|
||||
if (dontGenotype) return NO_CALLS; // user requested we not proceed
|
||||
|
||||
// filter out reads from genotyping which fail mapping quality based criteria
|
||||
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping );
|
||||
final Collection<GATKSAMRecord> filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping );
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
|
||||
if( assemblyResult.regionForGenotyping.size() == 0 ) { return NO_CALLS; } // no reads remain after filtering so nothing else to do!
|
||||
if( assemblyResult.regionForGenotyping.size() == 0 ) {
|
||||
// no reads remain after filtering so nothing else to do!
|
||||
return referenceModelForNoVariation(originalActiveRegion, false);
|
||||
}
|
||||
|
||||
// evaluate each sample's reads against all haplotypes
|
||||
//logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) );
|
||||
|
||||
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
|
||||
final List<Haplotype> bestHaplotypes = selectBestHaplotypesForGenotyping(assemblyResult.haplotypes, stratifiedReadMap);
|
||||
// Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there
|
||||
// was a bad interaction between that selection and the marginalization that happens over each event when computing
|
||||
// GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the
|
||||
// haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included
|
||||
// in the genotyping, but we lose information if we select down to a few haplotypes. [EB]
|
||||
|
||||
final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine,
|
||||
bestHaplotypes,
|
||||
assemblyResult.haplotypes,
|
||||
stratifiedReadMap,
|
||||
perSampleFilteredReadList,
|
||||
assemblyResult.fullReferenceWithPadding,
|
||||
assemblyResult.paddedReferenceLoc,
|
||||
assemblyResult.regionForGenotyping.getLocation(),
|
||||
getToolkit().getGenomeLocParser(),
|
||||
metaDataTracker,
|
||||
activeAllelesToGenotype );
|
||||
|
||||
// TODO -- must disable if we are doing NCT, or set the output type of ! presorted
|
||||
if ( bamWriter != null ) {
|
||||
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc,
|
||||
bestHaplotypes,
|
||||
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(
|
||||
assemblyResult.haplotypes,
|
||||
assemblyResult.paddedReferenceLoc,
|
||||
assemblyResult.haplotypes,
|
||||
calledHaplotypes.getCalledHaplotypes(),
|
||||
stratifiedReadMap);
|
||||
}
|
||||
|
||||
if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
|
||||
|
||||
return calledHaplotypes.getCalls();
|
||||
if ( emitReferenceConfidence() ) {
|
||||
return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getRefHaplotype(),
|
||||
calledHaplotypes.getCalledHaplotypes(), assemblyResult.paddedReferenceLoc, assemblyResult.regionForGenotyping,
|
||||
stratifiedReadMap, calledHaplotypes.getCalls());
|
||||
} else {
|
||||
return calledHaplotypes.getCalls();
|
||||
}
|
||||
}
|
||||
|
||||
private final static class AssemblyResult {
|
||||
|
|
@ -717,6 +816,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
final byte[] fullReferenceWithPadding;
|
||||
final GenomeLoc paddedReferenceLoc;
|
||||
final boolean variationPresent;
|
||||
final Haplotype refHaplotype;
|
||||
|
||||
private AssemblyResult(List<Haplotype> haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc, boolean variationPresent) {
|
||||
this.haplotypes = haplotypes;
|
||||
|
|
@ -724,6 +824,21 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
this.fullReferenceWithPadding = fullReferenceWithPadding;
|
||||
this.paddedReferenceLoc = paddedReferenceLoc;
|
||||
this.variationPresent = variationPresent;
|
||||
|
||||
Haplotype firstRefHaplotype = null;
|
||||
for ( final Haplotype h : haplotypes ) {
|
||||
if ( h.isReference() ) {
|
||||
if ( firstRefHaplotype != null ) throw new IllegalArgumentException("Found two haplotypes marked as reference " + firstRefHaplotype + " and " + h);
|
||||
firstRefHaplotype = h;
|
||||
}
|
||||
}
|
||||
|
||||
if ( firstRefHaplotype == null ) throw new IllegalArgumentException("Couldn't find a reference haplotype in " + haplotypes);
|
||||
this.refHaplotype = firstRefHaplotype;
|
||||
}
|
||||
|
||||
public Haplotype getRefHaplotype() {
|
||||
return refHaplotype;
|
||||
}
|
||||
|
||||
public boolean isVariationPresent() {
|
||||
|
|
@ -744,25 +859,99 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// Create the reference haplotype which is the bases from the reference that make up the active region
|
||||
finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails
|
||||
|
||||
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true);
|
||||
final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING);
|
||||
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
|
||||
final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc);
|
||||
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype );
|
||||
// Create ReadErrorCorrector object if requested - will be used within assembly engine.
|
||||
ReadErrorCorrector readErrorCorrector = null;
|
||||
if (errorCorrectReads)
|
||||
readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG,fullReferenceWithPadding);
|
||||
|
||||
if ( ! dontTrimActiveRegions ) {
|
||||
return trimActiveRegion(activeRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc);
|
||||
} else {
|
||||
// we don't want to trim active regions, so go ahead and use the old one
|
||||
return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
|
||||
try {
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector );
|
||||
if ( ! emitReferenceConfidence() && ! dontTrimActiveRegions ) {
|
||||
return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc);
|
||||
} else {
|
||||
// we don't want to trim active regions, so go ahead and use the old one
|
||||
return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
|
||||
}
|
||||
} catch ( Exception e ) {
|
||||
// Capture any exception that might be thrown, and write out the assembly failure BAM if requested
|
||||
if ( captureAssemblyFailureBAM ) {
|
||||
final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5);
|
||||
for ( final GATKSAMRecord read : activeRegion.getReads() ) {
|
||||
writer.addAlignment(read);
|
||||
}
|
||||
writer.close();
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function to create the reference haplotype out of the active region and a padded loc
|
||||
* @param activeRegion the active region from which to generate the reference haplotype
|
||||
* @param paddedReferenceLoc the GenomeLoc which includes padding and shows how big the reference haplotype should be
|
||||
* @return a non-null haplotype
|
||||
*/
|
||||
private Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final GenomeLoc paddedReferenceLoc) {
|
||||
return ReferenceConfidenceModel.createReferenceHaplotype(activeRegion, activeRegion.getActiveRegionReference(referenceReader), paddedReferenceLoc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a ref model result (ref model or no calls depending on mode) for an active region without any variation
|
||||
* (not active, or assembled to just ref)
|
||||
*
|
||||
* @param region the region to return a no-variation result
|
||||
* @param needsToBeFinalized should the region be finalized before computing the ref model (should be false if already done)
|
||||
* @return a list of variant contexts (can be empty) to emit for this ref region
|
||||
*/
|
||||
private List<VariantContext> referenceModelForNoVariation(final ActiveRegion region, final boolean needsToBeFinalized) {
|
||||
if ( emitReferenceConfidence() ) {
|
||||
if ( needsToBeFinalized ) finalizeActiveRegion(region);
|
||||
filterNonPassingReads(region); // TODO -- remove when filtering is done in finalizeActiveRegion
|
||||
final GenomeLoc paddedLoc = region.getExtendedLoc();
|
||||
final Haplotype refHaplotype = createReferenceHaplotype(region, paddedLoc);
|
||||
final List<Haplotype> haplotypes = Collections.singletonList(refHaplotype);
|
||||
return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes,
|
||||
paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region),
|
||||
Collections.<VariantContext>emptyList());
|
||||
} else {
|
||||
return NO_CALLS;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a context that maps each read to the reference haplotype with log10 L of 0
|
||||
* @param refHaplotype a non-null reference haplotype
|
||||
* @param samples a list of all samples
|
||||
* @param region the active region containing reads
|
||||
* @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref
|
||||
*/
|
||||
public static Map<String, PerReadAlleleLikelihoodMap> createDummyStratifiedReadMap(final Haplotype refHaplotype,
|
||||
final List<String> samples,
|
||||
final ActiveRegion region) {
|
||||
final Allele refAllele = Allele.create(refHaplotype, true);
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> map = new LinkedHashMap<>(1);
|
||||
for ( final Map.Entry<String, List<GATKSAMRecord>> entry : splitReadsBySample(samples, region.getReads()).entrySet() ) {
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
for ( final GATKSAMRecord read : entry.getValue() ) {
|
||||
likelihoodMap.add(read, refAllele, 0.0);
|
||||
}
|
||||
map.put(entry.getKey(), likelihoodMap);
|
||||
}
|
||||
|
||||
return map;
|
||||
}
|
||||
|
||||
/**
|
||||
* Trim down the active region to just enough to properly genotype the events among the haplotypes
|
||||
*
|
||||
* @param originalActiveRegion our full active region
|
||||
* @param haplotypes the list of haplotypes we've created from assembly
|
||||
* @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty)
|
||||
* @param fullReferenceWithPadding the reference bases over the full padded location
|
||||
* @param paddedReferenceLoc the span of the reference bases
|
||||
* @return an AssemblyResult containing the trimmed active region with all of the reads we should use
|
||||
|
|
@ -771,12 +960,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
*/
|
||||
private AssemblyResult trimActiveRegion(final ActiveRegion originalActiveRegion,
|
||||
final List<Haplotype> haplotypes,
|
||||
final List<VariantContext> activeAllelesToGenotype,
|
||||
final byte[] fullReferenceWithPadding,
|
||||
final GenomeLoc paddedReferenceLoc) {
|
||||
if ( DEBUG ) logger.info("Trimming active region " + originalActiveRegion + " with " + haplotypes.size() + " haplotypes");
|
||||
|
||||
EventMap.buildEventMapsForHaplotypes(haplotypes, fullReferenceWithPadding, paddedReferenceLoc, DEBUG);
|
||||
final TreeSet<VariantContext> allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypes);
|
||||
allVariantsWithinFullActiveRegion.addAll(activeAllelesToGenotype);
|
||||
final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalActiveRegion, allVariantsWithinFullActiveRegion);
|
||||
|
||||
if ( trimmedActiveRegion == null ) {
|
||||
|
|
@ -786,7 +977,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
|
||||
// trim down the haplotypes
|
||||
final Set<Haplotype> haplotypeSet = new HashSet<Haplotype>(haplotypes.size());
|
||||
final Set<Haplotype> haplotypeSet = new HashSet<>(haplotypes.size());
|
||||
for ( final Haplotype h : haplotypes ) {
|
||||
final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc());
|
||||
if ( trimmed != null ) {
|
||||
|
|
@ -797,7 +988,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
|
||||
// create the final list of trimmed haplotypes
|
||||
final List<Haplotype> trimmedHaplotypes = new ArrayList<Haplotype>(haplotypeSet);
|
||||
final List<Haplotype> trimmedHaplotypes = new ArrayList<>(haplotypeSet);
|
||||
|
||||
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
|
||||
Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() );
|
||||
|
|
@ -811,7 +1002,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
|
||||
|
||||
// trim down the reads and add them to the trimmed active region
|
||||
final List<GATKSAMRecord> trimmedReads = new ArrayList<GATKSAMRecord>(originalActiveRegion.getReads().size());
|
||||
final List<GATKSAMRecord> trimmedReads = new ArrayList<>(originalActiveRegion.getReads().size());
|
||||
for( final GATKSAMRecord read : originalActiveRegion.getReads() ) {
|
||||
final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() );
|
||||
if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
|
||||
|
|
@ -824,21 +1015,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Select the best N haplotypes according to their likelihoods, if appropriate
|
||||
*
|
||||
* @param haplotypes a list of haplotypes to consider
|
||||
* @param stratifiedReadMap a map from samples -> read likelihoods
|
||||
* @return the list of haplotypes to genotype
|
||||
*/
|
||||
protected List<Haplotype> selectBestHaplotypesForGenotyping(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
|
||||
if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
return haplotypes;
|
||||
} else {
|
||||
return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation);
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
|
|
@ -853,8 +1029,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
@Override
|
||||
public Integer reduce(List<VariantContext> callsInRegion, Integer numCalledRegions) {
|
||||
for( final VariantContext call : callsInRegion ) {
|
||||
// TODO -- uncomment this line once ART-based walkers have a proper RefMetaDataTracker.
|
||||
// annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call);
|
||||
vcfWriter.add( call );
|
||||
}
|
||||
return (callsInRegion.isEmpty() ? 0 : 1) + numCalledRegions;
|
||||
|
|
@ -862,6 +1036,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it
|
||||
referenceConfidenceModel.close();
|
||||
likelihoodCalculationEngine.close();
|
||||
logger.info("Ran local assembly on " + result + " active regions");
|
||||
}
|
||||
|
||||
|
|
@ -873,35 +1050,32 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
|
||||
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
|
||||
if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
|
||||
final List<GATKSAMRecord> finalizedReadList = new ArrayList<>();
|
||||
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( activeRegion.getReads() );
|
||||
activeRegion.clearReads();
|
||||
|
||||
// Join overlapping paired reads to create a single longer read
|
||||
finalizedReadList.addAll( fragmentCollection.getSingletonReads() );
|
||||
for( final List<GATKSAMRecord> overlappingPair : fragmentCollection.getOverlappingPairs() ) {
|
||||
finalizedReadList.addAll( FragmentUtils.mergeOverlappingPairedFragments(overlappingPair) );
|
||||
}
|
||||
|
||||
// Loop through the reads hard clipping the adaptor and low quality tails
|
||||
final List<GATKSAMRecord> readsToUse = new ArrayList<>(finalizedReadList.size());
|
||||
for( final GATKSAMRecord myRead : finalizedReadList ) {
|
||||
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
|
||||
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
|
||||
GATKSAMRecord clippedRead = useLowQualityBasesForAssembly ? postAdapterRead : ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
|
||||
final List<GATKSAMRecord> readsToUse = new ArrayList<>(activeRegion.getReads().size());
|
||||
for( final GATKSAMRecord myRead : activeRegion.getReads() ) {
|
||||
GATKSAMRecord clippedRead;
|
||||
if (errorCorrectReads)
|
||||
clippedRead = ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION );
|
||||
else if (useLowQualityBasesForAssembly)
|
||||
clippedRead = myRead;
|
||||
else // default case: clip low qual ends of reads
|
||||
clippedRead= ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY );
|
||||
|
||||
if ( dontUseSoftClippedBases ) {
|
||||
// uncomment to remove hard clips from consideration at all
|
||||
clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
|
||||
} else {
|
||||
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
|
||||
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
|
||||
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
|
||||
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
|
||||
// TODO -- reference haplotype start must be removed
|
||||
clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
|
||||
}
|
||||
if ( dontUseSoftClippedBases || ! ReadUtils.hasWellDefinedFragmentSize(clippedRead) ) {
|
||||
// remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all
|
||||
clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
|
||||
} else {
|
||||
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
|
||||
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
|
||||
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
|
||||
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
|
||||
// TODO -- reference haplotype start must be removed
|
||||
clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
|
||||
}
|
||||
|
||||
clippedRead = ( clippedRead.getReadUnmappedFlag() ? clippedRead : ReadClipper.hardClipAdaptorSequence( clippedRead ) );
|
||||
if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) {
|
||||
clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() );
|
||||
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
|
||||
//logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd());
|
||||
|
|
@ -910,20 +1084,18 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
}
|
||||
|
||||
activeRegion.clearReads();
|
||||
activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart));
|
||||
}
|
||||
|
||||
private List<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
final List<GATKSAMRecord> readsToRemove = new ArrayList<>();
|
||||
// logger.info("Filtering non-passing regions: n incoming " + activeRegion.getReads().size());
|
||||
private Set<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
final Set<GATKSAMRecord> readsToRemove = new LinkedHashSet<>();
|
||||
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
|
||||
if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
|
||||
readsToRemove.add(rec);
|
||||
// logger.info("\tremoving read " + rec + " len " + rec.getReadLength());
|
||||
}
|
||||
}
|
||||
activeRegion.removeAll( readsToRemove );
|
||||
// logger.info("Filtered non-passing regions: n remaining " + activeRegion.getReads().size());
|
||||
return readsToRemove;
|
||||
}
|
||||
|
||||
|
|
@ -933,8 +1105,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight);
|
||||
}
|
||||
|
||||
private Map<String, List<GATKSAMRecord>> splitReadsBySample( final List<GATKSAMRecord> reads ) {
|
||||
final Map<String, List<GATKSAMRecord>> returnMap = new HashMap<String, List<GATKSAMRecord>>();
|
||||
private Map<String, List<GATKSAMRecord>> splitReadsBySample( final Collection<GATKSAMRecord> reads ) {
|
||||
return splitReadsBySample(samplesList, reads);
|
||||
}
|
||||
|
||||
public static Map<String, List<GATKSAMRecord>> splitReadsBySample( final List<String> samplesList, final Collection<GATKSAMRecord> reads ) {
|
||||
final Map<String, List<GATKSAMRecord>> returnMap = new HashMap<>();
|
||||
for( final String sample : samplesList) {
|
||||
List<GATKSAMRecord> readList = returnMap.get( sample );
|
||||
if( readList == null ) {
|
||||
|
|
@ -950,4 +1126,11 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* Are we emitting a reference confidence in some form, or not?
|
||||
* @return true if we are
|
||||
*/
|
||||
private boolean emitReferenceConfidence(){
|
||||
return emitReferenceConfidence != ReferenceConfidenceMode.NONE;
|
||||
}
|
||||
}
|
||||
|
|
@ -46,7 +46,11 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Fast wrapper for byte[] kmers
|
||||
|
|
@ -149,6 +153,15 @@ public class Kmer {
|
|||
return bases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Backdoor method for fast base peeking: unlike bases(), it makes no copy and doesn't modify internal state.
|
||||
* Intended to be used for fast computation of neighboring kmers.
* Note that other code in this class reads this kmer's own bases at an offset of start into the backing array,
* so index 0 of the returned array is not guaranteed to be the first base of this kmer.
|
||||
* @return Reference to the complete backing array of bases stored in this kmer (not a copy)
|
||||
* WARNING: UNSAFE, caller should NEVER modify bases. Speed/safety tradeoff!!
|
||||
*/
|
||||
private byte[] unsafePeekAtBases() {
|
||||
return bases;
|
||||
}
|
||||
/**
|
||||
* Get a string representation of the bases of this kmer
|
||||
* @return a non-null string
|
||||
|
|
@ -165,6 +178,45 @@ public class Kmer {
|
|||
return length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a set of differing positions and bases from another k-mer, limiting up to a max distance.
|
||||
* For example, if this = "ACATT" and other = "ACGGT":
|
||||
* - if maxDistance < 2 then -1 will be returned, since distance between kmers is 2.
|
||||
* - If maxDistance >=2, then 2 will be returned, and arrays will be filled as follows:
|
||||
* differingIndeces = {2,3}
|
||||
* differingBases = {'G','G'}
|
||||
* @param other Other k-mer to test
|
||||
* @param maxDistance Maximum distance to search. If this and other k-mers are beyond this Hamming distance,
|
||||
* search is aborted and a null is returned
|
||||
* @param differingIndeces Array with indices of differing bytes in array
|
||||
* @param differingBases Actual differing bases
|
||||
* @return Set of mappings of form (int->byte), where each elements represents index
|
||||
* of k-mer array where bases mismatch, and the byte is the base from other kmer.
|
||||
* If both k-mers differ by more than maxDistance, returns null
|
||||
*/
|
||||
@Requires({"other != null","differingIndeces != null","differingBases != null",
|
||||
"differingIndeces.size>=maxDistance","differingBases.size>=maxDistance"})
|
||||
public int getDifferingPositions(final Kmer other,
|
||||
final int maxDistance,
|
||||
final int[] differingIndeces,
|
||||
final byte[] differingBases) {
|
||||
|
||||
|
||||
int dist = 0;
|
||||
if (length == other.length()) {
|
||||
final byte[] f2 = other.unsafePeekAtBases();
|
||||
for (int i=0; i < length; i++)
|
||||
if(bases[start+i] != f2[i]) {
|
||||
differingIndeces[dist] = i;
|
||||
differingBases[dist++] = f2[i];
|
||||
if (dist > maxDistance)
|
||||
return -1;
|
||||
}
|
||||
|
||||
}
|
||||
return dist;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Kmer{" + new String(bases()) + "}";
|
||||
|
|
|
|||
|
|
@ -48,29 +48,36 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator;
|
||||
import org.broadinstitute.sting.utils.pairhmm.*;
|
||||
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class LikelihoodCalculationEngine {
|
||||
private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class);
|
||||
|
||||
private static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private final byte constantGCP;
|
||||
private final double log10globalReadMismappingRate;
|
||||
private final boolean DEBUG;
|
||||
|
||||
private final PairHMM.HMM_IMPLEMENTATION hmmType;
|
||||
|
||||
private final ThreadLocal<PairHMM> pairHMM = new ThreadLocal<PairHMM>() {
|
||||
|
|
@ -86,6 +93,10 @@ public class LikelihoodCalculationEngine {
|
|||
}
|
||||
};
|
||||
|
||||
private final static boolean WRITE_LIKELIHOODS_TO_FILE = false;
|
||||
private final static String LIKELIHOODS_FILENAME = "likelihoods.txt";
|
||||
private final PrintStream likelihoodsStream;
|
||||
|
||||
/**
|
||||
* The expected rate of random sequencing errors for a read originating from its true haplotype.
|
||||
*
|
||||
|
|
@ -113,12 +124,28 @@ public class LikelihoodCalculationEngine {
|
|||
this.constantGCP = constantGCP;
|
||||
this.DEBUG = debug;
|
||||
this.log10globalReadMismappingRate = log10globalReadMismappingRate;
|
||||
|
||||
if ( WRITE_LIKELIHOODS_TO_FILE ) {
|
||||
try {
|
||||
likelihoodsStream = new PrintStream(new FileOutputStream(new File(LIKELIHOODS_FILENAME)));
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
} else {
|
||||
likelihoodsStream = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Convenience constructor that delegates to the four-argument constructor with fixed defaults:
 * (byte)10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING and -3.
 * NOTE(review): judging by the field types these presumably are the constant gap continuation penalty,
 * the debug flag, the HMM implementation and the log10 global read mismapping rate -- confirm against
 * the full constructor signature.
 */
public LikelihoodCalculationEngine() {
|
||||
this((byte)10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3);
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if ( likelihoodsStream != null ) likelihoodsStream.close();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
|
||||
*
|
||||
|
|
@ -205,6 +232,17 @@ public class LikelihoodCalculationEngine {
|
|||
final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
|
||||
read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
|
||||
|
||||
if ( WRITE_LIKELIHOODS_TO_FILE ) {
|
||||
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
|
||||
haplotype.getBaseString(),
|
||||
new String(read.getReadBases()),
|
||||
SAMUtils.phredToFastq(readQuals),
|
||||
SAMUtils.phredToFastq(readInsQuals),
|
||||
SAMUtils.phredToFastq(readDelQuals),
|
||||
SAMUtils.phredToFastq(overallGCP),
|
||||
log10l);
|
||||
}
|
||||
|
||||
if ( haplotype.isNonReference() )
|
||||
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
|
||||
else
|
||||
|
|
@ -260,7 +298,7 @@ public class LikelihoodCalculationEngine {
|
|||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) *
|
||||
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + LOG_ONE_HALF );
|
||||
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood;
|
||||
|
|
@ -358,11 +396,11 @@ public class LikelihoodCalculationEngine {
|
|||
if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
|
||||
|
||||
// all of the haplotypes that at least one sample called as one of the most likely
|
||||
final Set<Haplotype> selectedHaplotypes = new HashSet<Haplotype>();
|
||||
final Set<Haplotype> selectedHaplotypes = new HashSet<>();
|
||||
selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
|
||||
|
||||
// our annoying map from allele -> haplotype
|
||||
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<Allele, Haplotype>();
|
||||
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
|
||||
for ( final Haplotype h : haplotypes ) {
|
||||
h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
|
||||
allele2Haplotype.put(Allele.create(h, h.isReference()), h);
|
||||
|
|
|
|||
|
|
@ -51,17 +51,12 @@ import com.google.java.contract.Requires;
|
|||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
|
|
@ -78,6 +73,10 @@ import java.util.*;
|
|||
public abstract class LocalAssemblyEngine {
|
||||
private final static Logger logger = Logger.getLogger(LocalAssemblyEngine.class);
|
||||
|
||||
/**
|
||||
* If false, we will only write out a region around the reference source
|
||||
*/
|
||||
private final static boolean PRINT_FULL_GRAPH_FOR_DEBUGGING = true;
|
||||
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
|
||||
private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30;
|
||||
|
||||
|
|
@ -111,7 +110,11 @@ public abstract class LocalAssemblyEngine {
|
|||
* @param refHaplotype the reference haplotype
|
||||
* @return a non-null list of reads
|
||||
*/
|
||||
protected abstract List<SeqGraph> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype);
|
||||
protected abstract List<AssemblyResult> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype, List<Haplotype> activeAlleleHaplotypes);
|
||||
|
||||
protected List<AssemblyResult> assemble(List<GATKSAMRecord> reads, Haplotype refHaplotype) {
|
||||
return assemble(reads, refHaplotype, Collections.<Haplotype>emptyList());
|
||||
}
|
||||
|
||||
/**
|
||||
* Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
|
||||
|
|
@ -120,63 +123,102 @@ public abstract class LocalAssemblyEngine {
|
|||
* @param fullReferenceWithPadding byte array holding the reference sequence with padding
|
||||
* @param refLoc GenomeLoc object corresponding to the reference sequence with padding
|
||||
* @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode
|
||||
* @param readErrorCorrector a ReadErrorCorrector object, if reads are to be corrected before assembly. Can be null if no error corrector is to be used.
|
||||
* @return a non-empty list of all the haplotypes that are produced during assembly
|
||||
*/
|
||||
public List<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List<VariantContext> activeAllelesToGenotype) {
|
||||
public List<Haplotype> runLocalAssembly(final ActiveRegion activeRegion,
|
||||
final Haplotype refHaplotype,
|
||||
final byte[] fullReferenceWithPadding,
|
||||
final GenomeLoc refLoc,
|
||||
final List<VariantContext> activeAllelesToGenotype,
|
||||
final ReadErrorCorrector readErrorCorrector) {
|
||||
if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); }
|
||||
if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); }
|
||||
if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
|
||||
if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
|
||||
|
||||
// create the graphs by calling our subclass assemble method
|
||||
final List<SeqGraph> graphs = assemble(activeRegion.getReads(), refHaplotype);
|
||||
// create the list of artificial haplotypes that should be added to the graph for GGA mode
|
||||
final List<Haplotype> activeAlleleHaplotypes = createActiveAlleleHaplotypes(refHaplotype, activeAllelesToGenotype, activeRegion.getExtendedLoc());
|
||||
|
||||
// do some QC on the graphs
|
||||
for ( final SeqGraph graph : graphs ) { sanityCheckGraph(graph, refHaplotype); }
|
||||
// error-correct reads before clipping low-quality tails: some low quality bases might be good and we want to recover them
|
||||
final List<GATKSAMRecord> correctedReads;
|
||||
if (readErrorCorrector != null) {
|
||||
// now correct all reads in active region after filtering/downsampling
|
||||
// Note that original reads in active region are NOT modified by default, since they will be used later for GL computation,
|
||||
// and we only want the read-error corrected reads for graph building.
|
||||
readErrorCorrector.addReadsToKmers(activeRegion.getReads());
|
||||
correctedReads = new ArrayList<>(readErrorCorrector.correctReads(activeRegion.getReads()));
|
||||
} else {
|
||||
correctedReads = activeRegion.getReads();
|
||||
}
|
||||
|
||||
final List<SeqGraph> nonRefGraphs = new LinkedList<>();
|
||||
// create the graphs by calling our subclass assemble method
|
||||
for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, activeAlleleHaplotypes) ) {
|
||||
if ( result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION ) {
|
||||
// do some QC on the graph
|
||||
sanityCheckGraph(result.getGraph(), refHaplotype);
|
||||
// add it to graphs with meaningful non-reference features
|
||||
nonRefGraphs.add(result.getGraph());
|
||||
}
|
||||
}
|
||||
|
||||
// print the graphs if the appropriate debug option has been turned on
|
||||
if ( graphWriter != null ) { printGraphs(graphs); }
|
||||
if ( graphWriter != null ) { printGraphs(nonRefGraphs); }
|
||||
|
||||
// find the best paths in the graphs and return them as haplotypes
|
||||
return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
|
||||
if ( nonRefGraphs.isEmpty() ) {
|
||||
// we couldn't assemble any meaningful graphs, so return just the reference haplotype
|
||||
return Collections.singletonList(refHaplotype);
|
||||
} else {
|
||||
// find the best paths in the graphs and return them as haplotypes
|
||||
return findBestPaths( nonRefGraphs, refHaplotype, refLoc, activeRegion.getExtendedLoc() );
|
||||
}
|
||||
}
|
||||
|
||||
@Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
|
||||
@Ensures({"result.contains(refHaplotype)"})
|
||||
protected List<Haplotype> findBestPaths(final List<SeqGraph> graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow) {
|
||||
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
|
||||
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<Haplotype>();
|
||||
refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
|
||||
final Cigar c = new Cigar();
|
||||
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
|
||||
refHaplotype.setCigar(c);
|
||||
returnHaplotypes.add( refHaplotype );
|
||||
|
||||
/**
|
||||
* Create the list of artificial GGA-mode haplotypes by injecting each of the provided alternate alleles into the reference haplotype
|
||||
* @param refHaplotype the reference haplotype
|
||||
* @param activeAllelesToGenotype the list of alternate alleles in VariantContexts
|
||||
* @param activeRegionWindow the window containing the reference haplotype
|
||||
* @return a non-null list of haplotypes
|
||||
*/
|
||||
private List<Haplotype> createActiveAlleleHaplotypes(final Haplotype refHaplotype, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow) {
|
||||
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<>();
|
||||
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
|
||||
final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
|
||||
|
||||
// for GGA mode, add the desired allele into the haplotype
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
|
||||
addHaplotypeForGGA( insertedRefHaplotype, refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true );
|
||||
if( insertedRefHaplotype != null ) { // can be null if the requested allele can't be inserted into the haplotype
|
||||
returnHaplotypes.add(insertedRefHaplotype);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new ArrayList<>(returnHaplotypes);
|
||||
}
|
||||
|
||||
@Ensures({"result.contains(refHaplotype)"})
|
||||
protected List<Haplotype> findBestPaths(final List<SeqGraph> graphs, final Haplotype refHaplotype, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow) {
|
||||
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
|
||||
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<>();
|
||||
returnHaplotypes.add( refHaplotype );
|
||||
|
||||
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
|
||||
|
||||
for( final SeqGraph graph : graphs ) {
|
||||
final SeqVertex source = graph.getReferenceSourceVertex();
|
||||
final SeqVertex sink = graph.getReferenceSinkVertex();
|
||||
if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);
|
||||
|
||||
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<SeqVertex,BaseEdge>(allowCyclesInKmerGraphToGeneratePaths);
|
||||
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<>(allowCyclesInKmerGraphToGeneratePaths);
|
||||
for ( final Path<SeqVertex,BaseEdge> path : pathFinder.getKBestPaths(graph, numBestHaplotypesPerGraph, source, sink) ) {
|
||||
// logger.info("Found path " + path);
|
||||
Haplotype h = new Haplotype( path.getBases() );
|
||||
if( !returnHaplotypes.contains(h) ) {
|
||||
final Cigar cigar = path.calculateCigar(refHaplotype.getBases());
|
||||
|
||||
if ( cigar == null ) {
|
||||
// couldn't produce a meaningful alignment of haplotype to reference, fail quitely
|
||||
// couldn't produce a meaningful alignment of haplotype to reference, fail quietly
|
||||
continue;
|
||||
} else if( cigar.isEmpty() ) {
|
||||
throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() +
|
||||
|
|
@ -196,26 +238,7 @@ public abstract class LocalAssemblyEngine {
|
|||
returnHaplotypes.add(h);
|
||||
|
||||
if ( debug )
|
||||
logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
|
||||
|
||||
// for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
if( !activeAllelesToGenotype.isEmpty() ) {
|
||||
final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
|
||||
|
||||
// This if statement used to additionally have:
|
||||
// "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
|
||||
// but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
|
||||
// a haplotype that already contains a 1bp insertion (so practically it is reference but
|
||||
// falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
|
||||
if( vcOnHaplotype == null ) {
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
addHaplotypeForGGA( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.info("Adding haplotype " + h.getCigar() + " from graph with kmer " + graph.getKmerSize());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -238,7 +261,7 @@ public abstract class LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
return new ArrayList<Haplotype>(returnHaplotypes);
|
||||
return new ArrayList<>(returnHaplotypes);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -257,84 +280,25 @@ public abstract class LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
/**
|
||||
* Take a haplotype which was generated by injecting an allele into a string of bases and run SW against the reference to determine the variants on the haplotype.
|
||||
* Unfortunately since this haplotype didn't come from the assembly graph you can't straightforwardly use the bubble traversal algorithm to get this information.
|
||||
* This is a target for future work as we rewrite the HaplotypeCaller to be more bubble-caller based.
|
||||
* @param haplotype the candidate haplotype
|
||||
* @param ref the reference bases to align against
|
||||
* @param haplotypeList the current list of haplotypes
|
||||
* @param activeRegionStart the start of the active region in the reference byte array
|
||||
* @param activeRegionStop the stop of the active region in the reference byte array
|
||||
* @param FORCE_INCLUSION_FOR_GGA_MODE if true will include in the list even if it already exists
|
||||
* @return true if the candidate haplotype was successfully incorporated into the haplotype list
|
||||
* Print graph to file if debugGraphTransformations is enabled
|
||||
* @param graph the graph to print
|
||||
* @param file the destination file
|
||||
*/
|
||||
@Requires({"ref != null", "ref.length >= activeRegionStop - activeRegionStart"})
|
||||
private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final Set<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) {
|
||||
if( haplotype == null ) { return false; }
|
||||
|
||||
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS );
|
||||
haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
|
||||
|
||||
if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
|
||||
return false;
|
||||
}
|
||||
|
||||
haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) );
|
||||
|
||||
final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate(haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true);
|
||||
int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true );
|
||||
if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) {
|
||||
hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal
|
||||
}
|
||||
byte[] newHaplotypeBases;
|
||||
// extend partial haplotypes to contain the full active region sequence
|
||||
if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
newHaplotypeBases = ArrayUtils.addAll(ArrayUtils.addAll(ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()),
|
||||
haplotype.getBases()),
|
||||
ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop));
|
||||
} else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) );
|
||||
} else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
|
||||
} else {
|
||||
newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop);
|
||||
}
|
||||
|
||||
final Haplotype h = new Haplotype( newHaplotypeBases );
|
||||
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS );
|
||||
|
||||
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
|
||||
if ( haplotype.isArtificialHaplotype() ) {
|
||||
h.setArtificialEvent(haplotype.getArtificialEvent());
|
||||
}
|
||||
if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart || swConsensus2.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
|
||||
return false;
|
||||
}
|
||||
|
||||
h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0, true) );
|
||||
|
||||
if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) {
|
||||
haplotypeList.add(h);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
protected void printDebugGraphTransform(final BaseGraph graph, final File file) {
|
||||
if ( debugGraphTransformations ) {
|
||||
if ( PRINT_FULL_GRAPH_FOR_DEBUGGING )
|
||||
graph.printGraph(file, pruneFactor);
|
||||
else
|
||||
graph.subsetToRefSource().printGraph(file, pruneFactor);
|
||||
}
|
||||
}
|
||||
|
||||
protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) {
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
|
||||
protected AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) {
|
||||
printDebugGraphTransform(seqGraph, new File("sequenceGraph.1.dot"));
|
||||
|
||||
// TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm
|
||||
// TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect
|
||||
// TODO -- to anything from one that actually has good support along the chain but just happens
|
||||
// TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately
|
||||
// TODO -- the pruning algorithm really should be an error correction algorithm that knows more
|
||||
// TODO -- about the structure of the data and can differentiate between an infrequent path but
|
||||
// TODO -- without evidence against it (such as occurs when a region is hard to get any reads through)
|
||||
// TODO -- from an error with lots of weight going along another similar path
|
||||
// the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
|
||||
seqGraph.zipLinearChains();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor);
|
||||
printDebugGraphTransform(seqGraph, new File("sequenceGraph.2.zipped.dot"));
|
||||
|
||||
// now go through and prune the graph, removing vertices no longer connected to the reference chain
|
||||
// IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight
|
||||
|
|
@ -342,15 +306,15 @@ public abstract class LocalAssemblyEngine {
|
|||
seqGraph.pruneGraph(pruneFactor);
|
||||
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
|
||||
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor);
|
||||
printDebugGraphTransform(seqGraph, new File("sequenceGraph.3.pruned.dot"));
|
||||
seqGraph.simplifyGraph();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor);
|
||||
printDebugGraphTransform(seqGraph, new File("sequenceGraph.4.merged.dot"));
|
||||
|
||||
// The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
|
||||
// happen in cases where for example the reference somehow manages to acquire a cycle, or
|
||||
// where the entire assembly collapses back into the reference sequence.
|
||||
if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null )
|
||||
return null;
|
||||
return new AssemblyResult(AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE, seqGraph);
|
||||
|
||||
seqGraph.removePathsNotConnectedToRef();
|
||||
seqGraph.simplifyGraph();
|
||||
|
|
@ -363,16 +327,15 @@ public abstract class LocalAssemblyEngine {
|
|||
seqGraph.addVertex(dummy);
|
||||
seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0));
|
||||
}
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor);
|
||||
printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot"));
|
||||
|
||||
return seqGraph;
|
||||
return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph);
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform general QC on the graph to make sure something hasn't gone wrong during assembly
|
||||
* @param graph the graph to check
|
||||
* @param refHaplotype the reference haplotype
|
||||
* @param <T>
|
||||
*/
|
||||
private <T extends BaseVertex, E extends BaseEdge> void sanityCheckGraph(final BaseGraph<T,E> graph, final Haplotype refHaplotype) {
|
||||
sanityCheckReferenceGraph(graph, refHaplotype);
|
||||
|
|
@ -383,7 +346,6 @@ public abstract class LocalAssemblyEngine {
|
|||
*
|
||||
* @param graph the graph to check
|
||||
* @param refHaplotype the reference haplotype
|
||||
* @param <T>
|
||||
*/
|
||||
private <T extends BaseVertex, E extends BaseEdge> void sanityCheckReferenceGraph(final BaseGraph<T,E> graph, final Haplotype refHaplotype) {
|
||||
if( graph.getReferenceSourceVertex() == null ) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,526 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Utility class that error-corrects reads.
|
||||
* Main idea: An error in a read will appear as a bubble in a k-mer (de Bruijn) graph and such bubble will have very low multiplicity.
|
||||
* Hence, read errors will appear as "sparse" kmers with very little support.
|
||||
* Historically, the most common approach to error-correct reads before assembly has been to first compute the kmer spectrum of the reads,
|
||||
* defined as the kmer composition of a set of reads along with the multiplicity of each kmer.
|
||||
* First-generation correctors like the Euler corrector (Pevzner 2001) mapped low frequency kmers (kmers appearing say below N times)
|
||||
* into high frequency ones that lay within a certain Hamming or edit distance.
|
||||
* This is doable, but has some drawbacks:
|
||||
* - Kmers used for error correction become tied to kmers used for graph building.
|
||||
* - Hence, large kmers (desirable for graph building because they can resolve repeats better) are a hindrance for error correction,
|
||||
* because they are seen less often.
|
||||
* - After error correction, there is no guarantee that a sequence of kmers corresponds to an "actual" read.
|
||||
*
|
||||
* An error-corrected set of reads also makes a much smoother graph without the need to resolve so many bubbles.
|
||||
*
|
||||
* Idea hence is to correct reads based on their kmer content, but in a context independent from graph building.
|
||||
* In order to do this, the following steps are taken:
|
||||
* - The k-mer spectrum of a set of reads is computed. However, we are free to choose the most convenient k-mer size (typically around
|
||||
* read length /2).
|
||||
* - We partition the set of observed k-mers into "solid" kmers which have multiplicity > M, and "unsolid" ones otherwise (Pevzner 2001).
|
||||
*
|
||||
* - Main idea of the algorithm is to try to substitute a sequence of bases in a read by a sequence better supported by kmers.
|
||||
* - For each "unsolid" kmer observed in reads, we try to find a "solid" kmer within a maximum Hamming distance.
|
||||
* - If such solid kmer exists, then this unsolid kmer is "correctable", otherwise, uncorrectable.
|
||||
* - For each read, then:
|
||||
* -- Walk through read and visit all kmers.
|
||||
* -- If kmer is solid, continue to next kmer.
|
||||
* -- If not, and if it's correctable (i.e. there exists a mapping from an unsolid kmer to a solid kmer within a given Hamming distance),
|
||||
* add the bases and offsets corresponding to differing positions between unsolid and solid kmer to correction list.
|
||||
* -- At the end, each base in read will have a list of corrections associated with it. We can then choose to correct or not.
|
||||
* If read has only consistent corrections, then we can correct base to common base in corrections.
|
||||
*
|
||||
* TODO:
|
||||
* todo Q: WHAT QUALITY TO USE??
|
||||
* todo how do we deal with mate pairs?
|
||||
*
|
||||
*
|
||||
|
||||
|
||||
*/
|
||||
public class ReadErrorCorrector {
|
||||
private final static Logger logger = Logger.getLogger(ReadErrorCorrector.class);
/**
 * Number of occurrences of each kmer seen so far; populated by addReadKmers
 */
KMerCounter countsByKMer;

// Observed kmer -> kmer it should be rewritten to (a solid kmer maps to itself); filled by computeKmerCorrectionMap
Map<Kmer,Kmer> kmerCorrectionMap = new HashMap<>();
// Observed kmer -> (offsets within the kmer, replacement bases) describing how it differs from its correction target
Map<Kmer,Pair<int[],byte[]>> kmerDifferingBases = new HashMap<>();
// Length of the kmers that are counted and corrected
private final int kmerLength;
// When true, correctReads logs the correction statistics and addReadsToKmers logs every counted kmer
private final boolean debug;
// When true, correctReads hard-clips the low-quality ends of each read after correction
private final boolean trimLowQualityBases;
// Quality threshold handed to ReadClipper.hardClipLowQualEnds when trimming is enabled
private final byte minTailQuality;
// Maximum distance passed to findNearestNeighbor when looking for a correction target
private final int maxMismatchesToCorrect;
// Base quality assigned to every base that gets corrected
private final byte qualityOfCorrectedBases;
// Kmers observed at most this many times are candidates for correction
private final int maxObservationsForKmerToBeCorrectable;
// Value computed once from the reference by computeMaxHLen in the constructor
private final int maxHomopolymerLengthInRegion;
// Kmers observed at least this many times are considered solid
private final int minObservationsForKmerToBeSolid;

// default values, for debugging
private final static boolean doInplaceErrorCorrection = false; // currently not used, since we want corrected reads to be used only for assembly
private final static int MAX_MISMATCHES_TO_CORRECT = 2;
private final static byte QUALITY_OF_CORRECTED_BASES = 30; // what's a reasonable value here?
private final static int MAX_OBSERVATIONS_FOR_KMER_TO_BE_CORRECTABLE = 1;
private final static boolean TRIM_LOW_QUAL_TAILS = false;
// When true, kmer counting and correction are skipped if maxHomopolymerLengthInRegion exceeds MAX_HOMOPOLYMER_THRESHOLD
private final static boolean DONT_CORRECT_IN_LONG_HOMOPOLYMERS = false;
private final static int MAX_HOMOPOLYMER_THRESHOLD = 12;

// debug counter structure
private final ReadErrorCorrectionStats readErrorCorrectionStats = new ReadErrorCorrectionStats();
|
||||
|
||||
/**
 * Create a new kmer corrector
 *
 * @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1
 * @param maxMismatchesToCorrect maximum distance between an unsolid kmer and the kmer it may be corrected to, must be >= 1
 * @param maxObservationsForKmerToBeCorrectable kmers observed at most this many times are candidates for correction
 * @param qualityOfCorrectedBases Bases to be corrected will be assigned this quality,
 *                                must be >= 2 and <= QualityUtils.MAX_REASONABLE_Q_SCORE
 * @param minObservationsForKmerToBeSolid kmers observed at least this many times are considered solid
 * @param trimLowQualityBases if true, low-quality read ends are hard-clipped after correction
 * @param minTailQuality quality threshold used when trimLowQualityBases is set
 * @param debug Output debug information
 * @param fullReferenceWithPadding reference bases of the region, used to compute its longest homopolymer run; must not be null
 * @throws IllegalArgumentException if any argument violates the constraints above
 */
public ReadErrorCorrector(final int kmerLength,
                          final int maxMismatchesToCorrect,
                          final int maxObservationsForKmerToBeCorrectable,
                          final byte qualityOfCorrectedBases,
                          final int minObservationsForKmerToBeSolid,
                          final boolean trimLowQualityBases,
                          final byte minTailQuality,
                          final boolean debug,
                          final byte[] fullReferenceWithPadding) {
    if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength);
    if ( maxMismatchesToCorrect < 1 )
        throw new IllegalArgumentException("maxMismatchesToCorrect must be >= 1 but got " + maxMismatchesToCorrect);
    if ( qualityOfCorrectedBases < 2 || qualityOfCorrectedBases > QualityUtils.MAX_REASONABLE_Q_SCORE)
        throw new IllegalArgumentException("qualityOfCorrectedBases must be >= 2 and <= MAX_REASONABLE_Q_SCORE but got " + qualityOfCorrectedBases);
    // fail with a clear message instead of a NullPointerException deep inside computeMaxHLen
    if ( fullReferenceWithPadding == null )
        throw new IllegalArgumentException("fullReferenceWithPadding must not be null");

    countsByKMer = new KMerCounter(kmerLength);
    this.kmerLength = kmerLength;
    this.maxMismatchesToCorrect = maxMismatchesToCorrect;
    this.qualityOfCorrectedBases = qualityOfCorrectedBases;
    this.minObservationsForKmerToBeSolid = minObservationsForKmerToBeSolid;
    this.trimLowQualityBases = trimLowQualityBases;
    this.minTailQuality = minTailQuality;
    this.debug = debug;
    this.maxObservationsForKmerToBeCorrectable = maxObservationsForKmerToBeCorrectable;

    // when region has long homopolymers, we may want not to correct reads, since assessment is complicated,
    // so we may decide to skip error correction in these regions
    maxHomopolymerLengthInRegion = computeMaxHLen(fullReferenceWithPadding);
}
|
||||
|
||||
/**
 * Convenience constructor that fills in the class defaults for every setting not listed here
 * (mismatch limit, correctable-observation limit, corrected-base quality and tail trimming).
 *
 * @param kmerLength                      K-mer length for error correction (not necessarily the same as for assembly graph)
 * @param minTailQuality                  Minimum tail quality: remaining bases with Q's below this value are hard-clipped after correction
 * @param minObservationsForKmerToBeSolid kmers observed at least this many times are considered solid
 * @param debug                           Output debug information
 * @param fullReferenceWithPadding        reference bases of the region being processed
 */
public ReadErrorCorrector(final int kmerLength,
                          final byte minTailQuality,
                          final int minObservationsForKmerToBeSolid,
                          final boolean debug,
                          final byte[] fullReferenceWithPadding) {
    this(kmerLength,
         MAX_MISMATCHES_TO_CORRECT,
         MAX_OBSERVATIONS_FOR_KMER_TO_BE_CORRECTABLE,
         QUALITY_OF_CORRECTED_BASES,
         minObservationsForKmerToBeSolid,
         TRIM_LOW_QUAL_TAILS,
         minTailQuality,
         debug,
         fullReferenceWithPadding);
}
|
||||
|
||||
/**
|
||||
* Main entry routine to add all kmers in a read to the read map counter
|
||||
* @param read Read to add bases
|
||||
*/
|
||||
@Requires("read != null")
|
||||
protected void addReadKmers(final GATKSAMRecord read) {
|
||||
if (DONT_CORRECT_IN_LONG_HOMOPOLYMERS && maxHomopolymerLengthInRegion > MAX_HOMOPOLYMER_THRESHOLD)
|
||||
return;
|
||||
|
||||
final byte[] readBases = read.getReadBases();
|
||||
for (int offset = 0; offset <= readBases.length-kmerLength; offset++ ) {
|
||||
countsByKMer.addKmer(new Kmer(readBases,offset,kmerLength),1);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Correct a collection of reads based on stored k-mer counts
|
||||
* @param reads
|
||||
*/
|
||||
public final List<GATKSAMRecord> correctReads(final Collection<GATKSAMRecord> reads) {
|
||||
|
||||
final List<GATKSAMRecord> correctedReads = new ArrayList<>(reads.size());
|
||||
if (DONT_CORRECT_IN_LONG_HOMOPOLYMERS && maxHomopolymerLengthInRegion > MAX_HOMOPOLYMER_THRESHOLD) {
|
||||
// just copy reads into output and exit
|
||||
correctedReads.addAll(reads);
|
||||
}
|
||||
else {
|
||||
computeKmerCorrectionMap();
|
||||
for (final GATKSAMRecord read: reads) {
|
||||
final GATKSAMRecord correctedRead = correctRead(read);
|
||||
if (trimLowQualityBases)
|
||||
correctedReads.add(ReadClipper.hardClipLowQualEnds(correctedRead, minTailQuality));
|
||||
else
|
||||
correctedReads.add(correctedRead);
|
||||
}
|
||||
if (debug) {
|
||||
logger.info("Number of corrected bases:"+readErrorCorrectionStats.numBasesCorrected);
|
||||
logger.info("Number of corrected reads:"+readErrorCorrectionStats.numReadsCorrected);
|
||||
logger.info("Number of skipped reads:"+readErrorCorrectionStats.numReadsUncorrected);
|
||||
logger.info("Number of solid kmers:"+readErrorCorrectionStats.numSolidKmers);
|
||||
logger.info("Number of corrected kmers:"+readErrorCorrectionStats.numCorrectedKmers);
|
||||
logger.info("Number of uncorrectable kmers:"+readErrorCorrectionStats.numUncorrectableKmers);
|
||||
}
|
||||
}
|
||||
return correctedReads;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Do actual read correction based on k-mer map. First, loop through stored k-mers to get a list of possible corrections
|
||||
* for each position in the read. Then correct read based on all possible consistent corrections.
|
||||
* @param inputRead Read to correct
|
||||
* @return Corrected read (can be same reference as input if doInplaceErrorCorrection is set)
|
||||
*/
|
||||
@Requires("inputRead != null")
|
||||
private GATKSAMRecord correctRead(final GATKSAMRecord inputRead) {
|
||||
// no support for reduced reads (which shouldn't need to be error-corrected anyway!)
|
||||
if (inputRead.isReducedRead())
|
||||
return inputRead;
|
||||
|
||||
// do actual correction
|
||||
boolean corrected = false;
|
||||
final byte[] correctedBases = inputRead.getReadBases();
|
||||
final byte[] correctedQuals = inputRead.getBaseQualities();
|
||||
|
||||
// array to store list of possible corrections for read
|
||||
final CorrectionSet correctionSet = buildCorrectionMap(correctedBases);
|
||||
|
||||
for (int offset = 0; offset < correctedBases.length; offset++) {
|
||||
final Byte b = correctionSet.getConsensusCorrection(offset);
|
||||
if (b != null && b != correctedBases[offset]) {
|
||||
correctedBases[offset] = b;
|
||||
correctedQuals[offset] = qualityOfCorrectedBases;
|
||||
corrected = true;
|
||||
}
|
||||
readErrorCorrectionStats.numBasesCorrected++;
|
||||
}
|
||||
|
||||
if (corrected) {
|
||||
readErrorCorrectionStats.numReadsCorrected++;
|
||||
if (doInplaceErrorCorrection) {
|
||||
inputRead.setReadBases(correctedBases);
|
||||
inputRead.setBaseQualities(correctedQuals);
|
||||
return inputRead;
|
||||
}
|
||||
else {
|
||||
GATKSAMRecord correctedRead = new GATKSAMRecord(inputRead);
|
||||
|
||||
// do the actual correction
|
||||
// todo - do we need to clone anything else from read?
|
||||
correctedRead.setBaseQualities(inputRead.getBaseQualities());
|
||||
correctedRead.setIsStrandless(inputRead.isStrandless());
|
||||
correctedRead.setReadBases(inputRead.getReadBases());
|
||||
correctedRead.setReadString(inputRead.getReadString());
|
||||
correctedRead.setReadGroup(inputRead.getReadGroup());
|
||||
return correctedRead;
|
||||
}
|
||||
}
|
||||
else {
|
||||
readErrorCorrectionStats.numReadsUncorrected++;
|
||||
return inputRead;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Build correction map for each of the bases in read.
|
||||
* For each of the constituent kmers in read:
|
||||
* a) See whether the kmer has been mapped to a corrected kmer.
|
||||
* b) If so, get list of differing positions and corresponding bases.
|
||||
* c) Add then list of new bases to index in correction list.
|
||||
* Correction list is of read size, and holds a list of bases to correct.
|
||||
* @param correctedBases Bases to attempt to correct
|
||||
* @return CorrectionSet object.
|
||||
*/
|
||||
@Requires("correctedBases != null")
|
||||
private CorrectionSet buildCorrectionMap(final byte[] correctedBases) {
|
||||
// array to store list of possible corrections for read
|
||||
final CorrectionSet correctionSet = new CorrectionSet(correctedBases.length);
|
||||
|
||||
for (int offset = 0; offset <= correctedBases.length-kmerLength; offset++ ) {
|
||||
final Kmer kmer = new Kmer(correctedBases,offset,kmerLength);
|
||||
final Kmer newKmer = kmerCorrectionMap.get(kmer);
|
||||
if (newKmer != null && !newKmer.equals(kmer)){
|
||||
final Pair<int[],byte[]> differingPositions = kmerDifferingBases.get(kmer);
|
||||
final int[] differingIndeces = differingPositions.first;
|
||||
final byte[] differingBases = differingPositions.second;
|
||||
|
||||
for (int k=0; k < differingIndeces.length; k++) {
|
||||
// get list of differing positions for corrected kmer
|
||||
// for each of these, add correction candidate to correction set
|
||||
correctionSet.add(offset + differingIndeces[k],differingBases[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return correctionSet;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Top-level entry point that adds a collection of reads to our kmer list.
|
||||
* For each read in list, its constituent kmers will be logged in our kmer table.
|
||||
* @param reads
|
||||
*/
|
||||
@Requires("reads != null")
|
||||
public void addReadsToKmers(final Collection<GATKSAMRecord> reads) {
|
||||
for (final GATKSAMRecord read: reads)
|
||||
addReadKmers(read);
|
||||
|
||||
if (debug)
|
||||
for ( final KMerCounter.CountedKmer countedKmer: countsByKMer.getCountedKmers() )
|
||||
logger.info(String.format("%s\t%d\n", countedKmer.kmer, countedKmer.count));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* For each kmer we've seen, do the following:
|
||||
* a) If kmer count > threshold1, this kmer is good, so correction map will be to itself.
|
||||
* b) If kmer count <= threshold2, this kmer is bad.
|
||||
* In that case, loop through all other kmers. If kmer is good, compute distance, and get minimal distance.
|
||||
* If such distance is < some threshold, map to this kmer, and record differing positions and bases.
|
||||
*
|
||||
*/
|
||||
private void computeKmerCorrectionMap() {
|
||||
for (final KMerCounter.CountedKmer storedKmer : countsByKMer.getCountedKmers()) {
|
||||
if (storedKmer.getCount() >= minObservationsForKmerToBeSolid) {
|
||||
// this kmer is good: map to itself
|
||||
kmerCorrectionMap.put(storedKmer.getKmer(),storedKmer.getKmer());
|
||||
kmerDifferingBases.put(storedKmer.getKmer(),new Pair<>(new int[0],new byte[0])); // dummy empty array
|
||||
readErrorCorrectionStats.numSolidKmers++;
|
||||
}
|
||||
else if (storedKmer.getCount() <= maxObservationsForKmerToBeCorrectable) {
|
||||
// loop now thru all other kmers to find nearest neighbor
|
||||
final Pair<Kmer,Pair<int[],byte[]>> nearestNeighbor = findNearestNeighbor(storedKmer.getKmer(),countsByKMer,maxMismatchesToCorrect);
|
||||
|
||||
// check if nearest neighbor lies in a close vicinity. If so, log the new bases and the correction map
|
||||
if (nearestNeighbor != null) { // ok, found close neighbor
|
||||
kmerCorrectionMap.put(storedKmer.getKmer(), nearestNeighbor.first);
|
||||
kmerDifferingBases.put(storedKmer.getKmer(), nearestNeighbor.second);
|
||||
readErrorCorrectionStats.numCorrectedKmers++;
|
||||
// if (debug)
|
||||
// logger.info("Original kmer:"+storedKmer + "\tCorrected kmer:"+nearestNeighbor.first+"\tDistance:"+dist);
|
||||
}
|
||||
else
|
||||
readErrorCorrectionStats.numUncorrectableKmers++;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds nearest neighbor of a given k-mer, among a list of counted K-mers, up to a given distance.
|
||||
* If many k-mers share same closest distance, an arbitrary k-mer is picked
|
||||
* @param kmer K-mer of interest
|
||||
* @param countsByKMer KMerCounter storing set of counted k-mers (may include kmer of interest)
|
||||
* @param maxDistance Maximum distance to search
|
||||
* @return Pair of values: closest K-mer in Hamming distance and list of differing bases.
|
||||
* If no neighbor can be found up to given distance, returns null
|
||||
*/
|
||||
@Requires({"kmer != null", "countsByKMer != null","maxDistance >= 1"})
|
||||
private Pair<Kmer,Pair<int[],byte[]>> findNearestNeighbor(final Kmer kmer,
|
||||
final KMerCounter countsByKMer,
|
||||
final int maxDistance) {
|
||||
int minimumDistance = Integer.MAX_VALUE;
|
||||
Kmer closestKmer = null;
|
||||
|
||||
final int[] differingIndeces = new int[maxDistance+1];
|
||||
final byte[] differingBases = new byte[maxDistance+1];
|
||||
|
||||
final int[] closestDifferingIndices = new int[maxDistance+1];
|
||||
final byte[] closestDifferingBases = new byte[maxDistance+1];
|
||||
|
||||
for (final KMerCounter.CountedKmer candidateKmer : countsByKMer.getCountedKmers()) {
|
||||
// skip if candidate set includes test kmer
|
||||
if (candidateKmer.getKmer().equals(kmer))
|
||||
continue;
|
||||
|
||||
final int hammingDistance = kmer.getDifferingPositions(candidateKmer.getKmer(), maxDistance, differingIndeces, differingBases);
|
||||
if (hammingDistance < 0) // can't compare kmer? skip
|
||||
continue;
|
||||
|
||||
if (hammingDistance < minimumDistance) {
|
||||
minimumDistance = hammingDistance;
|
||||
closestKmer = candidateKmer.getKmer();
|
||||
System.arraycopy(differingBases,0,closestDifferingBases,0,differingBases.length);
|
||||
System.arraycopy(differingIndeces,0,closestDifferingIndices,0,differingIndeces.length);
|
||||
}
|
||||
}
|
||||
return new Pair<>(closestKmer, new Pair<>(closestDifferingIndices,closestDifferingBases));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* experimental function to compute max homopolymer length in a given reference context
|
||||
* @param fullReferenceWithPadding Reference context of interest
|
||||
* @return Max homopolymer length in region
|
||||
*/
|
||||
@Requires("fullReferenceWithPadding != null")
|
||||
private static int computeMaxHLen(final byte[] fullReferenceWithPadding) {
|
||||
|
||||
int leftRun = 1;
|
||||
int maxRun = 1;
|
||||
for ( int i = 1; i < fullReferenceWithPadding.length; i++) {
|
||||
if ( fullReferenceWithPadding[i] == fullReferenceWithPadding[i-1] )
|
||||
leftRun++;
|
||||
else
|
||||
leftRun = 1;
|
||||
}
|
||||
if (leftRun > maxRun)
|
||||
maxRun = leftRun;
|
||||
|
||||
|
||||
return maxRun;
|
||||
}
|
||||
|
||||
/**
 * Plain holder for the debug counters that correctReads reports when debug output is enabled.
 */
private static final class ReadErrorCorrectionStats {
    // reported as "Number of corrected reads"; incremented by correctRead when a read was changed
    public int numReadsCorrected;
    // reported as "Number of skipped reads"; incremented by correctRead when a read was left unchanged
    public int numReadsUncorrected;
    // reported as "Number of corrected bases"
    public int numBasesCorrected;
    // reported as "Number of solid kmers"; incremented by computeKmerCorrectionMap
    public int numSolidKmers;
    // reported as "Number of uncorrectable kmers"; incremented by computeKmerCorrectionMap
    public int numUncorrectableKmers;
    // reported as "Number of corrected kmers"; incremented by computeKmerCorrectionMap
    public int numCorrectedKmers;
}
|
||||
|
||||
/**
|
||||
* Wrapper utility class that holds, for each position in read, a list of bytes representing candidate corrections.
|
||||
* So, a read ACAGT where the middle A has found to be errorful might look like:
|
||||
* 0: {}
|
||||
* 1: {}
|
||||
* 2: {'C','C','C'}
|
||||
* 3: {}
|
||||
* 4: {}
|
||||
*
|
||||
* It's up to the method getConsensusCorrection() to decide how to use the correction sets for each position.
|
||||
* By default, only strict consensus is allowed right now.
|
||||
*
|
||||
*/
|
||||
protected static class CorrectionSet {
|
||||
private final int size;
|
||||
private ArrayList<List<Byte>> corrections;
|
||||
|
||||
/**
|
||||
* Main class constructor.
|
||||
* @param size Size of correction set, needs to be set equal to the read being corrected
|
||||
*/
|
||||
public CorrectionSet(final int size) {
|
||||
this.size = size;
|
||||
corrections = new ArrayList<>(size);
|
||||
for (int k=0; k < size; k++)
|
||||
corrections.add(k,new ArrayList<Byte>());
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a base to this correction set at a particular offset, measured from the start of the read
|
||||
* @param offset Offset from start of read
|
||||
* @param base base to be added to list of corrections at this offset
|
||||
*/
|
||||
public void add(final int offset, final byte base) {
|
||||
if (offset >= size || offset < 0)
|
||||
throw new IllegalStateException("Bad entry into CorrectionSet: offset > size");
|
||||
if (!BaseUtils.isRegularBase(base))
|
||||
return; // no irregular base correction
|
||||
|
||||
final List<Byte> storedBytes = corrections.get(offset);
|
||||
storedBytes.add(base);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get list of corrections for a particular offset
|
||||
* @param offset Offset of interest
|
||||
* @return List of bases representing possible corrections at this offset
|
||||
*/
|
||||
public List<Byte> get(final int offset) {
|
||||
if (offset >= size || offset < 0)
|
||||
throw new IllegalArgumentException("Illegal call of CorrectionSet.get(): offset must be < size");
|
||||
return corrections.get(offset);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get consensus correction for a particular offset. In this implementation, it just boils down to seeing if
|
||||
* byte list associated with offset has identical values. If so, return this base, otherwise return null.
|
||||
* @param offset
|
||||
* @return Consensus base, or null if no consensus possible.
|
||||
*/
|
||||
public Byte getConsensusCorrection(final int offset) {
|
||||
if (offset >= size || offset < 0)
|
||||
throw new IllegalArgumentException("Illegal call of CorrectionSet.getConsensusCorrection(): offset must be < size");
|
||||
final List<Byte> storedBytes = corrections.get(offset);
|
||||
if (storedBytes.isEmpty())
|
||||
return null;
|
||||
|
||||
// todo - is there a cheaper/nicer way to compare if all elements in list are identical??
|
||||
final byte lastBase = storedBytes.remove(storedBytes.size()-1);
|
||||
for (final Byte b: storedBytes) {
|
||||
// strict correction rule: all bases must match
|
||||
if (b != lastBase)
|
||||
return null;
|
||||
}
|
||||
|
||||
// all bytes then are equal:
|
||||
return lastBase;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
/**
 * Result holder for the genotype call of a single sample: reference vs. any non-ref event
 *
 * User: depristo
 * Date: 6/21/13
 * Time: 12:58 PM
 */
final class RefVsAnyResult {
    /**
     * Genotype likelihoods, in the order ref/ref, ref/non-ref, non-ref/non-ref
     */
    final double[] genotypeLikelihoods = new double[3];

    /**
     * AD field values: index 0 is ref, index 1 is non-ref
     */
    final int[] AD_Ref_Any = new int[2];

    /**
     * @return the DP, computed as the sum of the two AD values
     */
    public int getDP() {
        final int refCount = AD_Ref_Any[0];
        final int nonRefCount = AD_Ref_Any[1];
        return refCount + nonRefCount;
    }
}
|
||||
|
|
@ -0,0 +1,476 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
|
||||
import org.broadinstitute.sting.utils.haplotypeBAMWriter.ReadDestination;
|
||||
import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFSimpleHeaderLine;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Code for estimating the reference confidence
|
||||
*
|
||||
* This code can estimate the probability that the data for a single sample is consistent with a
|
||||
* well-determined REF/REF diploid genotype.
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 6/21/13
|
||||
* Time: 12:52 PM
|
||||
*/
|
||||
public class ReferenceConfidenceModel {
|
||||
public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF";
|
||||
public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site
|
||||
|
||||
public final static String INDEL_INFORMATIVE_DEPTH = "CD";
|
||||
|
||||
private final GenomeLocParser genomeLocParser;
|
||||
private final Set<String> samples;
|
||||
private final SAMFileHeader header; // TODO -- really shouldn't depend on this
|
||||
private final int indelInformativeDepthIndelSize;
|
||||
|
||||
private final static boolean WRITE_DEBUGGING_BAM = false;
|
||||
private final SAMFileWriter debuggingWriter;
|
||||
|
||||
/**
 * Create a new ReferenceConfidenceModel
 *
 * @param genomeLocParser how we create genome locs (cannot be null)
 * @param samples the list of all samples we'll be considering with this model (cannot be null or empty)
 * @param header the SAMFileHeader describing the read information (used for debugging; cannot be null)
 * @param indelInformativeDepthIndelSize the max size of indels to consider when calculating indel informative depths (must be >= 0)
 * @throws IllegalArgumentException if any argument violates the constraints above
 */
public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser,
                                final Set<String> samples,
                                final SAMFileHeader header,
                                final int indelInformativeDepthIndelSize) {
    if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null");
    if ( samples == null ) throw new IllegalArgumentException("samples cannot be null");
    if ( samples.isEmpty() ) throw new IllegalArgumentException("samples cannot be empty");
    // messages below state exactly what the checks enforce (null header, negative indel size)
    if ( header == null ) throw new IllegalArgumentException("header cannot be null");
    if ( indelInformativeDepthIndelSize < 0 ) throw new IllegalArgumentException("indelInformativeDepthIndelSize must be >= 0 but got " + indelInformativeDepthIndelSize);

    this.genomeLocParser = genomeLocParser;
    this.samples = samples;
    this.header = header;
    this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize;

    // the debugging BAM is only opened when the compile-time switch is on; close() releases it
    if ( WRITE_DEBUGGING_BAM ) {
        final SAMFileWriterFactory factory = new SAMFileWriterFactory();
        factory.setCreateIndex(true);
        debuggingWriter = factory.makeBAMWriter(header, false, new File("refCalc.bam"));
    } else {
        debuggingWriter = null;
    }
}
|
||||
|
||||
/**
|
||||
* Get the VCF header lines to include when emitting reference confidence values via calculateRefConfidence
|
||||
* @return a non-null set of VCFHeaderLines
|
||||
*/
|
||||
public Set<VCFHeaderLine> getVCFHeaderLines() {
|
||||
final Set<VCFHeaderLine> headerLines = new LinkedHashSet<>();
|
||||
headerLines.add(new VCFSimpleHeaderLine("ALT", NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
|
||||
headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize));
|
||||
return headerLines;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close down this reference model, closing down any debugging information opened during execution
|
||||
*/
|
||||
public void close() {
|
||||
if ( debuggingWriter != null ) debuggingWriter.close();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculate the reference confidence for a single sample given the its read data
|
||||
*
|
||||
* Returns a list of variant contexts, one for each position in the activeregion.getLoc(), each containing
|
||||
* detailed information about the certainty that the sample is hom-ref for each base in the region.
|
||||
*
|
||||
*
|
||||
*
|
||||
* @param refHaplotype the reference haplotype, used to get the reference bases across activeRegion.getLoc()
|
||||
* @param calledHaplotypes a list of haplotypes that segregate in this region, for realignment of the reads in the
|
||||
* stratifiedReadMap, corresponding to each reads best haplotype. Must contain the refHaplotype.
|
||||
* @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc())
|
||||
* @param activeRegion the active region we want to get the reference confidence over
|
||||
* @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes
|
||||
* @param variantCalls calls made in this region. The return result will contain any variant call in this list in the
|
||||
* correct order by genomic position, and any variant in this list will stop us emitting a ref confidence
|
||||
* under any position is covers (for snps that 1 bp, but for deletion its the entire ref span)
|
||||
* @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence
|
||||
* contexts as well as calls from variantCalls if any were provided
|
||||
*/
|
||||
public List<VariantContext> calculateRefConfidence(final Haplotype refHaplotype,
|
||||
final Collection<Haplotype> calledHaplotypes,
|
||||
final GenomeLoc paddedReferenceLoc,
|
||||
final ActiveRegion activeRegion,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
|
||||
final List<VariantContext> variantCalls) {
|
||||
if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null");
|
||||
if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null");
|
||||
if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype");
|
||||
if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null");
|
||||
if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null");
|
||||
if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null");
|
||||
if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size());
|
||||
if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different");
|
||||
|
||||
final GenomeLoc refSpan = activeRegion.getLocation();
|
||||
final List<ReadBackedPileup> refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, refSpan, stratifiedReadMap);
|
||||
final byte[] ref = refHaplotype.getBases();
|
||||
final List<VariantContext> results = new ArrayList<>(refSpan.size());
|
||||
final String sampleName = stratifiedReadMap.keySet().iterator().next();
|
||||
|
||||
final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart();
|
||||
for ( final ReadBackedPileup pileup : refPileups ) {
|
||||
final GenomeLoc curPos = pileup.getLocation();
|
||||
final int offset = curPos.getStart() - refSpan.getStart();
|
||||
|
||||
final VariantContext overlappingSite = getOverlappingVariantContext(curPos, variantCalls);
|
||||
if ( overlappingSite != null ) {
|
||||
// we have some overlapping site, add it to the list of positions
|
||||
if ( overlappingSite.getStart() == curPos.getStart() )
|
||||
results.add(overlappingSite);
|
||||
} else {
|
||||
// otherwise emit a reference confidence variant context
|
||||
final int refOffset = offset + globalRefOffset;
|
||||
final byte refBase = ref[refOffset];
|
||||
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null);
|
||||
|
||||
final Allele refAllele = Allele.create(refBase, true);
|
||||
final List<Allele> refSiteAlleles = Arrays.asList(refAllele, NON_REF_SYMBOLIC_ALLELE);
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), curPos.getStart(), curPos.getStart(), refSiteAlleles);
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele));
|
||||
gb.AD(homRefCalc.AD_Ref_Any);
|
||||
gb.DP(homRefCalc.getDP());
|
||||
|
||||
// genotype likelihood calculation
|
||||
final GenotypeLikelihoods snpGLs = GenotypeLikelihoods.fromLog10Likelihoods(homRefCalc.genotypeLikelihoods);
|
||||
final int nIndelInformativeReads = calcNIndelInformativeReads(pileup, refOffset, ref, indelInformativeDepthIndelSize);
|
||||
final GenotypeLikelihoods indelGLs = getIndelPLs(nIndelInformativeReads);
|
||||
|
||||
// now that we have the SNP and indel GLs, we take the one with the least confidence,
|
||||
// as this is the most conservative estimate of our certainty that we are hom-ref.
|
||||
// For example, if the SNP PLs are 0,10,100 and the indel PLs are 0,100,1000
|
||||
// we are very certain that there's no indel here, but the SNP confidence imply that we are
|
||||
// far less confident that the ref base is actually the only thing here. So we take 0,10,100
|
||||
// as our GLs for the site.
|
||||
final GenotypeLikelihoods leastConfidenceGLs = getGLwithWorstGQ(indelGLs, snpGLs);
|
||||
|
||||
gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF)));
|
||||
gb.PL(leastConfidenceGLs.getAsPLs());
|
||||
gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads);
|
||||
|
||||
vcb.genotypes(gb.make());
|
||||
results.add(vcb.make());
|
||||
// logger.info(" => VariantContext " + vcb.make());
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the GenotypeLikelihoods with the least strong corresponding GQ value
|
||||
* @param gl1 first to consider (cannot be null)
|
||||
* @param gl2 second to consider (cannot be null)
|
||||
* @return gl1 or gl2, whichever has the worst GQ
|
||||
*/
|
||||
protected final GenotypeLikelihoods getGLwithWorstGQ(final GenotypeLikelihoods gl1, final GenotypeLikelihoods gl2) {
|
||||
return gl1.getLog10GQ(GenotypeType.HOM_REF) > gl2.getLog10GQ(GenotypeType.HOM_REF) ? gl1 : gl2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get indel PLs corresponding to seeing N nIndelInformativeReads at this site
|
||||
*
|
||||
* @param nInformativeReads the number of reads that inform us about being ref without an indel at this site
|
||||
* @return non-null GenotypeLikelihoods given N
|
||||
*/
|
||||
protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) {
|
||||
// TODO -- optimization -- this could easily be optimized with some caching
|
||||
final double homRef = 0.0;
|
||||
final double het = - LOG10_2 * nInformativeReads;
|
||||
final double homVar = INDEL_ERROR_RATE * nInformativeReads;
|
||||
return GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar});
|
||||
}
|
||||
private final static double LOG10_2 = Math.log10(2);
|
||||
private final static double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp
|
||||
|
||||
/**
|
||||
* Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
|
||||
*
|
||||
* @param pileup the read backed pileup containing the data we want to evaluate
|
||||
* @param refBase the reference base at this pileup position
|
||||
* @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation
|
||||
* @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips
|
||||
* @return a RefVsAnyResult genotype call
|
||||
*/
|
||||
public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) {
|
||||
final RefVsAnyResult result = new RefVsAnyResult();
|
||||
|
||||
for( final PileupElement p : pileup ) {
|
||||
final byte qual = p.getQual();
|
||||
if( p.isDeletion() || qual > minBaseQual) {
|
||||
int AA = 0; final int AB = 1; int BB = 2;
|
||||
if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
|
||||
AA = 2;
|
||||
BB = 0;
|
||||
if( hqSoftClips != null && p.isNextToSoftClip() ) {
|
||||
hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28));
|
||||
}
|
||||
result.AD_Ref_Any[1]++;
|
||||
} else {
|
||||
result.AD_Ref_Any[0]++;
|
||||
}
|
||||
result.genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual);
|
||||
result.genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
|
||||
result.genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a list of pileups that span the entire active region span, in order, one for each position
|
||||
*/
|
||||
private List<ReadBackedPileup> getPileupsOverReference(final Haplotype refHaplotype,
|
||||
final Collection<Haplotype> calledHaplotypes,
|
||||
final GenomeLoc paddedReferenceLoc,
|
||||
final GenomeLoc activeRegionSpan,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
|
||||
final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO");
|
||||
final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest);
|
||||
writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves
|
||||
writer.writeReadsAlignedToHaplotypes(calledHaplotypes.isEmpty() ? Collections.singleton(refHaplotype) : calledHaplotypes, paddedReferenceLoc, stratifiedReadMap);
|
||||
final List<GATKSAMRecord> realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads());
|
||||
|
||||
if ( debuggingWriter != null )
|
||||
for ( final GATKSAMRecord read : realignedReads )
|
||||
debuggingWriter.addAlignment(read);
|
||||
|
||||
final LocusIteratorByState libs = new LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING,
|
||||
false, genomeLocParser, samples, false);
|
||||
|
||||
final List<ReadBackedPileup> pileups = new LinkedList<>();
|
||||
final int startPos = activeRegionSpan.getStart();
|
||||
AlignmentContext next = libs.advanceToLocus(startPos, true);
|
||||
for ( int curPos = startPos; curPos <= activeRegionSpan.getStop(); curPos++ ) {
|
||||
if ( next != null && next.getLocation().getStart() == curPos ) {
|
||||
pileups.add(next.getBasePileup());
|
||||
next = libs.hasNext() ? libs.next() : null;
|
||||
} else {
|
||||
// no data, so we create empty pileups
|
||||
pileups.add(new ReadBackedPileupImpl(genomeLocParser.createGenomeLoc(activeRegionSpan.getContig(), curPos)));
|
||||
}
|
||||
}
|
||||
|
||||
return pileups;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the rightmost variant context in maybeOverlapping that overlaps curPos
|
||||
*
|
||||
* @param curPos non-null genome loc
|
||||
* @param maybeOverlapping a collection of variant contexts that might overlap curPos
|
||||
* @return a VariantContext, or null if none overlaps
|
||||
*/
|
||||
protected final VariantContext getOverlappingVariantContext(final GenomeLoc curPos, final Collection<VariantContext> maybeOverlapping) {
|
||||
VariantContext overlaps = null;
|
||||
for ( final VariantContext vc : maybeOverlapping ) {
|
||||
if ( genomeLocParser.createGenomeLoc(vc).overlapsP(curPos) ) {
|
||||
if ( overlaps == null || vc.getStart() > overlaps.getStart() ) {
|
||||
overlaps = vc;
|
||||
}
|
||||
}
|
||||
}
|
||||
return overlaps;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the sum of mismatching base qualities for readBases aligned to refBases at readStart / refStart
|
||||
* assuming no insertions or deletions in the read w.r.t. the reference
|
||||
*
|
||||
* @param readBases non-null bases of the read
|
||||
* @param readQuals non-null quals of the read
|
||||
* @param readStart the starting position of the read (i.e., that aligns it to a position in the reference)
|
||||
* @param refBases the reference bases
|
||||
* @param refStart the offset into refBases that aligns to the readStart position in readBases
|
||||
* @param maxSum if the sum goes over this value, return immediately
|
||||
* @return the sum of quality scores for readBases that mismatch their corresponding ref bases
|
||||
*/
|
||||
protected final int sumMismatchingQualities(final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final int readStart,
|
||||
final byte[] refBases,
|
||||
final int refStart,
|
||||
final int maxSum) {
|
||||
final int n = Math.min(readBases.length - readStart, refBases.length - refStart);
|
||||
int sum = 0;
|
||||
|
||||
for ( int i = 0; i < n; i++ ) {
|
||||
final byte readBase = readBases[readStart + i];
|
||||
final byte refBase = refBases[refStart + i];
|
||||
if ( readBase != refBase ) {
|
||||
sum += readQuals[readStart + i];
|
||||
if ( sum > maxSum )
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute whether a read is informative to eliminate an indel of size <= maxIndelSize segregating at readStart/refStart
|
||||
*
|
||||
* @param readBases non-null bases of the read
|
||||
* @param readQuals non-null quals of the read
|
||||
* @param readStart the starting position of the read (i.e., that aligns it to a position in the reference)
|
||||
* @param refBases the reference bases
|
||||
* @param refStart the offset into refBases that aligns to the readStart position in readBases
|
||||
* @param maxIndelSize the max indel size to consider for the read to be informative
|
||||
* @return true if read can eliminate the possibility that there's an indel of size <= maxIndelSize segregating at refStart
|
||||
*/
|
||||
protected boolean isReadInformativeAboutIndelsOfSize(final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final int readStart,
|
||||
final byte[] refBases,
|
||||
final int refStart,
|
||||
final int maxIndelSize) {
|
||||
// todo -- fast exit when n bases left < maxIndelSize
|
||||
|
||||
final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE);
|
||||
|
||||
// consider each indel size up to max in term, checking if an indel that deletes either the ref bases (deletion
|
||||
// or read bases (insertion) would fit as well as the origin baseline sum of mismatching quality scores
|
||||
for ( int indelSize = 1; indelSize <= maxIndelSize; indelSize++ ) {
|
||||
for ( final boolean checkInsertion : Arrays.asList(true, false) ) {
|
||||
final int readI, refI;
|
||||
if ( checkInsertion ) {
|
||||
readI = readStart + indelSize;
|
||||
refI = refStart;
|
||||
} else {
|
||||
readI = readStart;
|
||||
refI = refStart + indelSize;
|
||||
}
|
||||
|
||||
final int score = sumMismatchingQualities(readBases, readQuals, readI, refBases, refI, baselineMMSum);
|
||||
if ( score <= baselineMMSum )
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the number of indel informative reads at pileup
|
||||
*
|
||||
* @param pileup a pileup
|
||||
* @param pileupOffsetIntoRef the position of the pileup in the reference
|
||||
* @param ref the ref bases
|
||||
* @param maxIndelSize maximum indel size to consider in the informativeness calculation
|
||||
* @return an integer >= 0
|
||||
*/
|
||||
protected final int calcNIndelInformativeReads(final ReadBackedPileup pileup, final int pileupOffsetIntoRef, final byte[] ref, final int maxIndelSize) {
|
||||
int nInformative = 0;
|
||||
for ( final PileupElement p : pileup ) {
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
final int offset = p.getOffset();
|
||||
|
||||
// doesn't count as evidence
|
||||
if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() )
|
||||
continue;
|
||||
|
||||
// todo -- this code really should handle CIGARs directly instead of relying on the above tests
|
||||
if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize))
|
||||
nInformative++;
|
||||
}
|
||||
return nInformative;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a reference haplotype for an active region
|
||||
*
|
||||
* @param activeRegion the active region
|
||||
* @param refBases the ref bases
|
||||
* @param paddedReferenceLoc the location spanning of the refBases -- can be longer than activeRegion.getLocation()
|
||||
* @return a reference haplotype
|
||||
*/
|
||||
public static Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final byte[] refBases, final GenomeLoc paddedReferenceLoc) {
|
||||
final Haplotype refHaplotype = new Haplotype(refBases, true);
|
||||
final int alignmentStart = activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart();
|
||||
if ( alignmentStart < 0 ) throw new IllegalStateException("Bad alignment start in createReferenceHaplotype " + alignmentStart);
|
||||
refHaplotype.setAlignmentStartHapwrtRef(alignmentStart);
|
||||
final Cigar c = new Cigar();
|
||||
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
|
||||
refHaplotype.setCigar(c);
|
||||
return refHaplotype;
|
||||
}
|
||||
}
|
||||
|
|
@ -309,7 +309,7 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
}
|
||||
v = getNextReferenceVertex(v); // advance along the reference path
|
||||
while( v != null && !v.equals(toVertex) ) {
|
||||
bytes = ArrayUtils.addAll( bytes, getAdditionalSequence(v) );
|
||||
bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v));
|
||||
v = getNextReferenceVertex(v); // advance along the reference path
|
||||
}
|
||||
if( includeStop && v != null && v.equals(toVertex)) {
|
||||
|
|
@ -388,6 +388,17 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of vertices connected to v by incoming or outgoing edges
|
||||
* @param v a non-null vertex
|
||||
* @return a set of vertices {X} connected X -> v or v -> Y
|
||||
*/
|
||||
public Set<V> neighboringVerticesOf(final V v) {
|
||||
final Set<V> s = incomingVerticesOf(v);
|
||||
s.addAll(outgoingVerticesOf(v));
|
||||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print out the graph in the dot language for visualization
|
||||
* @param destination File to write to
|
||||
|
|
@ -550,7 +561,7 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
verticesToRemove.removeAll(onPathFromRefSource);
|
||||
removeAllVertices(verticesToRemove);
|
||||
|
||||
// simple santity checks that this algorithm is working.
|
||||
// simple sanity checks that this algorithm is working.
|
||||
if ( getSinks().size() > 1 ) {
|
||||
throw new IllegalStateException("Should have eliminated all but the reference sink, but found " + getSinks());
|
||||
}
|
||||
|
|
@ -664,4 +675,72 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
"kmerSize=" + kmerSize +
|
||||
'}';
|
||||
}
|
||||
|
||||
/**
|
||||
* The base sequence for the given path.
|
||||
* Note, this assumes that the path does not start with a source node.
|
||||
*
|
||||
* @param path the list of vertexes that make up the path
|
||||
* @return non-null sequence of bases corresponding to the given path
|
||||
*/
|
||||
@Ensures({"result != null"})
|
||||
public byte[] getBasesForPath(final List<? extends DeBruijnVertex> path) {
|
||||
if ( path == null ) throw new IllegalArgumentException("Path cannot be null");
|
||||
|
||||
final StringBuffer sb = new StringBuffer();
|
||||
for ( final DeBruijnVertex v : path )
|
||||
sb.append((char)v.getSuffix());
|
||||
|
||||
return sb.toString().getBytes();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of vertices within distance edges of source, regardless of edge direction
|
||||
*
|
||||
* @param source the source vertex to consider
|
||||
* @param distance the distance
|
||||
* @return a set of vertices within distance of source
|
||||
*/
|
||||
protected Set<V> verticesWithinDistance(final V source, final int distance) {
|
||||
if ( distance == 0 )
|
||||
return Collections.singleton(source);
|
||||
|
||||
final Set<V> found = new HashSet<>();
|
||||
found.add(source);
|
||||
for ( final V v : neighboringVerticesOf(source) ) {
|
||||
found.addAll(verticesWithinDistance(v, distance - 1));
|
||||
}
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a graph containing only the vertices within distance edges of target
|
||||
* @param target a vertex in graph
|
||||
* @param distance the max distance
|
||||
* @return a non-null graph
|
||||
*/
|
||||
public BaseGraph<V,E> subsetToNeighbors(final V target, final int distance) {
|
||||
if ( target == null ) throw new IllegalArgumentException("Target cannot be null");
|
||||
if ( ! containsVertex(target) ) throw new IllegalArgumentException("Graph doesn't contain vertex " + target);
|
||||
if ( distance < 0 ) throw new IllegalArgumentException("Distance must be >= 0 but got " + distance);
|
||||
|
||||
|
||||
final Set<V> toKeep = verticesWithinDistance(target, distance);
|
||||
final Set<V> toRemove = new HashSet<>(vertexSet());
|
||||
toRemove.removeAll(toKeep);
|
||||
|
||||
final BaseGraph<V,E> result = (BaseGraph<V,E>)clone();
|
||||
result.removeAllVertices(toRemove);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a subgraph of graph that contains only vertices within 10 edges of the ref source vertex
|
||||
* @return a non-null subgraph of this graph
|
||||
*/
|
||||
public BaseGraph<V,E> subsetToRefSource() {
|
||||
return subsetToNeighbors(getReferenceSourceVertex(), 10);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -171,7 +171,15 @@ final public class GraphUtils {
|
|||
return foundDup ? null : new PrimitivePair.Int(longestPos, length);
|
||||
}
|
||||
|
||||
private static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) {
|
||||
/**
|
||||
* calculates the longest suffix match between a sequence and a smaller kmer
|
||||
*
|
||||
* @param seq the (reference) sequence
|
||||
* @param kmer the smaller kmer sequence
|
||||
* @param seqStart the index (inclusive) on seq to start looking backwards from
|
||||
* @return the longest matching suffix
|
||||
*/
|
||||
public static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) {
|
||||
for ( int len = 1; len <= kmer.length; len++ ) {
|
||||
final int seqI = seqStart - len + 1;
|
||||
final int kmerI = kmer.length - len;
|
||||
|
|
|
|||
|
|
@ -80,7 +80,7 @@ public class LowWeightChainPruner<V extends BaseVertex, E extends BaseEdge> {
|
|||
final Set<E> edgesToKeep = new LinkedHashSet<>();
|
||||
|
||||
for ( final Path<V,E> linearChain : getLinearChains(graph) ) {
|
||||
if( mustBeKeep(linearChain, pruneFactor) ) {
|
||||
if( mustBeKept(linearChain, pruneFactor) ) {
|
||||
// we must keep edges in any path that contains a reference edge or an edge with weight > pruneFactor
|
||||
edgesToKeep.addAll(linearChain.getEdges());
|
||||
}
|
||||
|
|
@ -96,10 +96,14 @@ public class LowWeightChainPruner<V extends BaseVertex, E extends BaseEdge> {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get the maximum pruning multiplicity seen on any edge in this graph
|
||||
* @return an integer > 0
|
||||
* Traverse the edges in the path and determine if any are either ref edges or have weight above
|
||||
* the pruning factor and should therefore not be pruned away.
|
||||
*
|
||||
* @param path the path in question
|
||||
* @param pruneFactor the integer pruning factor
|
||||
* @return true if any edge in the path must be kept
|
||||
*/
|
||||
private boolean mustBeKeep(final Path<V,E> path, final int pruneFactor) {
|
||||
private boolean mustBeKept(final Path<V, E> path, final int pruneFactor) {
|
||||
for ( final E edge : path.getEdges() ) {
|
||||
if ( edge.getPruningMultiplicity() >= pruneFactor || edge.isRef() )
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -46,6 +46,8 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import java.util.PriorityQueue;
|
||||
|
||||
/**
|
||||
* edge class for connecting nodes in the graph that tracks some per-sample information
|
||||
*
|
||||
|
|
@ -63,32 +65,43 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
|||
* e.getPruningMultiplicity() // = 3
|
||||
*/
|
||||
public class MultiSampleEdge extends BaseEdge {
|
||||
private int maxSingleSampleMultiplicity, currentSingleSampleMultiplicity;
|
||||
private int currentSingleSampleMultiplicity;
|
||||
private final int singleSampleCapacity;
|
||||
private final PriorityQueue<Integer> singleSampleMultiplicities;
|
||||
|
||||
/**
|
||||
* Create a new MultiSampleEdge with weight multiplicity and, if isRef == true, indicates a path through the reference
|
||||
*
|
||||
* @param isRef indicates whether this edge is a path through the reference
|
||||
* @param multiplicity the number of observations of this edge in this sample
|
||||
* @param singleSampleCapacity the max number of samples to track edge multiplicities
|
||||
*/
|
||||
public MultiSampleEdge(final boolean isRef, final int multiplicity) {
|
||||
public MultiSampleEdge(final boolean isRef, final int multiplicity, final int singleSampleCapacity) {
|
||||
super(isRef, multiplicity);
|
||||
maxSingleSampleMultiplicity = multiplicity;
|
||||
|
||||
if( singleSampleCapacity <= 0 ) { throw new IllegalArgumentException("singleSampleCapacity must be > 0 but found: " + singleSampleCapacity); }
|
||||
singleSampleMultiplicities = new PriorityQueue<>(singleSampleCapacity);
|
||||
singleSampleMultiplicities.add(multiplicity);
|
||||
currentSingleSampleMultiplicity = multiplicity;
|
||||
this.singleSampleCapacity = singleSampleCapacity;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MultiSampleEdge copy() {
|
||||
return new MultiSampleEdge(isRef(), getMultiplicity()); // TODO -- should I copy values for other features?
|
||||
return new MultiSampleEdge(isRef(), getMultiplicity(), singleSampleCapacity); // TODO -- should I copy values for other features?
|
||||
}
|
||||
|
||||
/**
|
||||
* update the max single sample multiplicity based on the current single sample multiplicity, and
|
||||
* update the single sample multiplicities by adding the current single sample multiplicity to the priority queue, and
|
||||
* reset the current single sample multiplicity to 0.
|
||||
*/
|
||||
public void flushSingleSampleMultiplicity() {
|
||||
if ( currentSingleSampleMultiplicity > maxSingleSampleMultiplicity )
|
||||
maxSingleSampleMultiplicity = currentSingleSampleMultiplicity;
|
||||
singleSampleMultiplicities.add(currentSingleSampleMultiplicity);
|
||||
if( singleSampleMultiplicities.size() == singleSampleCapacity + 1 ) {
|
||||
singleSampleMultiplicities.poll(); // remove the lowest multiplicity from the list
|
||||
} else if( singleSampleMultiplicities.size() > singleSampleCapacity + 1 ) {
|
||||
throw new IllegalStateException("Somehow the per sample multiplicity list has grown too big: " + singleSampleMultiplicities);
|
||||
}
|
||||
currentSingleSampleMultiplicity = 0;
|
||||
}
|
||||
|
||||
|
|
@ -100,20 +113,12 @@ public class MultiSampleEdge extends BaseEdge {
|
|||
|
||||
@Override
|
||||
public int getPruningMultiplicity() {
|
||||
return getMaxSingleSampleMultiplicity();
|
||||
return singleSampleMultiplicities.peek();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getDotLabel() {
|
||||
return super.getDotLabel() + "/" + getMaxSingleSampleMultiplicity();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the maximum multiplicity for this edge seen in any single sample
|
||||
* @return an integer >= 0
|
||||
*/
|
||||
public int getMaxSingleSampleMultiplicity() {
|
||||
return maxSingleSampleMultiplicity;
|
||||
return super.getDotLabel() + "/" + getPruningMultiplicity();
|
||||
}
|
||||
|
||||
/** only provided for testing purposes */
|
||||
|
|
|
|||
|
|
@ -47,7 +47,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
|
|
@ -92,7 +91,7 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
|
|||
/**
|
||||
* Create a new Path containing no edges and starting at initialVertex
|
||||
* @param initialVertex the starting vertex of the path
|
||||
* @param graph the graph this path with follow through
|
||||
* @param graph the graph this path will follow through
|
||||
*/
|
||||
public Path(final T initialVertex, final BaseGraph<T, E> graph) {
|
||||
if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null");
|
||||
|
|
|
|||
|
|
@ -155,20 +155,29 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
//logger.info("simplifyGraph iteration " + i);
|
||||
// iterate until we haven't done anything useful
|
||||
boolean didSomeWork = false;
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".1.dot"), 0);
|
||||
printGraphSimplification(new File("simplifyGraph." + iteration + ".1.dot"));
|
||||
didSomeWork |= new MergeDiamonds().transformUntilComplete();
|
||||
didSomeWork |= new MergeTails().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"), 0);
|
||||
printGraphSimplification(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"));
|
||||
|
||||
didSomeWork |= new SplitCommonSuffices().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"), 0);
|
||||
printGraphSimplification(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"));
|
||||
didSomeWork |= new MergeCommonSuffices().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0);
|
||||
printGraphSimplification(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"));
|
||||
|
||||
didSomeWork |= zipLinearChains();
|
||||
return didSomeWork;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print simplication step of this graph, if PRINT_SIMPLIFY_GRAPHS is enabled
|
||||
* @param file the destination for the graph DOT file
|
||||
*/
|
||||
private void printGraphSimplification(final File file) {
|
||||
if ( PRINT_SIMPLIFY_GRAPHS )
|
||||
subsetToNeighbors(getReferenceSourceVertex(), 5).printGraph(file, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Zip up all of the simple linear chains present in this graph.
|
||||
*
|
||||
|
|
@ -352,7 +361,7 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
* Merge until the graph has no vertices that are candidates for merging
|
||||
*/
|
||||
public boolean transformUntilComplete() {
|
||||
boolean didAtLeastOneTranform = false;
|
||||
boolean didAtLeastOneTransform = false;
|
||||
boolean foundNodesToMerge = true;
|
||||
while( foundNodesToMerge ) {
|
||||
foundNodesToMerge = false;
|
||||
|
|
@ -360,13 +369,13 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
for( final SeqVertex v : vertexSet() ) {
|
||||
foundNodesToMerge = tryToTransform(v);
|
||||
if ( foundNodesToMerge ) {
|
||||
didAtLeastOneTranform = true;
|
||||
didAtLeastOneTransform = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return didAtLeastOneTranform;
|
||||
return didAtLeastOneTransform;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -81,7 +81,7 @@ public class SharedSequenceMerger {
|
|||
else {
|
||||
// graph.printGraph(new File("csm." + counter + "." + v.getSequenceString() + "_pre.dot"), 0);
|
||||
|
||||
final List<BaseEdge> edgesToRemove = new LinkedList<BaseEdge>();
|
||||
final List<BaseEdge> edgesToRemove = new LinkedList<>();
|
||||
final byte[] prevSeq = prevs.iterator().next().getSequence();
|
||||
final SeqVertex newV = new SeqVertex(ArrayUtils.addAll(prevSeq, v.getSequence()));
|
||||
graph.addVertex(newV);
|
||||
|
|
@ -124,11 +124,17 @@ public class SharedSequenceMerger {
|
|||
final SeqVertex first = incomingVertices.iterator().next();
|
||||
for ( final SeqVertex prev : incomingVertices) {
|
||||
if ( ! prev.seqEquals(first) )
|
||||
// cannot merge if our sequence isn't the same as the first sequence
|
||||
return false;
|
||||
final Collection<SeqVertex> prevOuts = graph.outgoingVerticesOf(prev);
|
||||
if ( prevOuts.size() != 1 )
|
||||
// prev -> v must be the only edge from prev
|
||||
return false;
|
||||
if ( prevOuts.iterator().next() != v )
|
||||
// don't allow cycles
|
||||
return false;
|
||||
if ( graph.inDegreeOf(prev) == 0 )
|
||||
// cannot merge when any of the incoming nodes are sources
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -47,14 +47,15 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResult;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -62,11 +63,16 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
private final static Logger logger = Logger.getLogger(ReadThreadingAssembler.class);
|
||||
|
||||
private final static int DEFAULT_NUM_PATHS_PER_GRAPH = 128;
|
||||
private final static int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
|
||||
private final static int KMER_SIZE_ITERATION_INCREASE = 10;
|
||||
private final static int MAX_KMER_ITERATIONS_TO_ATTEMPT = 6;
|
||||
|
||||
/** The min and max kmer sizes to try when building the graph. */
|
||||
private final List<Integer> kmerSizes;
|
||||
private final int maxAllowedPathsForReadThreadingAssembler;
|
||||
|
||||
private final boolean dontIncreaseKmerSizesForCycles;
|
||||
private final int numPruningSamples;
|
||||
private boolean requireReasonableNumberOfPaths = false;
|
||||
protected boolean removePathsNotConnectedToRef = true;
|
||||
private boolean justReturnRawGraph = false;
|
||||
|
|
@ -76,10 +82,16 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25));
|
||||
}
|
||||
|
||||
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
|
||||
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes, final boolean dontIncreaseKmerSizesForCycles, final int numPruningSamples) {
|
||||
super(maxAllowedPathsForReadThreadingAssembler);
|
||||
this.kmerSizes = kmerSizes;
|
||||
this.maxAllowedPathsForReadThreadingAssembler = maxAllowedPathsForReadThreadingAssembler;
|
||||
this.dontIncreaseKmerSizesForCycles = dontIncreaseKmerSizesForCycles;
|
||||
this.numPruningSamples = numPruningSamples;
|
||||
}
|
||||
|
||||
public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes) {
|
||||
this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, true, 1);
|
||||
}
|
||||
|
||||
/** for testing purposes */
|
||||
|
|
@ -87,58 +99,116 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
this.justReturnRawGraph = justReturnRawGraph;
|
||||
}
|
||||
|
||||
private void addResult(final List<AssemblyResult> results, final AssemblyResult maybeNullResult) {
|
||||
if ( maybeNullResult != null )
|
||||
results.add(maybeNullResult);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<SeqGraph> assemble( final List<GATKSAMRecord> reads, final Haplotype refHaplotype) {
|
||||
final List<SeqGraph> graphs = new LinkedList<>();
|
||||
public List<AssemblyResult> assemble(final List<GATKSAMRecord> reads, final Haplotype refHaplotype, final List<Haplotype> activeAlleleHaplotypes) {
|
||||
final List<AssemblyResult> results = new LinkedList<>();
|
||||
|
||||
// first, try using the requested kmer sizes
|
||||
for ( final int kmerSize : kmerSizes ) {
|
||||
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly);
|
||||
addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles));
|
||||
}
|
||||
|
||||
// add the reference sequence to the graph
|
||||
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
|
||||
|
||||
// Next pull kmers out of every read and throw them on the graph
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
rtgraph.addRead(read);
|
||||
}
|
||||
|
||||
// actually build the read threading graph
|
||||
rtgraph.buildGraphIfNecessary();
|
||||
if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.0.raw_readthreading_graph.dot"), pruneFactor);
|
||||
|
||||
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
|
||||
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
|
||||
// tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
|
||||
rtgraph.pruneLowWeightChains(pruneFactor);
|
||||
|
||||
// look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
|
||||
// we can recover them by merging some N bases from the chain back into the reference uniquely, for
|
||||
// N < kmerSize
|
||||
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
|
||||
|
||||
// remove all heading and trailing paths
|
||||
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
|
||||
|
||||
if ( debugGraphTransformations ) rtgraph.printGraph(new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"), pruneFactor);
|
||||
|
||||
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
|
||||
|
||||
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
|
||||
if ( justReturnRawGraph ) return Collections.singletonList(initialSeqGraph);
|
||||
|
||||
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
|
||||
if ( debugGraphTransformations ) initialSeqGraph.printGraph(new File("sequenceGraph.0.2.initial_seqgraph.dot"), pruneFactor);
|
||||
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
|
||||
|
||||
final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph);
|
||||
if ( seqGraph != null ) {
|
||||
if ( ! requireReasonableNumberOfPaths || reasonableNumberOfPaths(seqGraph) ) {
|
||||
graphs.add(seqGraph);
|
||||
}
|
||||
// if none of those worked, iterate over larger sizes if allowed to do so
|
||||
if ( results.isEmpty() && !dontIncreaseKmerSizesForCycles ) {
|
||||
int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE;
|
||||
int numIterations = 1;
|
||||
while ( results.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
|
||||
// on the last attempt we will allow low complexity graphs
|
||||
addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT));
|
||||
kmerSize += KMER_SIZE_ITERATION_INCREASE;
|
||||
numIterations++;
|
||||
}
|
||||
}
|
||||
|
||||
return graphs;
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates the sequence graph for the given kmerSize
|
||||
*
|
||||
* @param reads reads to use
|
||||
* @param refHaplotype reference haplotype
|
||||
* @param kmerSize kmer size
|
||||
* @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph
|
||||
* @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
|
||||
* @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
|
||||
*/
|
||||
protected AssemblyResult createGraph(final List<GATKSAMRecord> reads,
|
||||
final Haplotype refHaplotype,
|
||||
final int kmerSize,
|
||||
final List<Haplotype> activeAlleleHaplotypes,
|
||||
final boolean allowLowComplexityGraphs) {
|
||||
if ( refHaplotype.length() < kmerSize ) {
|
||||
// happens in cases where the assembled region is just too small
|
||||
return new AssemblyResult(AssemblyResult.Status.FAILED, null);
|
||||
}
|
||||
|
||||
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples);
|
||||
|
||||
// add the reference sequence to the graph
|
||||
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
|
||||
|
||||
// add the artificial GGA haplotypes to the graph
|
||||
int hapCount = 0;
|
||||
for ( final Haplotype h : activeAlleleHaplotypes ) {
|
||||
final int[] counts = new int[h.length()];
|
||||
Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
|
||||
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
|
||||
}
|
||||
|
||||
// Next pull kmers out of every read and throw them on the graph
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
rtgraph.addRead(read);
|
||||
}
|
||||
|
||||
// actually build the read threading graph
|
||||
rtgraph.buildGraphIfNecessary();
|
||||
|
||||
// sanity check: make sure there are no cycles in the graph
|
||||
if ( rtgraph.hasCycles() ) {
|
||||
if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle");
|
||||
return null;
|
||||
}
|
||||
|
||||
// sanity check: make sure the graph had enough complexity with the given kmer
|
||||
if ( ! allowLowComplexityGraphs && rtgraph.isLowComplexity() ) {
|
||||
if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity");
|
||||
return null;
|
||||
}
|
||||
|
||||
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
|
||||
|
||||
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
|
||||
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
|
||||
// tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
|
||||
rtgraph.pruneLowWeightChains(pruneFactor);
|
||||
|
||||
// look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if
|
||||
// we can recover them by merging some N bases from the chain back into the reference
|
||||
if ( recoverDanglingTails ) rtgraph.recoverDanglingTails();
|
||||
|
||||
// remove all heading and trailing paths
|
||||
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
|
||||
|
||||
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
|
||||
|
||||
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
|
||||
|
||||
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
|
||||
if ( justReturnRawGraph ) return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph);
|
||||
|
||||
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
|
||||
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
|
||||
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
|
||||
|
||||
final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph);
|
||||
final AssemblyResult.Status status = cleaned.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(cleaned.getGraph()) ? AssemblyResult.Status.FAILED : cleaned.getStatus();
|
||||
return new AssemblyResult(status, cleaned.getGraph());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -147,7 +217,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
* @return
|
||||
*/
|
||||
private boolean reasonableNumberOfPaths(final SeqGraph graph) {
|
||||
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<SeqVertex,BaseEdge>(false);
|
||||
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<>(false);
|
||||
final List<Path<SeqVertex,BaseEdge>> allPaths = pathFinder.getKBestPaths(graph, 100000);
|
||||
logger.info("Found " + allPaths.size() + " paths through " + graph + " with maximum " + maxAllowedPathsForReadThreadingAssembler);
|
||||
return allPaths.size() <= maxAllowedPathsForReadThreadingAssembler;
|
||||
|
|
|
|||
|
|
@ -46,27 +46,45 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.collections.PrimitivePair;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman;
|
||||
import org.jgrapht.EdgeFactory;
|
||||
import org.jgrapht.alg.CycleDetector;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSampleEdge> {
|
||||
/**
|
||||
* Edge factory that creates non-reference multiplicity 1 edges
|
||||
* Edge factory that encapsulates the numPruningSamples assembly parameter
|
||||
*/
|
||||
private static class MyEdgeFactory implements EdgeFactory<MultiDeBruijnVertex, MultiSampleEdge> {
|
||||
@Override
|
||||
public MultiSampleEdge createEdge(MultiDeBruijnVertex sourceVertex, MultiDeBruijnVertex targetVertex) {
|
||||
return new MultiSampleEdge(false, 1);
|
||||
final int numPruningSamples;
|
||||
|
||||
public MyEdgeFactory(int numPruningSamples) {
|
||||
this.numPruningSamples = numPruningSamples;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MultiSampleEdge createEdge(final MultiDeBruijnVertex sourceVertex, final MultiDeBruijnVertex targetVertex) {
|
||||
return new MultiSampleEdge(false, 1, numPruningSamples);
|
||||
}
|
||||
|
||||
public MultiSampleEdge createEdge(final boolean isRef, final int multiplicity) {
|
||||
return new MultiSampleEdge(isRef, multiplicity, numPruningSamples);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private final static Logger logger = Logger.getLogger(ReadThreadingGraph.class);
|
||||
|
|
@ -78,13 +96,10 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
/** for debugging info printing */
|
||||
private static int counter = 0;
|
||||
|
||||
/** we require at least this many bases to be uniquely matching to merge a dangling tail */
|
||||
private final static int MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL = 5;
|
||||
|
||||
/**
|
||||
* Sequences added for read threading before we've actually built the graph
|
||||
*/
|
||||
private final Map<String, List<SequenceForKmers>> pending = new LinkedHashMap<String, List<SequenceForKmers>>();
|
||||
private final Map<String, List<SequenceForKmers>> pending = new LinkedHashMap<>();
|
||||
|
||||
/**
|
||||
* A set of non-unique kmers that cannot be used as merge points in the graph
|
||||
|
|
@ -94,7 +109,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
/**
|
||||
* A map from kmers -> their corresponding vertex in the graph
|
||||
*/
|
||||
private Map<Kmer, MultiDeBruijnVertex> uniqueKmers = new LinkedHashMap<Kmer, MultiDeBruijnVertex>();
|
||||
private Map<Kmer, MultiDeBruijnVertex> uniqueKmers = new LinkedHashMap<>();
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -111,23 +126,21 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
// --------------------------------------------------------------------------------
|
||||
private Kmer refSource;
|
||||
private boolean alreadyBuilt;
|
||||
byte[] refSeq;
|
||||
MultiDeBruijnVertex[] refKmers;
|
||||
|
||||
public ReadThreadingGraph() {
|
||||
this(25, false, (byte)6);
|
||||
this(25, false, (byte)6, 1);
|
||||
}
|
||||
|
||||
public ReadThreadingGraph(final int kmerSize) {
|
||||
this(kmerSize, false, (byte)6);
|
||||
this(kmerSize, false, (byte)6, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new ReadThreadingGraph using kmerSize for matching
|
||||
* @param kmerSize must be >= 1
|
||||
*/
|
||||
protected ReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly) {
|
||||
super(kmerSize, new MyEdgeFactory());
|
||||
protected ReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples) {
|
||||
super(kmerSize, new MyEdgeFactory(numPruningSamples));
|
||||
|
||||
if ( kmerSize < 1 ) throw new IllegalArgumentException("bad minkKmerSize " + kmerSize);
|
||||
this.kmerSize = kmerSize;
|
||||
|
|
@ -146,8 +159,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
uniqueKmers.clear();
|
||||
refSource = null;
|
||||
alreadyBuilt = false;
|
||||
refSeq = null;
|
||||
refKmers = null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -224,13 +235,10 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
|
||||
if ( debugGraphTransformations ) startingVertex.addRead(seqForKmers.name);
|
||||
|
||||
// keep track of information about the reference kmers for merging dangling tails
|
||||
// keep track of information about the reference source
|
||||
if ( seqForKmers.isRef ) {
|
||||
if ( refSource != null ) throw new IllegalStateException("Found two refSources! prev " + refSource + " new is " + startingVertex);
|
||||
if ( refSource != null ) throw new IllegalStateException("Found two refSources! prev: " + refSource + ", new: " + startingVertex);
|
||||
refSource = new Kmer(seqForKmers.sequence, seqForKmers.start, kmerSize);
|
||||
refSeq = seqForKmers.sequence;
|
||||
refKmers = new MultiDeBruijnVertex[refSeq.length];
|
||||
for ( int i = 0; i < kmerSize; i++ ) refKmers[i] = null;
|
||||
}
|
||||
|
||||
// loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate
|
||||
|
|
@ -240,54 +248,161 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
|
||||
vertex = extendChainByOne(vertex, seqForKmers.sequence, i, count, seqForKmers.isRef);
|
||||
if ( debugGraphTransformations ) vertex.addRead(seqForKmers.name);
|
||||
|
||||
// keep track of the reference kmers for merging dangling tails
|
||||
if ( seqForKmers.isRef ) refKmers[i + kmerSize - 1] = vertex;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to attach vertex with out-degree == 0 to the graph by finding a unique matching kmer to the reference
|
||||
* Class to keep track of the important dangling tail merging data
|
||||
*/
|
||||
protected final class DanglingTailMergeResult {
|
||||
final List<MultiDeBruijnVertex> danglingPath, referencePath;
|
||||
final byte[] danglingPathString, referencePathString;
|
||||
final Cigar cigar;
|
||||
|
||||
public DanglingTailMergeResult(final List<MultiDeBruijnVertex> danglingPath,
|
||||
final List<MultiDeBruijnVertex> referencePath,
|
||||
final byte[] danglingPathString,
|
||||
final byte[] referencePathString,
|
||||
final Cigar cigar) {
|
||||
this.danglingPath = danglingPath;
|
||||
this.referencePath = referencePath;
|
||||
this.danglingPathString = danglingPathString;
|
||||
this.referencePathString = referencePathString;
|
||||
this.cigar = cigar;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to attach vertex with out-degree == 0 to the graph
|
||||
*
|
||||
* @param vertex the vertex to recover
|
||||
* @return 1 if we successfully recovered the vertex and 0 otherwise
|
||||
*/
|
||||
protected int recoverDanglingChain(final MultiDeBruijnVertex vertex) {
|
||||
if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0");
|
||||
|
||||
final byte[] kmer = vertex.getSequence();
|
||||
if ( ! nonUniqueKmers.contains(new Kmer(kmer)) ) {
|
||||
// don't attempt to fix non-unique kmers!
|
||||
final MultiDeBruijnVertex uniqueMergePoint = danglingTailMergePoint(kmer);
|
||||
if ( uniqueMergePoint != null ) {
|
||||
addEdge(vertex, uniqueMergePoint, new MultiSampleEdge(false, 1));
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
// generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths
|
||||
final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex);
|
||||
|
||||
return 0;
|
||||
// if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path
|
||||
if ( danglingTailMergeResult == null || ! cigarIsOkayToMerge(danglingTailMergeResult.cigar) )
|
||||
return 0;
|
||||
|
||||
// merge
|
||||
return mergeDanglingTail(danglingTailMergeResult);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find a unique merge point for kmer in the reference sequence
|
||||
* @param kmer the full kmer of the dangling tail
|
||||
* @return a vertex appropriate to merge kmer into, or null if none could be found
|
||||
* Determine whether the provided cigar is okay to merge into the reference path
|
||||
*
|
||||
* @param cigar the cigar to analyze
|
||||
* @return true if it's okay to merge, false otherwise
|
||||
*/
|
||||
private MultiDeBruijnVertex danglingTailMergePoint(final byte[] kmer) {
|
||||
final PrimitivePair.Int endAndLength = GraphUtils.findLongestUniqueSuffixMatch(refSeq, kmer);
|
||||
if ( endAndLength != null && endAndLength.second >= MIN_MATCH_LENGTH_TO_RECOVER_DANGLING_TAIL && endAndLength.first + 1 < refKmers.length) {
|
||||
final int len = endAndLength.second;
|
||||
final MultiDeBruijnVertex mergePoint = refKmers[endAndLength.first + 1];
|
||||
// logger.info("recoverDanglingChain of kmer " + new String(kmer) + " merged to " + mergePoint + " with match size " + len);
|
||||
final Set<Kmer> nonUniquesAtLength = determineKmerSizeAndNonUniques(len, len).nonUniques;
|
||||
final Kmer matchedKmer = new Kmer(kmer, kmer.length - len, len);
|
||||
if ( nonUniquesAtLength.contains(matchedKmer) ) {
|
||||
// logger.info("Rejecting merge " + new String(kmer) + " because match kmer " + matchedKmer + " isn't unique across all reads");
|
||||
return null;
|
||||
} else {
|
||||
return mergePoint;
|
||||
}
|
||||
protected boolean cigarIsOkayToMerge(final Cigar cigar) {
|
||||
|
||||
final List<CigarElement> elements = cigar.getCigarElements();
|
||||
|
||||
// don't allow more than a couple of different ops
|
||||
if ( elements.size() > 3 )
|
||||
return false;
|
||||
|
||||
// the last element must be an M
|
||||
if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.M )
|
||||
return false;
|
||||
|
||||
// TODO -- do we want to check whether the Ms mismatch too much also?
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Actually merge the dangling tail if possible
|
||||
*
|
||||
* @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference
|
||||
* @return 1 if merge was successful, 0 otherwise
|
||||
*/
|
||||
protected int mergeDanglingTail(final DanglingTailMergeResult danglingTailMergeResult) {
|
||||
|
||||
final List<CigarElement> elements = danglingTailMergeResult.cigar.getCigarElements();
|
||||
final CigarElement lastElement = elements.get(elements.size() - 1);
|
||||
if ( lastElement.getOperator() != CigarOperator.M )
|
||||
throw new IllegalArgumentException("The last Cigar element must be an M");
|
||||
|
||||
final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1;
|
||||
final int matchingSuffix = Math.min(GraphUtils.longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength());
|
||||
if ( matchingSuffix == 0 )
|
||||
return 0;
|
||||
|
||||
final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0);
|
||||
final int refIndexToMerge = lastRefIndex - matchingSuffix + 1;
|
||||
addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1));
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
|
||||
* provided vertex is the sink) and the reference path.
|
||||
*
|
||||
* @param vertex the sink of the dangling tail
|
||||
* @return a SmithWaterman object which can be null if no proper alignment could be generated
|
||||
*/
|
||||
protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex) {
|
||||
|
||||
// find the lowest common ancestor path between vertex and the reference sink if available
|
||||
final List<MultiDeBruijnVertex> altPath = findPathToLowestCommonAncestorOfReference(vertex);
|
||||
if ( altPath == null || isRefSource(altPath.get(0)) )
|
||||
return null;
|
||||
|
||||
// now get the reference path from the LCA
|
||||
final List<MultiDeBruijnVertex> refPath = getReferencePath(altPath.get(0));
|
||||
|
||||
// create the Smith-Waterman strings to use
|
||||
final byte[] refBases = getBasesForPath(refPath);
|
||||
final byte[] altBases = getBasesForPath(altPath);
|
||||
|
||||
// run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting)
|
||||
final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWPairwiseAlignment.OVERHANG_STRATEGY.INDEL);
|
||||
return new DanglingTailMergeResult(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex
|
||||
*
|
||||
* @param vertex the original vertex
|
||||
* @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or
|
||||
* has an ancestor with multiple incoming edges before hitting the reference path
|
||||
*/
|
||||
protected List<MultiDeBruijnVertex> findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex) {
|
||||
final LinkedList<MultiDeBruijnVertex> path = new LinkedList<>();
|
||||
|
||||
MultiDeBruijnVertex v = vertex;
|
||||
while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) {
|
||||
path.addFirst(v);
|
||||
v = getEdgeSource(incomingEdgeOf(v));
|
||||
}
|
||||
path.addFirst(v);
|
||||
|
||||
return isReferenceNode(v) ? path : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the path downwards in the graph from this vertex to the reference sink, including this vertex
|
||||
*
|
||||
* @param start the reference vertex to start from
|
||||
* @return the path (non-null, non-empty)
|
||||
*/
|
||||
protected List<MultiDeBruijnVertex> getReferencePath(final MultiDeBruijnVertex start) {
|
||||
if ( ! isReferenceNode(start) ) throw new IllegalArgumentException("Cannot construct the reference path from a vertex that is not on that path");
|
||||
|
||||
final List<MultiDeBruijnVertex> path = new ArrayList<>();
|
||||
|
||||
MultiDeBruijnVertex v = start;
|
||||
while ( v != null ) {
|
||||
path.add(v);
|
||||
v = getNextReferenceVertex(v);
|
||||
}
|
||||
|
||||
return null;
|
||||
return path;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -297,7 +412,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
public void buildGraphIfNecessary() {
|
||||
if ( alreadyBuilt ) return;
|
||||
|
||||
// determine the kmer size we'll uses, and capture the set of nonUniques for that kmer size
|
||||
// determine the kmer size we'll use, and capture the set of nonUniques for that kmer size
|
||||
final NonUniqueResult result = determineKmerSizeAndNonUniques(kmerSize, kmerSize);
|
||||
nonUniqueKmers = result.nonUniques;
|
||||
|
||||
|
|
@ -321,6 +436,23 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
alreadyBuilt = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if the graph has cycles, false otherwise
|
||||
*/
|
||||
public boolean hasCycles() {
|
||||
return new CycleDetector<>(this).detectCycles();
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the graph not have enough complexity? We define low complexity as a situation where the number
|
||||
* of non-unique kmers is more than 20% of the total number of kmers.
|
||||
*
|
||||
* @return true if the graph has low complexity, false otherwise
|
||||
*/
|
||||
public boolean isLowComplexity() {
|
||||
return nonUniqueKmers.size() * 4 > uniqueKmers.size();
|
||||
}
|
||||
|
||||
public void recoverDanglingTails() {
|
||||
if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
|
||||
|
||||
|
|
@ -332,7 +464,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
nRecovered += recoverDanglingChain(v);
|
||||
}
|
||||
}
|
||||
//logger.info("Recovered " + nRecovered + " of " + attempted + " dangling tails");
|
||||
|
||||
if ( debugGraphTransformations ) logger.info("Recovered " + nRecovered + " of " + attempted + " dangling tails");
|
||||
}
|
||||
|
||||
/** structure that keeps track of the non-unique kmers for a given kmer size */
|
||||
|
|
@ -409,7 +542,8 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
private Collection<Kmer> determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) {
|
||||
// count up occurrences of kmers within each read
|
||||
final KMerCounter counter = new KMerCounter(kmerSize);
|
||||
for ( int i = 0; i <= seqForKmers.stop - kmerSize; i++ ) {
|
||||
final int stopPosition = seqForKmers.stop - kmerSize;
|
||||
for ( int i = 0; i <= stopPosition; i++ ) {
|
||||
final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize);
|
||||
counter.addKmer(kmer, 1);
|
||||
}
|
||||
|
|
@ -578,23 +712,22 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
|
||||
// none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in
|
||||
final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize);
|
||||
MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false);
|
||||
final MultiDeBruijnVertex uniqueMergeVertex = getUniqueKmerVertex(kmer, false);
|
||||
|
||||
if ( isRef && uniqueMergeVertex != null )
|
||||
throw new IllegalStateException("Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex);
|
||||
|
||||
// either use our unique merge vertex, or create a new one in the chain
|
||||
final MultiDeBruijnVertex nextVertex = uniqueMergeVertex == null ? createVertex(kmer) : uniqueMergeVertex;
|
||||
addEdge(prevVertex, nextVertex, new MultiSampleEdge(isRef, count));
|
||||
addEdge(prevVertex, nextVertex, ((MyEdgeFactory)getEdgeFactory()).createEdge(isRef, count));
|
||||
return nextVertex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the start and stop positions (exclusive) of the longest stretch of high quality bases
|
||||
* in read
|
||||
* Add the given read to the sequence graph. Ultimately the read will get sent through addSequence(), but first
|
||||
* this method ensures we only use high quality bases and accounts for reduced reads, etc.
|
||||
*
|
||||
* @param read a non-null read
|
||||
* @return the start and stop for high quality bases in read, or null if none exist
|
||||
*/
|
||||
protected void addRead(final GATKSAMRecord read) {
|
||||
final byte[] sequence = read.getReadBases();
|
||||
|
|
@ -603,7 +736,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
|
||||
int lastGood = -1; // the index of the last good base we've seen
|
||||
for( int end = 0; end <= sequence.length; end++ ) {
|
||||
if ( end == sequence.length || qualities[end] < minBaseQualityToUseInAssembly ) {
|
||||
if ( end == sequence.length || ! baseIsUsableForAssembly(sequence[end], qualities[end]) ) {
|
||||
// the first good base is at lastGood, can be -1 if last base was bad
|
||||
final int start = lastGood;
|
||||
// the stop base is end - 1 (if we're not at the end of the sequence)
|
||||
|
|
@ -623,6 +756,18 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether a base can safely be used for assembly.
|
||||
* Currently disallows Ns and/or those with low quality
|
||||
*
|
||||
* @param base the base under consideration
|
||||
* @param qual the quality of that base
|
||||
* @return true if the base can be used for assembly, false otherwise
|
||||
*/
|
||||
protected boolean baseIsUsableForAssembly(final byte base, final byte qual) {
|
||||
return base != BaseUtils.Base.N.base && qual >= minBaseQualityToUseInAssembly;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of non-unique kmers in this graph. For debugging purposes
|
||||
* @return a non-null set of kmers
|
||||
|
|
|
|||
|
|
@ -130,7 +130,7 @@ public class ConstrainedMateFixingManager {
|
|||
private static final boolean DEBUG = false;
|
||||
|
||||
/** How often do we check whether we want to emit reads? */
|
||||
private final static int EMIT_FREQUENCY = 1000;
|
||||
protected final static int EMIT_FREQUENCY = 1000;
|
||||
|
||||
/**
|
||||
* How much could a single read move in position from its original position?
|
||||
|
|
@ -324,7 +324,8 @@ public class ConstrainedMateFixingManager {
|
|||
|| noReadCanMoveBefore(read.getMateAlignmentStart(), newRead ) ) ) { // we're already past where the mate started
|
||||
|
||||
// remove reads from the map that we have emitted -- useful for case where the mate never showed up
|
||||
forMateMatching.remove(read.getReadName());
|
||||
if ( !read.getNotPrimaryAlignmentFlag() )
|
||||
forMateMatching.remove(read.getReadName());
|
||||
|
||||
if ( DEBUG )
|
||||
logger.warn(String.format("EMIT! At %d: read %s at %d with isize %d, mate start %d, op = %s",
|
||||
|
|
@ -346,7 +347,8 @@ public class ConstrainedMateFixingManager {
|
|||
|
||||
private void writeRead(SAMRecord read) {
|
||||
try {
|
||||
writer.addAlignment(read);
|
||||
if ( writer != null )
|
||||
writer.addAlignment(read);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new UserException("If the maximum allowable reads in memory is too small, it may cause reads to be written out of order when trying to write the BAM; please see the --maxReadsInMemory argument for details. " + e.getMessage(), e);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -54,6 +54,7 @@ import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -78,8 +79,6 @@ public class PairHMMIndelErrorModel {
|
|||
private static final double baseMatchArray[];
|
||||
private static final double baseMismatchArray[];
|
||||
|
||||
private final static double LOG_ONE_HALF;
|
||||
|
||||
private static final int START_HRUN_GAP_IDX = 4;
|
||||
private static final int MAX_HRUN_GAP_IDX = 20;
|
||||
|
||||
|
|
@ -97,8 +96,6 @@ public class PairHMMIndelErrorModel {
|
|||
/////////////////////////////
|
||||
|
||||
static {
|
||||
LOG_ONE_HALF= -Math.log10(2.0);
|
||||
|
||||
baseMatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
|
||||
|
|
@ -120,12 +117,11 @@ public class PairHMMIndelErrorModel {
|
|||
case ORIGINAL:
|
||||
pairHMM = new Log10PairHMM(false);
|
||||
break;
|
||||
case LOGLESS_CACHING: //TODO: still not tested so please do not use yet
|
||||
//pairHMM = new LoglessCachingPairHMM(); //TODO - add it back when the figure out how to use the protected LoglessCachingPairHMM class
|
||||
throw new UserException.BadArgumentValue("pairHMM"," this option (LOGLESS_CACHING in UG) is still under development");
|
||||
//break;
|
||||
case LOGLESS_CACHING:
|
||||
pairHMM = new LoglessPairHMM();
|
||||
break;
|
||||
default:
|
||||
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL, EXACT or LOGLESS_CACHING (the third option is still under development).");
|
||||
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL, EXACT or LOGLESS_CACHING.");
|
||||
}
|
||||
|
||||
// fill gap penalty table, affine naive model:
|
||||
|
|
@ -466,7 +462,7 @@ public class PairHMMIndelErrorModel {
|
|||
final double li = readLikelihoods[readIdx][i];
|
||||
final double lj = readLikelihoods[readIdx][j];
|
||||
final int readCount = readCounts[readIdx];
|
||||
haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + LOG_ONE_HALF);
|
||||
haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + MathUtils.LOG_ONE_HALF);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -89,7 +90,7 @@ import java.util.List;
|
|||
*
|
||||
* @author ami
|
||||
*/
|
||||
|
||||
@Hidden
|
||||
public class AssessReducedQuals extends LocusWalker<GenomeLoc, GenomeLoc> implements TreeReducible<GenomeLoc> {
|
||||
|
||||
private static final String reduced = "reduced";
|
||||
|
|
|
|||
|
|
@ -47,7 +47,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import Jama.Matrix;
|
||||
import cern.jet.random.Normal;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
|
@ -226,6 +225,20 @@ public class GaussianMixtureModel {
|
|||
isModelReadyForEvaluation = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* A version of Log10SumLog10 that tolerates NaN values in the array
|
||||
*
|
||||
* In the case where one or more of the values are NaN, this function returns NaN
|
||||
*
|
||||
* @param values a non-null vector of doubles
|
||||
* @return log10 of the sum of the log10 values, or NaN
|
||||
*/
|
||||
private double nanTolerantLog10SumLog10(final double[] values) {
|
||||
for ( final double value : values )
|
||||
if ( Double.isNaN(value) ) return Double.NaN;
|
||||
return MathUtils.log10sumLog10(values);
|
||||
}
|
||||
|
||||
public double evaluateDatum( final VariantDatum datum ) {
|
||||
for( final boolean isNull : datum.isNull ) {
|
||||
if( isNull ) { return evaluateDatumMarginalized( datum ); }
|
||||
|
|
@ -236,21 +249,19 @@ public class GaussianMixtureModel {
|
|||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
|
||||
}
|
||||
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
return nanTolerantLog10SumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
}
|
||||
|
||||
// Used only to decide which covariate dimension is most divergent in order to report in the culprit info field annotation
|
||||
public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) {
|
||||
if(datum.isNull[iii]) { return null; }
|
||||
|
||||
final Normal normal = new Normal(0.0, 1.0, null);
|
||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||
int gaussianIndex = 0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
normal.setState( gaussian.mu[iii], gaussian.sigma.get(iii, iii) );
|
||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + Math.log10( normal.pdf( datum.annotations[iii] ) );
|
||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + MathUtils.normalDistributionLog10(gaussian.mu[iii], gaussian.sigma.get(iii, iii), datum.annotations[iii]);
|
||||
}
|
||||
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
return nanTolerantLog10SumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
}
|
||||
|
||||
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,298 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.gvcf;
|
||||
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Genome-wide VCF writer
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 6/24/13
|
||||
* Time: 2:51 PM
|
||||
*/
|
||||
public class GVCFWriter implements VariantContextWriter {
|
||||
//
|
||||
// static VCF field names
|
||||
//
|
||||
protected final static String BLOCK_SIZE_INFO_FIELD = "BLOCK_SIZE";
|
||||
protected final static String MIN_DP_FORMAT_FIELD = "MIN_DP";
|
||||
protected final static String MIN_GQ_FORMAT_FIELD = "MIN_GQ";
|
||||
|
||||
//
|
||||
// Final fields initialized in constructor
|
||||
//
|
||||
/** Where we'll ultimately write our VCF records */
|
||||
final private VariantContextWriter underlyingWriter;
|
||||
|
||||
final private List<HomRefBlock> GQPartitions;
|
||||
|
||||
/** fields updated on the fly during GVCFWriter operation */
|
||||
int nextAvailableStart = -1;
|
||||
private String sampleName = null;
|
||||
private HomRefBlock currentBlock = null;
|
||||
|
||||
/**
|
||||
* Is the proposed GQ partitions well-formed?
|
||||
*
|
||||
* @param GQPartitions proposed GQ partitions
|
||||
* @return a non-null string if something is wrong (string explains issue)
|
||||
*/
|
||||
protected static List<HomRefBlock> parsePartitions(final List<Integer> GQPartitions) {
|
||||
if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null");
|
||||
if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty");
|
||||
|
||||
final List<HomRefBlock> result = new LinkedList<>();
|
||||
int lastThreshold = 0;
|
||||
for ( final Integer value : GQPartitions ) {
|
||||
if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer");
|
||||
if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value);
|
||||
if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value);
|
||||
result.add(new HomRefBlock(lastThreshold, value));
|
||||
lastThreshold = value;
|
||||
}
|
||||
result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new GVCF writer
|
||||
*
|
||||
* Should be a non-empty list of boundaries. For example, suppose this variable is
|
||||
*
|
||||
* [A, B, C]
|
||||
*
|
||||
* We would partition our hom-ref sites into the following bands:
|
||||
*
|
||||
* X < A
|
||||
* A <= X < B
|
||||
* B <= X < C
|
||||
* X >= C
|
||||
*
|
||||
* @param underlyingWriter the ultimate destination of the GVCF records
|
||||
* @param GQPartitions a well-formed list of GQ partitions
|
||||
*/
|
||||
public GVCFWriter(final VariantContextWriter underlyingWriter, final List<Integer> GQPartitions) {
|
||||
if ( underlyingWriter == null ) throw new IllegalArgumentException("underlyingWriter cannot be null");
|
||||
this.underlyingWriter = underlyingWriter;
|
||||
this.GQPartitions = parsePartitions(GQPartitions);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the VCF header
|
||||
*
|
||||
* Adds standard GVCF fields to the header
|
||||
*
|
||||
* @param header a non-null header
|
||||
*/
|
||||
@Override
|
||||
public void writeHeader(VCFHeader header) {
|
||||
if ( header == null ) throw new IllegalArgumentException("header cannot be null");
|
||||
header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
|
||||
header.addMetaDataLine(new VCFInfoHeaderLine(BLOCK_SIZE_INFO_FIELD, 1, VCFHeaderLineType.Integer, "Size of the homozygous reference GVCF block"));
|
||||
header.addMetaDataLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block"));
|
||||
header.addMetaDataLine(new VCFFormatHeaderLine(MIN_GQ_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum GQ observed within the GVCF block"));
|
||||
|
||||
for ( final HomRefBlock partition : GQPartitions ) {
|
||||
header.addMetaDataLine(partition.toVCFHeaderLine());
|
||||
}
|
||||
|
||||
underlyingWriter.writeHeader(header);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close this GVCF writer. Finalizes any pending hom-ref blocks and emits those to the underlyingWriter as well
|
||||
*/
|
||||
@Override
|
||||
public void close() {
|
||||
close(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Horrible work around because there's no clean way to get our VCFWriter closed by the GATK
|
||||
*
|
||||
* If closeUnderlyingWriter is true, then we'll close the underlying writer, otherwise we'll leave it open
|
||||
* so the GATK closes it later
|
||||
*
|
||||
* @param closeUnderlyingWriter should we leave the underlying writer open or closed?
|
||||
*/
|
||||
public void close(final boolean closeUnderlyingWriter) {
|
||||
emitCurrentBlock();
|
||||
if ( closeUnderlyingWriter ) underlyingWriter.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Add hom-ref site from vc to this gVCF hom-ref state tracking, emitting any pending states if appropriate
|
||||
*
|
||||
* @param vc a non-null VariantContext
|
||||
* @param g a non-null genotype from VariantContext
|
||||
* @return a VariantContext to be emitted, or null if non is appropriate
|
||||
*/
|
||||
protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) {
|
||||
if ( nextAvailableStart != -1 && vc.getStart() <= nextAvailableStart ) {
|
||||
// don't create blocks while the hom-ref site falls before nextAvailableStart (for deletions)
|
||||
return null;
|
||||
} else if ( currentBlock == null ) {
|
||||
currentBlock = createNewBlock(vc, g);
|
||||
return null;
|
||||
} else if ( currentBlock.withinBounds(g.getGQ()) ) {
|
||||
currentBlock.add(vc.getStart(), g);
|
||||
return null;
|
||||
} else {
|
||||
final VariantContext result = blockToVCF(currentBlock);
|
||||
currentBlock = createNewBlock(vc, g);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null
|
||||
*/
|
||||
private void emitCurrentBlock() {
|
||||
if ( currentBlock != null ) {
|
||||
// there's actually some work to do
|
||||
underlyingWriter.add(blockToVCF(currentBlock));
|
||||
currentBlock = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a HomRefBlock into a VariantContext
|
||||
*
|
||||
* @param block the block to convert
|
||||
* @return a VariantContext representing the gVCF encoding for this block
|
||||
*/
|
||||
private VariantContext blockToVCF(final HomRefBlock block) {
|
||||
if ( block == null ) throw new IllegalArgumentException("block cannot be null");
|
||||
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder(block.getStartingVC());
|
||||
vcb.attributes(new HashMap<String, Object>(2)); // clear the attributes
|
||||
vcb.stop(block.getStop());
|
||||
vcb.attribute(VCFConstants.END_KEY, block.getStop());
|
||||
vcb.attribute(BLOCK_SIZE_INFO_FIELD, block.getSize());
|
||||
|
||||
// create the single Genotype with GQ and DP annotations
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Collections.nCopies(2, block.getRef()));
|
||||
gb.noAD().noPL().noAttributes(); // clear all attributes
|
||||
gb.GQ(block.getMedianGQ());
|
||||
gb.DP(block.getMedianDP());
|
||||
gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP());
|
||||
gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ());
|
||||
|
||||
return vcb.genotypes(gb.make()).make();
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function to create a new HomRefBlock from a variant context and current genotype
|
||||
*
|
||||
* @param vc the VariantContext at the site where want to start the band
|
||||
* @param g the genotype of the sample from vc that should be used to initialize the block
|
||||
* @return a newly allocated and initialized block containing g already
|
||||
*/
|
||||
private HomRefBlock createNewBlock(final VariantContext vc, final Genotype g) {
|
||||
// figure out the GQ limits to use based on the GQ of g
|
||||
HomRefBlock partition = null;
|
||||
for ( final HomRefBlock maybePartition : GQPartitions ) {
|
||||
if ( maybePartition.withinBounds(g.getGQ()) ) {
|
||||
partition = maybePartition;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( partition == null ) throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition " + partition);
|
||||
|
||||
// create the block, add g to it, and return it for use
|
||||
final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound());
|
||||
block.add(vc.getStart(), g);
|
||||
return block;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a VariantContext to this writer for emission
|
||||
*
|
||||
* Requires that the VC have exactly one genotype
|
||||
*
|
||||
* @param vc a non-null VariantContext
|
||||
*/
|
||||
@Override
|
||||
public void add(VariantContext vc) {
|
||||
if ( vc == null ) throw new IllegalArgumentException("vc cannot be null");
|
||||
|
||||
if ( sampleName == null )
|
||||
sampleName = vc.getGenotype(0).getSampleName();
|
||||
|
||||
if ( ! vc.hasGenotypes() ) {
|
||||
throw new IllegalArgumentException("GVCF assumes that the VariantContext has genotypes");
|
||||
} else if ( vc.getGenotypes().size() != 1 ) {
|
||||
throw new IllegalArgumentException("GVCF assumes that the VariantContext has exactly one genotype but saw " + vc.getGenotypes().size());
|
||||
} else {
|
||||
if ( currentBlock != null && ! currentBlock.isContiguous(vc) ) {
|
||||
// we've made a non-contiguous step (across interval, onto another chr), so finalize
|
||||
emitCurrentBlock();
|
||||
}
|
||||
|
||||
final Genotype g = vc.getGenotype(0);
|
||||
if ( g.isHomRef() ) {
|
||||
// create bands
|
||||
final VariantContext maybeCompletedBand = addHomRefSite(vc, g);
|
||||
if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand);
|
||||
} else {
|
||||
// g is variant, so flush the bands and emit vc
|
||||
emitCurrentBlock();
|
||||
nextAvailableStart = vc.getEnd();
|
||||
underlyingWriter.add(vc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,169 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.gvcf;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLine;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Helper class for calculating a GQ band in the GVCF writer
|
||||
*
|
||||
* A band contains GQ and DP values for a contiguous stretch of hom-ref genotypes,
|
||||
* and provides summary information about the entire block of genotypes.
|
||||
*
|
||||
* Genotypes within the HomRefBlock are restricted to hom-ref genotypes within a band of GQ scores
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 6/25/13
|
||||
* Time: 9:41 AM
|
||||
*/
|
||||
final class HomRefBlock {
|
||||
private final VariantContext startingVC;
|
||||
int stop;
|
||||
private final int minGQ, maxGQ;
|
||||
private List<Integer> GQs = new ArrayList<>(100);
|
||||
private List<Integer> DPs = new ArrayList<>(100);
|
||||
private final Allele ref;
|
||||
|
||||
/**
|
||||
* Create a new HomRefBlock
|
||||
*
|
||||
* @param startingVC the VariantContext that starts this band (for starting position information)
|
||||
* @param minGQ the minGQ (inclusive) to use in this band
|
||||
* @param maxGQ the maxGQ (exclusive) to use in this band
|
||||
*/
|
||||
public HomRefBlock(final VariantContext startingVC, int minGQ, int maxGQ) {
|
||||
if ( startingVC == null ) throw new IllegalArgumentException("startingVC cannot be null");
|
||||
if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ);
|
||||
|
||||
this.startingVC = startingVC;
|
||||
this.stop = getStart() - 1;
|
||||
this.ref = startingVC.getReference();
|
||||
this.minGQ = minGQ;
|
||||
this.maxGQ = maxGQ;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new HomRefBlock only for doing bounds checking
|
||||
*
|
||||
* @param minGQ the minGQ (inclusive) to use in this band
|
||||
* @param maxGQ the maxGQ (exclusive) to use in this band
|
||||
*/
|
||||
public HomRefBlock(int minGQ, int maxGQ) {
|
||||
if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ);
|
||||
|
||||
this.startingVC = null;
|
||||
this.stop = -1;
|
||||
this.ref = null;
|
||||
this.minGQ = minGQ;
|
||||
this.maxGQ = maxGQ;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add information from this Genotype to this band
|
||||
* @param g a non-null Genotype with GQ and DP attributes
|
||||
*/
|
||||
public void add(final int pos, final Genotype g) {
|
||||
if ( g == null ) throw new IllegalArgumentException("g cannot be null");
|
||||
if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field");
|
||||
if ( ! g.hasDP() ) throw new IllegalArgumentException("g must have DP field");
|
||||
if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop);
|
||||
|
||||
stop = pos;
|
||||
GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission
|
||||
DPs.add(g.getDP());
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the GQ value within the bounds of this GQ (GQ >= minGQ && GQ < maxGQ)
|
||||
* @param GQ the GQ value to test
|
||||
* @return true if within bounds, false otherwise
|
||||
*/
|
||||
public boolean withinBounds(final int GQ) {
|
||||
return GQ >= minGQ && GQ < maxGQ;
|
||||
}
|
||||
|
||||
/** Get the min GQ observed within this band */
|
||||
public int getMinGQ() { return MathUtils.arrayMin(GQs); }
|
||||
/** Get the median GQ observed within this band */
|
||||
public int getMedianGQ() { return MathUtils.median(GQs); }
|
||||
/** Get the min DP observed within this band */
|
||||
public int getMinDP() { return MathUtils.arrayMin(DPs); }
|
||||
/** Get the median DP observed within this band */
|
||||
public int getMedianDP() { return MathUtils.median(DPs); }
|
||||
|
||||
protected int getGQUpperBound() { return maxGQ; }
|
||||
protected int getGQLowerBound() { return minGQ; }
|
||||
|
||||
public boolean isContiguous(final VariantContext vc) {
|
||||
return vc.getEnd() == getStop() + 1 && startingVC.getChr().equals(vc.getChr());
|
||||
}
|
||||
|
||||
public VariantContext getStartingVC() { return startingVC; }
|
||||
public int getStart() { return startingVC.getStart(); }
|
||||
public int getStop() { return stop; }
|
||||
public Allele getRef() { return ref; }
|
||||
public int getSize() { return getStop() - getStart() + 1; }
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "HomRefBlock{" +
|
||||
"minGQ=" + minGQ +
|
||||
", maxGQ=" + maxGQ +
|
||||
'}';
|
||||
}
|
||||
|
||||
public VCFHeaderLine toVCFHeaderLine() {
|
||||
return new VCFHeaderLine("GVCFBlock", "minGQ=" + getGQLowerBound() + "(inclusive),maxGQ=" + getGQUpperBound() + "(exclusive)");
|
||||
}
|
||||
}
|
||||
|
|
@ -46,11 +46,10 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.haplotypeBAMWriter;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
|
|
@ -67,31 +66,31 @@ import java.util.*;
|
|||
* Time: 1:50 PM
|
||||
*/
|
||||
class AllHaplotypeBAMWriter extends HaplotypeBAMWriter {
|
||||
public AllHaplotypeBAMWriter(final SAMFileWriter bamWriter) {
|
||||
super(bamWriter);
|
||||
public AllHaplotypeBAMWriter(final ReadDestination destination) {
|
||||
super(destination);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void writeReadsAlignedToHaplotypes(final List<Haplotype> haplotypes,
|
||||
public void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes,
|
||||
final GenomeLoc paddedReferenceLoc,
|
||||
final List<Haplotype> bestHaplotypes,
|
||||
final Collection<Haplotype> bestHaplotypes,
|
||||
final Set<Haplotype> calledHaplotypes,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
|
||||
writeHaplotypesAsReads(haplotypes, new HashSet<Haplotype>(bestHaplotypes), paddedReferenceLoc);
|
||||
writeHaplotypesAsReads(haplotypes, new HashSet<>(bestHaplotypes), paddedReferenceLoc);
|
||||
|
||||
// we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently
|
||||
final Map<Allele, Haplotype> alleleToHaplotypeMap = new HashMap<Allele, Haplotype>(haplotypes.size());
|
||||
final Map<Allele, Haplotype> alleleToHaplotypeMap = new HashMap<>(haplotypes.size());
|
||||
for ( final Haplotype haplotype : haplotypes )
|
||||
alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype);
|
||||
|
||||
// next, output the interesting reads for each sample aligned against the appropriate haplotype
|
||||
for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) {
|
||||
for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
|
||||
for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
|
||||
final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue());
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart());
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,17 +68,17 @@ import java.util.*;
|
|||
* Time: 1:50 PM
|
||||
*/
|
||||
class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter {
|
||||
public CalledHaplotypeBAMWriter(final SAMFileWriter bamWriter) {
|
||||
super(bamWriter);
|
||||
public CalledHaplotypeBAMWriter(final ReadDestination destination) {
|
||||
super(destination);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void writeReadsAlignedToHaplotypes(final List<Haplotype> haplotypes,
|
||||
public void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes,
|
||||
final GenomeLoc paddedReferenceLoc,
|
||||
final List<Haplotype> bestHaplotypes,
|
||||
final Collection<Haplotype> bestHaplotypes,
|
||||
final Set<Haplotype> calledHaplotypes,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
|
||||
if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes
|
||||
|
|
@ -87,7 +87,7 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter {
|
|||
writeHaplotypesAsReads(calledHaplotypes, calledHaplotypes, paddedReferenceLoc);
|
||||
|
||||
// we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently
|
||||
final Map<Allele, Haplotype> alleleToHaplotypeMap = new HashMap<Allele, Haplotype>(haplotypes.size());
|
||||
final Map<Allele, Haplotype> alleleToHaplotypeMap = new HashMap<>(haplotypes.size());
|
||||
for ( final Haplotype haplotype : calledHaplotypes ) {
|
||||
alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype);
|
||||
}
|
||||
|
|
@ -97,11 +97,9 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter {
|
|||
|
||||
// next, output the interesting reads for each sample aligned against one of the called haplotypes
|
||||
for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) {
|
||||
for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
|
||||
if ( entry.getKey().getMappingQuality() > 0 ) {
|
||||
final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes);
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart());
|
||||
}
|
||||
for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
|
||||
final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes);
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,16 +46,18 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.haplotypeBAMWriter;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMTag;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -75,8 +77,8 @@ public abstract class HaplotypeBAMWriter {
|
|||
protected final static String READ_GROUP_ID = "ArtificialHaplotype";
|
||||
protected final static String HAPLOTYPE_TAG = "HC";
|
||||
|
||||
final SAMFileWriter bamWriter;
|
||||
final SAMFileHeader bamHeader;
|
||||
final ReadDestination output;
|
||||
boolean writeHaplotypesAsWell = true;
|
||||
|
||||
/**
|
||||
* Possible modes for writing haplotypes to BAMs
|
||||
|
|
@ -104,27 +106,10 @@ public abstract class HaplotypeBAMWriter {
|
|||
* @return a new HaplotypeBAMWriter
|
||||
*/
|
||||
public static HaplotypeBAMWriter create(final Type type, final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header) {
|
||||
if ( header == null ) throw new IllegalArgumentException("header cannot be null");
|
||||
if ( stingSAMWriter == null ) throw new IllegalArgumentException("writer cannot be null");
|
||||
if ( type == null ) throw new IllegalArgumentException("type cannot be null");
|
||||
|
||||
// prepare the bam header
|
||||
final SAMFileHeader bamHeader = new SAMFileHeader();
|
||||
bamHeader.setSequenceDictionary(header.getSequenceDictionary());
|
||||
bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
|
||||
|
||||
// include the original read groups plus a new artificial one for the haplotypes
|
||||
final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>(header.getReadGroups());
|
||||
final SAMReadGroupRecord rg = new SAMReadGroupRecord(READ_GROUP_ID);
|
||||
rg.setSample("HC");
|
||||
rg.setSequencingCenter("BI");
|
||||
readGroups.add(rg);
|
||||
bamHeader.setReadGroups(readGroups);
|
||||
|
||||
// TODO -- this will be a performance problem at high-scale
|
||||
stingSAMWriter.setPresorted(false);
|
||||
stingSAMWriter.writeHeader(bamHeader);
|
||||
return create(type, stingSAMWriter);
|
||||
final ReadDestination toBam = new ReadDestination.ToBAM(stingSAMWriter, header, READ_GROUP_ID);
|
||||
return create(type, toBam);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -134,16 +119,16 @@ public abstract class HaplotypeBAMWriter {
|
|||
* may come in out of order during writing
|
||||
*
|
||||
* @param type the type of the writer we want to create
|
||||
* @param writer the destination, must not be null
|
||||
* @param destination the destination, must not be null
|
||||
* @return a new HaplotypeBAMWriter
|
||||
*/
|
||||
public static HaplotypeBAMWriter create(final Type type, final SAMFileWriter writer) {
|
||||
if ( writer == null ) throw new IllegalArgumentException("writer cannot be null");
|
||||
public static HaplotypeBAMWriter create(final Type type, final ReadDestination destination) {
|
||||
if ( destination == null ) throw new IllegalArgumentException("writer cannot be null");
|
||||
if ( type == null ) throw new IllegalArgumentException("type cannot be null");
|
||||
|
||||
switch ( type ) {
|
||||
case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(writer);
|
||||
case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(writer);
|
||||
case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(destination);
|
||||
case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(destination);
|
||||
default: throw new IllegalArgumentException("Unknown type " + type);
|
||||
}
|
||||
}
|
||||
|
|
@ -154,11 +139,10 @@ public abstract class HaplotypeBAMWriter {
|
|||
* Assumes that the header has been fully initialized with a single
|
||||
* read group READ_GROUP_ID
|
||||
*
|
||||
* @param bamWriter our output destination
|
||||
* @param output our output destination
|
||||
*/
|
||||
protected HaplotypeBAMWriter(SAMFileWriter bamWriter) {
|
||||
this.bamWriter = bamWriter;
|
||||
this.bamHeader = bamWriter.getFileHeader();
|
||||
protected HaplotypeBAMWriter(final ReadDestination output) {
|
||||
this.output = output;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -170,12 +154,18 @@ public abstract class HaplotypeBAMWriter {
|
|||
* @param calledHaplotypes a list of the haplotypes that were actually called as non-reference
|
||||
* @param stratifiedReadMap a map from sample -> likelihoods for each read for each of the best haplotypes
|
||||
*/
|
||||
public abstract void writeReadsAlignedToHaplotypes(final List<Haplotype> haplotypes,
|
||||
public abstract void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes,
|
||||
final GenomeLoc paddedReferenceLoc,
|
||||
final List<Haplotype> bestHaplotypes,
|
||||
final Collection<Haplotype> bestHaplotypes,
|
||||
final Set<Haplotype> calledHaplotypes,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap);
|
||||
|
||||
public void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes,
|
||||
final GenomeLoc paddedReferenceLoc,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
|
||||
writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, haplotypes, new HashSet<>(haplotypes), stratifiedReadMap);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out read aligned to haplotype to the BAM file
|
||||
*
|
||||
|
|
@ -185,13 +175,15 @@ public abstract class HaplotypeBAMWriter {
|
|||
* @param originalRead the read we want to write aligned to the reference genome
|
||||
* @param haplotype the haplotype that the read should be aligned to, before aligning to the reference
|
||||
* @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame.
|
||||
* @param isInformative true if the read is differentially informative for one of the haplotypes
|
||||
*/
|
||||
protected void writeReadAgainstHaplotype(final GATKSAMRecord originalRead,
|
||||
final Haplotype haplotype,
|
||||
final int referenceStart) {
|
||||
final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart);
|
||||
final int referenceStart,
|
||||
final boolean isInformative) {
|
||||
final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative);
|
||||
if ( alignedToRef != null )
|
||||
bamWriter.addAlignment(alignedToRef);
|
||||
output.add(alignedToRef);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -201,11 +193,13 @@ public abstract class HaplotypeBAMWriter {
|
|||
* @param originalRead the read we want to write aligned to the reference genome
|
||||
* @param haplotype the haplotype that the read should be aligned to, before aligning to the reference
|
||||
* @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame.
|
||||
* @param isInformative true if the read is differentially informative for one of the haplotypes
|
||||
* @return a GATKSAMRecord aligned to reference, or null if no meaningful alignment is possible
|
||||
*/
|
||||
protected GATKSAMRecord createReadAlignedToRef(final GATKSAMRecord originalRead,
|
||||
final Haplotype haplotype,
|
||||
final int referenceStart) {
|
||||
final int referenceStart,
|
||||
final boolean isInformative) {
|
||||
if ( originalRead == null ) throw new IllegalArgumentException("originalRead cannot be null");
|
||||
if ( haplotype == null ) throw new IllegalArgumentException("haplotype cannot be null");
|
||||
if ( haplotype.getCigar() == null ) throw new IllegalArgumentException("Haplotype cigar not set " + haplotype);
|
||||
|
|
@ -225,6 +219,10 @@ public abstract class HaplotypeBAMWriter {
|
|||
|
||||
addHaplotypeTag(read, haplotype);
|
||||
|
||||
// uninformative reads are set to zero mapping quality to enhance visualization
|
||||
if ( !isInformative )
|
||||
read.setMappingQuality(0);
|
||||
|
||||
// compute here the read starts w.r.t. the reference from the SW result and the hap -> ref cigar
|
||||
final Cigar extendedHaplotypeCigar = haplotype.getConsolidatedPaddedCigar(1000);
|
||||
final int readStartOnHaplotype = AlignmentUtils.calcFirstBaseMatchingReferenceInCigar(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1());
|
||||
|
|
@ -273,8 +271,9 @@ public abstract class HaplotypeBAMWriter {
|
|||
protected void writeHaplotypesAsReads(final Collection<Haplotype> haplotypes,
|
||||
final Set<Haplotype> bestHaplotypes,
|
||||
final GenomeLoc paddedReferenceLoc) {
|
||||
for ( final Haplotype haplotype : haplotypes )
|
||||
writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype));
|
||||
if ( isWriteHaplotypesAsWell() )
|
||||
for ( final Haplotype haplotype : haplotypes )
|
||||
writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -287,7 +286,7 @@ public abstract class HaplotypeBAMWriter {
|
|||
private void writeHaplotype(final Haplotype haplotype,
|
||||
final GenomeLoc paddedRefLoc,
|
||||
final boolean isAmongBestHaplotypes) {
|
||||
final GATKSAMRecord record = new GATKSAMRecord(bamHeader);
|
||||
final GATKSAMRecord record = new GATKSAMRecord(output.getHeader());
|
||||
record.setReadBases(haplotype.getBases());
|
||||
record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef());
|
||||
record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length));
|
||||
|
|
@ -299,6 +298,14 @@ public abstract class HaplotypeBAMWriter {
|
|||
record.setReferenceIndex(paddedRefLoc.getContigIndex());
|
||||
record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID);
|
||||
record.setFlags(16);
|
||||
bamWriter.addAlignment(record);
|
||||
output.add(record);
|
||||
}
|
||||
|
||||
public boolean isWriteHaplotypesAsWell() {
|
||||
return writeHaplotypesAsWell;
|
||||
}
|
||||
|
||||
public void setWriteHaplotypesAsWell(boolean writeHaplotypesAsWell) {
|
||||
this.writeHaplotypesAsWell = writeHaplotypesAsWell;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,135 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.haplotypeBAMWriter;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Utility class that allows us to easily create destinations for the HaplotypeBAMWriters
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 6/19/13
|
||||
* Time: 10:19 AM
|
||||
*/
|
||||
public abstract class ReadDestination {
|
||||
public abstract void add(final GATKSAMRecord read);
|
||||
|
||||
private final SAMFileHeader bamHeader;
|
||||
|
||||
public SAMFileHeader getHeader() {
|
||||
return bamHeader;
|
||||
}
|
||||
|
||||
protected ReadDestination(final SAMFileHeader header, final String readGroupID) {
|
||||
// prepare the bam header
|
||||
if ( header == null ) throw new IllegalArgumentException("header cannot be null");
|
||||
bamHeader = new SAMFileHeader();
|
||||
bamHeader.setSequenceDictionary(header.getSequenceDictionary());
|
||||
bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
|
||||
|
||||
// include the original read groups plus a new artificial one for the haplotypes
|
||||
final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>(header.getReadGroups());
|
||||
final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupID);
|
||||
rg.setSample("HC");
|
||||
rg.setSequencingCenter("BI");
|
||||
readGroups.add(rg);
|
||||
bamHeader.setReadGroups(readGroups);
|
||||
}
|
||||
|
||||
public static class ToBAM extends ReadDestination {
|
||||
final SAMFileWriter bamWriter;
|
||||
|
||||
/**
|
||||
* Create a ReadDestination that writes to a BAM file
|
||||
*/
|
||||
public ToBAM(final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header, final String readGroupID) {
|
||||
super(header, readGroupID);
|
||||
if ( stingSAMWriter == null ) throw new IllegalArgumentException("writer cannot be null");
|
||||
|
||||
bamWriter = stingSAMWriter;
|
||||
stingSAMWriter.setPresorted(false);
|
||||
stingSAMWriter.writeHeader(getHeader());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(GATKSAMRecord read) {
|
||||
bamWriter.addAlignment(read);
|
||||
}
|
||||
}
|
||||
|
||||
public static class ToList extends ReadDestination {
|
||||
final List<GATKSAMRecord> reads = new LinkedList<>();
|
||||
|
||||
/**
|
||||
* Create a ReadDestination that captures the output reads in a list of reads
|
||||
*/
|
||||
public ToList(SAMFileHeader header, String readGroupID) {
|
||||
super(header, readGroupID);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(GATKSAMRecord read) {
|
||||
reads.add(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the reads that have been written to this destination
|
||||
* @return a non-null list of reads
|
||||
*/
|
||||
public List<GATKSAMRecord> getReads() {
|
||||
return reads;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -55,7 +55,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
|||
* User: rpoplin, carneiro
|
||||
* Date: 10/16/12
|
||||
*/
|
||||
public final class LoglessPairHMM extends PairHMM {
|
||||
public final class LoglessPairHMM extends N2MemoryPairHMM {
|
||||
protected static final double INITIAL_CONDITION = Math.pow(2, 1020);
|
||||
protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION);
|
||||
|
||||
|
|
@ -99,8 +99,13 @@ public final class LoglessPairHMM extends PairHMM {
|
|||
}
|
||||
}
|
||||
|
||||
if ( ! constantsAreInitialized || recacheReadValues )
|
||||
initializeProbabilities(insertionGOP, deletionGOP, overallGCP);
|
||||
if ( ! constantsAreInitialized || recacheReadValues ) {
|
||||
initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP);
|
||||
|
||||
// note that we initialized the constants
|
||||
constantsAreInitialized = true;
|
||||
}
|
||||
|
||||
initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex);
|
||||
|
||||
for (int i = 1; i < paddedReadLength; i++) {
|
||||
|
|
@ -159,7 +164,7 @@ public final class LoglessPairHMM extends PairHMM {
|
|||
"overallGCP != null"
|
||||
})
|
||||
@Ensures("constantsAreInitialized")
|
||||
private void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) {
|
||||
protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) {
|
||||
for (int i = 0; i < insertionGOP.length; i++) {
|
||||
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
|
||||
transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP);
|
||||
|
|
@ -169,9 +174,6 @@ public final class LoglessPairHMM extends PairHMM {
|
|||
transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]);
|
||||
transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
}
|
||||
|
||||
// note that we initialized the constants
|
||||
constantsAreInitialized = true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,162 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
/**
|
||||
* Useful single class carrying test data for PairHMMs (for use in benchmarking and unit tests)
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 5/12/13
|
||||
* Time: 3:52 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PairHMMTestData {
|
||||
public final String ref;
|
||||
private final String read;
|
||||
public final byte[] baseQuals, insQuals, delQuals, gcp;
|
||||
public final double log10l;
|
||||
|
||||
PairHMMTestData(String ref, String read, byte[] baseQuals, byte[] insQuals, byte[] delQuals, byte[] gcp, double log10l) {
|
||||
this.ref = ref;
|
||||
this.read = read;
|
||||
this.baseQuals = baseQuals;
|
||||
this.insQuals = insQuals;
|
||||
this.delQuals = delQuals;
|
||||
this.gcp = gcp;
|
||||
this.log10l = log10l;
|
||||
}
|
||||
|
||||
PairHMMTestData(String ref, String read, final byte qual) {
|
||||
this.ref = ref;
|
||||
this.read = read;
|
||||
this.baseQuals = this.insQuals = this.delQuals = Utils.dupBytes(qual, read.length());
|
||||
this.gcp = Utils.dupBytes((byte)10, read.length());
|
||||
this.log10l = -1;
|
||||
}
|
||||
|
||||
public double runHMM(final PairHMM hmm) {
|
||||
hmm.initialize(getRead().length(), ref.length());
|
||||
return hmm.computeReadLikelihoodGivenHaplotypeLog10(ref.getBytes(), getRead().getBytes(),
|
||||
baseQuals, insQuals, delQuals, gcp, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Info{" +
|
||||
"ref='" + ref + '\'' +
|
||||
", read='" + getRead() + '\'' +
|
||||
", log10l=" + log10l +
|
||||
'}';
|
||||
}
|
||||
|
||||
public static void runHMMs(final PairHMM hmm, final List<PairHMMTestData> data, final boolean runSingly) {
|
||||
if ( runSingly ) {
|
||||
for ( final PairHMMTestData datum : data )
|
||||
datum.runHMM(hmm);
|
||||
} else {
|
||||
// running in batch mode
|
||||
final PairHMMTestData first = data.get(0);
|
||||
int maxHaplotypeLen = calcMaxHaplotypeLen(data);
|
||||
hmm.initialize(first.getRead().length(), maxHaplotypeLen);
|
||||
for ( final PairHMMTestData datum : data ) {
|
||||
hmm.computeReadLikelihoodGivenHaplotypeLog10(datum.ref.getBytes(), datum.getRead().getBytes(),
|
||||
datum.baseQuals, datum.insQuals, datum.delQuals, datum.gcp, false);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static int calcMaxHaplotypeLen(final List<PairHMMTestData> data) {
|
||||
int maxHaplotypeLen = 0;
|
||||
for ( final PairHMMTestData datum : data )
|
||||
maxHaplotypeLen = Math.max(maxHaplotypeLen, datum.ref.length());
|
||||
return maxHaplotypeLen;
|
||||
}
|
||||
|
||||
public static Map<String, List<PairHMMTestData>> readLikelihoods(final File file) throws IOException {
|
||||
final Map<String, List<PairHMMTestData>> results = new LinkedHashMap<>();
|
||||
|
||||
InputStream in = new FileInputStream(file);
|
||||
if ( file.getName().endsWith(".gz") ) {
|
||||
in = new GZIPInputStream(in);
|
||||
}
|
||||
|
||||
for ( final String line : new XReadLines(in) ) {
|
||||
final String[] parts = line.split(" ");
|
||||
final PairHMMTestData info = new PairHMMTestData(
|
||||
parts[0], parts[1],
|
||||
SAMUtils.fastqToPhred(parts[2]),
|
||||
SAMUtils.fastqToPhred(parts[3]),
|
||||
SAMUtils.fastqToPhred(parts[4]),
|
||||
SAMUtils.fastqToPhred(parts[5]),
|
||||
Double.parseDouble(parts[6]));
|
||||
|
||||
if ( ! results.containsKey(info.read) ) {
|
||||
results.put(info.read, new LinkedList<PairHMMTestData>());
|
||||
}
|
||||
final List<PairHMMTestData> byHap = results.get(info.read);
|
||||
byHap.add(info);
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
public String getRead() {
|
||||
return read;
|
||||
}
|
||||
}
|
||||
|
|
@ -70,9 +70,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
|||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -223,6 +221,150 @@ public class RecalUtils {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Component used to print out csv representation of the reports that can be use to perform analysis in
|
||||
* external tools. E.g. generate plots using R scripts.
|
||||
* <p/>
|
||||
* A header is always printed into the output stream (or file) when the printer is created. Then you only need
|
||||
* to call {@link #print(RecalibrationReport,String) print} for each report you want to include in the csv file.
|
||||
* Once finished, you close the printer calling {@link #close() close}
|
||||
*
|
||||
*/
|
||||
private static class CsvPrinter {
|
||||
|
||||
private final PrintStream ps;
|
||||
private final Covariate[] covariates;
|
||||
|
||||
/**
|
||||
* Constructs a printer redirected to an output file.
|
||||
* @param out the output file.
|
||||
* @param c covariates to print out.
|
||||
* @throws FileNotFoundException if the file could not be created anew.
|
||||
*/
|
||||
protected CsvPrinter(final File out, final Covariate ... c)
|
||||
throws FileNotFoundException {
|
||||
this(new FileOutputStream(out), c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a printer redirected to an output stream
|
||||
* @param os the output.
|
||||
* @param c covariates to print out.
|
||||
*/
|
||||
protected CsvPrinter(final OutputStream os, final Covariate ... c) {
|
||||
covariates = c == null ? new Covariate[0] : c.clone();
|
||||
ps = new PrintStream(os);
|
||||
printHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* Prints the header out.
|
||||
* <p/>
|
||||
* Should only be invoked at creation.
|
||||
*/
|
||||
protected void printHeader() {
|
||||
RecalUtils.printHeader(ps);
|
||||
}
|
||||
|
||||
/**
|
||||
* Prints out a report into the csv file.
|
||||
*
|
||||
*
|
||||
* @param report the report to print out.
|
||||
* @param mode the report associated mode. (typically ORIGINAL, RECALIBRATED
|
||||
*/
|
||||
public void print(final RecalibrationReport report, final String mode) {
|
||||
RecalUtils.writeCSV(ps,report.getRecalibrationTables(),mode,covariates,false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the csv printer.
|
||||
*
|
||||
* No further output will be allowed or take place after calling this method.
|
||||
*/
|
||||
public void close() {
|
||||
ps.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a csv output printer.
|
||||
*
|
||||
* @param out the output file. It will be overridden
|
||||
* @param c list of covariates to print out.
|
||||
*
|
||||
* @throws FileNotFoundException if <code>out</code> could not be created anew.
|
||||
*
|
||||
* @return never <code>null</code>
|
||||
*/
|
||||
protected static CsvPrinter csvPrinter(final File out, final Covariate ... c)
|
||||
throws FileNotFoundException
|
||||
{
|
||||
if (c == null) {
|
||||
throw new IllegalArgumentException("the input covariate array cannot be null");
|
||||
}
|
||||
return new CsvPrinter(out,c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Prints out a collection of reports into a file in Csv format in a way
|
||||
* that can be used by R scripts (such as the plot generator script).
|
||||
* <p/>
|
||||
* The set of covariates is take as the minimum common set from all reports.
|
||||
*
|
||||
* @param out the output file. It will be overridden.
|
||||
* @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...)
|
||||
* of each report and the corresponding value the report itself.
|
||||
* @throws FileNotFoundException if <code>out</code> could not be created anew.
|
||||
*/
|
||||
public static void generateCsv(final File out, final Map<String, RecalibrationReport> reports)
|
||||
throws FileNotFoundException {
|
||||
if (reports.size() == 0) {
|
||||
writeCsv(out, reports, new Covariate[0]);
|
||||
} else {
|
||||
final Iterator<RecalibrationReport> rit = reports.values().iterator();
|
||||
final RecalibrationReport first = rit.next();
|
||||
final Covariate[] firstCovariates = first.getRequestedCovariates();
|
||||
final Set<Covariate> covariates = new LinkedHashSet<>();
|
||||
Utils.addAll(covariates,firstCovariates);
|
||||
while (rit.hasNext() && covariates.size() > 0) {
|
||||
final Covariate[] nextCovariates = rit.next().getRequestedCovariates();
|
||||
final Set<String> nextCovariateNames = new LinkedHashSet<String>(nextCovariates.length);
|
||||
for (final Covariate nc : nextCovariates) {
|
||||
nextCovariateNames.add(nc.getClass().getSimpleName());
|
||||
}
|
||||
final Iterator<Covariate> cit = covariates.iterator();
|
||||
while (cit.hasNext()) {
|
||||
if (!nextCovariateNames.contains(cit.next().getClass().getSimpleName())) {
|
||||
cit.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
writeCsv(out, reports, covariates.toArray(new Covariate[covariates.size()]));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Print out a collection of reports into a file in Csv format in a way
|
||||
* that can be used by R scripts (such as the plot generator script).
|
||||
*
|
||||
* @param out
|
||||
* @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...)
|
||||
* of each report and the corresponding value the report itself.
|
||||
* @param c the covariates to print out.
|
||||
* @throws FileNotFoundException if <code>out</code> could not be created anew.
|
||||
*/
|
||||
private static void writeCsv(final File out,
|
||||
final Map<String, RecalibrationReport> reports, final Covariate[] c)
|
||||
throws FileNotFoundException {
|
||||
final CsvPrinter p = csvPrinter(out,c);
|
||||
for (Map.Entry<String,RecalibrationReport> e : reports.entrySet()) {
|
||||
p.print(e.getValue(),e.getKey());
|
||||
}
|
||||
p.close();
|
||||
}
|
||||
|
||||
public enum SOLID_RECAL_MODE {
|
||||
/**
|
||||
* Treat reference inserted bases as reference matching bases. Very unsafe!
|
||||
|
|
@ -390,36 +532,66 @@ public class RecalUtils {
|
|||
report.print(outputFile);
|
||||
}
|
||||
|
||||
private static void outputRecalibrationPlot(final RecalibrationArgumentCollection RAC) {
|
||||
|
||||
/**
|
||||
* Write recalibration plots into a file
|
||||
*
|
||||
* @param csvFile location of the intermediary file
|
||||
* @param exampleReportFile where the report arguments are collected from.
|
||||
* @param output result plot file name.
|
||||
*/
|
||||
public static void generatePlots(final File csvFile, final File exampleReportFile, final File output) {
|
||||
final RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.setExceptOnError(true);
|
||||
executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class));
|
||||
executor.addArgs(RAC.RECAL_CSV_FILE.getAbsolutePath());
|
||||
executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath());
|
||||
executor.addArgs(RAC.RECAL_PDF_FILE.getAbsolutePath());
|
||||
executor.addArgs(csvFile.getAbsolutePath());
|
||||
executor.addArgs(exampleReportFile.getAbsolutePath());
|
||||
executor.addArgs(output.getAbsolutePath());
|
||||
Logger.getLogger(RecalUtils.class).debug("R command line: " + executor.getApproximateCommandLine());
|
||||
executor.exec();
|
||||
}
|
||||
|
||||
private static void outputRecalibrationPlot(final File csvFile, final RecalibrationArgumentCollection RAC) {
|
||||
|
||||
final RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class));
|
||||
executor.addArgs(csvFile.getAbsolutePath());
|
||||
executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath());
|
||||
executor.exec();
|
||||
}
|
||||
|
||||
/**
|
||||
* Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead.
|
||||
*
|
||||
* @deprecated
|
||||
*/
|
||||
@Deprecated
|
||||
public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final Covariate[] requestedCovariates) {
|
||||
generateRecalibrationPlot(RAC, original, null, requestedCovariates);
|
||||
}
|
||||
|
||||
/**
|
||||
* Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead.
|
||||
*
|
||||
* @deprecated
|
||||
*/
|
||||
@Deprecated
|
||||
public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates) {
|
||||
final PrintStream csvFile;
|
||||
final PrintStream csvStream;
|
||||
final File csvTempFile = null;
|
||||
try {
|
||||
if ( RAC.RECAL_CSV_FILE == null ) {
|
||||
RAC.RECAL_CSV_FILE = File.createTempFile("BQSR", ".csv");
|
||||
RAC.RECAL_CSV_FILE.deleteOnExit();
|
||||
}
|
||||
csvFile = new PrintStream(RAC.RECAL_CSV_FILE);
|
||||
File csvTmpFile = File.createTempFile("BQSR",".csv");
|
||||
csvTmpFile.deleteOnExit();
|
||||
csvStream = new PrintStream(csvTmpFile);
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_CSV_FILE, e);
|
||||
throw new UserException("Could not create temporary csv file", e);
|
||||
}
|
||||
|
||||
if ( recalibrated != null )
|
||||
writeCSV(csvFile, recalibrated, "RECALIBRATED", requestedCovariates, true);
|
||||
writeCSV(csvFile, original, "ORIGINAL", requestedCovariates, recalibrated == null);
|
||||
outputRecalibrationPlot(RAC);
|
||||
writeCSV(csvStream, recalibrated, "RECALIBRATED", requestedCovariates, true);
|
||||
writeCSV(csvStream, original, "ORIGINAL", requestedCovariates, recalibrated == null);
|
||||
csvStream.close();
|
||||
outputRecalibrationPlot(csvTempFile, RAC);
|
||||
csvTempFile.delete();
|
||||
}
|
||||
|
||||
private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) {
|
||||
|
|
@ -452,18 +624,7 @@ public class RecalUtils {
|
|||
|
||||
// output the csv file
|
||||
if (printHeader) {
|
||||
final List<String> header = new LinkedList<String>();
|
||||
header.add("ReadGroup");
|
||||
header.add("CovariateValue");
|
||||
header.add("CovariateName");
|
||||
header.add("EventType");
|
||||
header.add("Observations");
|
||||
header.add("Errors");
|
||||
header.add("EmpiricalQuality");
|
||||
header.add("AverageReportedQuality");
|
||||
header.add("Accuracy");
|
||||
header.add("Recalibration");
|
||||
deltaTableFile.println(Utils.join(",", header));
|
||||
printHeader(deltaTableFile);
|
||||
}
|
||||
|
||||
final Map<Covariate, String> covariateNameMap = new HashMap<Covariate, String>(requestedCovariates.length);
|
||||
|
|
@ -480,6 +641,21 @@ public class RecalUtils {
|
|||
}
|
||||
}
|
||||
|
||||
private static void printHeader(PrintStream out) {
|
||||
final List<String> header = new LinkedList<String>();
|
||||
header.add("ReadGroup");
|
||||
header.add("CovariateValue");
|
||||
header.add("CovariateName");
|
||||
header.add("EventType");
|
||||
header.add("Observations");
|
||||
header.add("Errors");
|
||||
header.add("EmpiricalQuality");
|
||||
header.add("AverageReportedQuality");
|
||||
header.add("Accuracy");
|
||||
header.add("Recalibration");
|
||||
out.println(Utils.join(",", header));
|
||||
}
|
||||
|
||||
/*
|
||||
* Return an initialized nested integer array with appropriate dimensions for use with the delta tables
|
||||
*
|
||||
|
|
|
|||
|
|
@ -340,9 +340,6 @@ public class RecalibrationReport {
|
|||
else if (argument.equals("recalibration_report"))
|
||||
RAC.existingRecalibrationReport = (value == null) ? null : new File((String) value);
|
||||
|
||||
else if (argument.equals("plot_pdf_file"))
|
||||
RAC.RECAL_PDF_FILE = (value == null) ? null : new File((String) value);
|
||||
|
||||
else if (argument.equals("binary_tag_name"))
|
||||
RAC.BINARY_TAG_NAME = (value == null) ? null : (String) value;
|
||||
|
||||
|
|
@ -369,6 +366,11 @@ public class RecalibrationReport {
|
|||
return RAC;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @deprecated use {@link #getRequestedCovariates()} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public Covariate[] getCovariates() {
|
||||
return requestedCovariates;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -67,6 +67,8 @@ import java.util.ArrayList;
|
|||
public class ContextCovariate implements StandardCovariate {
|
||||
private final static Logger logger = Logger.getLogger(ContextCovariate.class);
|
||||
|
||||
|
||||
|
||||
private int mismatchesContextSize;
|
||||
private int indelsContextSize;
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,151 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.compression.reducereads.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.compression.reducereads.BaseCounts;
|
||||
import org.broadinstitute.sting.utils.MannWhitneyU;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
public class RankSumUnitTest {
|
||||
|
||||
List<Integer> distribution20, distribution30, distribution20_40;
|
||||
static final int observations = 100;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
distribution20 = new ArrayList<>(observations);
|
||||
distribution30 = new ArrayList<>(observations);
|
||||
distribution20_40 = new ArrayList<>(observations);
|
||||
|
||||
final int skew = 3;
|
||||
makeDistribution(distribution20, 20, skew, observations);
|
||||
makeDistribution(distribution30, 30, skew, observations);
|
||||
makeDistribution(distribution20_40, 20, skew, observations/2);
|
||||
makeDistribution(distribution20_40, 40, skew, observations/2);
|
||||
|
||||
// shuffle the observations
|
||||
Collections.shuffle(distribution20);
|
||||
Collections.shuffle(distribution30);
|
||||
Collections.shuffle(distribution20_40);
|
||||
}
|
||||
|
||||
private static void makeDistribution(final List<Integer> result, final int target, final int skew, final int numObservations) {
|
||||
final int rangeStart = target - skew;
|
||||
final int rangeEnd = target + skew;
|
||||
|
||||
int current = rangeStart;
|
||||
for ( int i = 0; i < numObservations; i++ ) {
|
||||
result.add(current++);
|
||||
if ( current > rangeEnd )
|
||||
current = rangeStart;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "DistributionData")
|
||||
public Object[][] makeDistributionData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final int numToReduce : Arrays.asList(0, 10, 50, 100) ) {
|
||||
tests.add(new Object[]{distribution20, distribution20, numToReduce, true, "20-20"});
|
||||
tests.add(new Object[]{distribution30, distribution30, numToReduce, true, "30-30"});
|
||||
tests.add(new Object[]{distribution20_40, distribution20_40, numToReduce, true, "20/40-20/40"});
|
||||
|
||||
tests.add(new Object[]{distribution20, distribution30, numToReduce, false, "20-30"});
|
||||
tests.add(new Object[]{distribution30, distribution20, numToReduce, false, "30-20"});
|
||||
|
||||
tests.add(new Object[]{distribution20, distribution20_40, numToReduce, false, "20-20/40"});
|
||||
tests.add(new Object[]{distribution30, distribution20_40, numToReduce, true, "30-20/40"});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "DistributionData")
|
||||
public void testDistribution(final List<Integer> distribution1, final List<Integer> distribution2, final int numToReduceIn2, final boolean distributionsShouldBeEqual, final String debugString) {
|
||||
final MannWhitneyU mannWhitneyU = new MannWhitneyU(true);
|
||||
|
||||
for ( final Integer num : distribution1 )
|
||||
mannWhitneyU.add(num, MannWhitneyU.USet.SET1);
|
||||
|
||||
final List<Integer> dist2 = new ArrayList<>(distribution2);
|
||||
if ( numToReduceIn2 > 0 ) {
|
||||
final org.broadinstitute.sting.gatk.walkers.compression.reducereads.BaseCounts counts = new BaseCounts();
|
||||
for ( int i = 0; i < numToReduceIn2; i++ ) {
|
||||
final int value = dist2.remove(0);
|
||||
counts.incr(BaseIndex.A, (byte)value, 0, false);
|
||||
}
|
||||
|
||||
final int qual = (int)counts.averageQualsOfBase(BaseIndex.A);
|
||||
for ( int i = 0; i < numToReduceIn2; i++ )
|
||||
dist2.add(qual);
|
||||
}
|
||||
|
||||
for ( final Integer num : dist2 )
|
||||
mannWhitneyU.add(num, MannWhitneyU.USet.SET2);
|
||||
|
||||
final Double result = mannWhitneyU.runTwoSidedTest().second;
|
||||
Assert.assertFalse(Double.isNaN(result));
|
||||
|
||||
if ( distributionsShouldBeEqual ) {
|
||||
// TODO -- THIS IS THE FAILURE POINT OF USING REDUCED READS WITH RANK SUM TESTS
|
||||
if ( numToReduceIn2 >= observations / 2 )
|
||||
return;
|
||||
Assert.assertTrue(result > 0.1, String.format("%f %d %d", result, numToReduceIn2, dist2.get(0)));
|
||||
} else {
|
||||
Assert.assertTrue(result < 0.01, String.format("%f %d %d", result, numToReduceIn2, dist2.get(0)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -78,7 +78,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
public void testHasAnnotsAsking1() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("42889072698af972f2004ccfe8eae15e"));
|
||||
Arrays.asList("823868a4b5b5ec2cdf080c059d04d31a"));
|
||||
executeTest("test file has annotations, asking for annotations, #1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -112,7 +112,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
public void testNoAnnotsAsking1() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("7e755bb09169699b76850e76b71a5f5a"));
|
||||
Arrays.asList("6f873b3152db291e18e3a04fbce2e117"));
|
||||
executeTest("test file doesn't have annotations, asking for annotations, #1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -128,7 +128,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
public void testExcludeAnnotations() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("e17596007d0db7673d138a9ae4890e82"));
|
||||
Arrays.asList("552c2ad9dbfaa85d51d2def93c8229c6"));
|
||||
executeTest("test exclude annotations", spec);
|
||||
}
|
||||
|
||||
|
|
@ -172,6 +172,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
executeTest("getting DB tag with HM3", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDBTagWithTwoComps() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf --comp:foo " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1,
|
||||
Arrays.asList("6afbf05090ae139f53467cf6e0e71cf4"));
|
||||
executeTest("getting DB tag with 2 comps", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNoQuals() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -0,0 +1,164 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
public class VariantOverlapAnnotatorUnitTest extends BaseTest {
|
||||
private GenomeLocParser genomeLocParser;
|
||||
private IndexedFastaSequenceFile seq;
|
||||
|
||||
@BeforeClass
|
||||
public void setup() throws FileNotFoundException {
|
||||
// sequence
|
||||
seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
}
|
||||
|
||||
private VariantContext makeVC(final String source, final String id, final List<String> alleles) {
|
||||
final VariantContext vc = GATKVariantContextUtils.makeFromAlleles(source, "20", 10, alleles);
|
||||
return new VariantContextBuilder(vc).id(id).make();
|
||||
}
|
||||
|
||||
private VariantOverlapAnnotator makeAnnotator(final String dbSNP, final String ... overlaps) {
|
||||
final RodBinding<VariantContext> dbSNPBinding = dbSNP == null ? null : new RodBinding<>(VariantContext.class, dbSNP);
|
||||
final Map<RodBinding<VariantContext>, String> overlapBinding = new LinkedHashMap<>();
|
||||
for ( final String overlap : overlaps ) overlapBinding.put(new RodBinding<>(VariantContext.class, overlap), overlap);
|
||||
return new VariantOverlapAnnotator(dbSNPBinding, overlapBinding, genomeLocParser);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreateWithSpecialNames() {
|
||||
final List<String> names = Arrays.asList("X", "Y", "Z");
|
||||
final Map<RodBinding<VariantContext>, String> overlapBinding = new LinkedHashMap<>();
|
||||
for ( final String overlap : names ) overlapBinding.put(new RodBinding<>(VariantContext.class, overlap + "Binding"), overlap);
|
||||
final VariantOverlapAnnotator annotator = new VariantOverlapAnnotator(null, overlapBinding, genomeLocParser);
|
||||
Assert.assertEquals(annotator.getOverlapNames(), names);
|
||||
}
|
||||
|
||||
@DataProvider(name = "AnnotateRsIDData")
|
||||
public Object[][] makeAnnotateRsIDData() {
|
||||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||
final VariantContext callNoIDAC = makeVC("call", VCFConstants.EMPTY_ID_FIELD, Arrays.asList("A", "C"));
|
||||
final VariantContext callNoIDAT = makeVC("call", VCFConstants.EMPTY_ID_FIELD, Arrays.asList("A", "T"));
|
||||
final VariantContext callIDAC = makeVC("call", "foo", Arrays.asList("A", "C"));
|
||||
final VariantContext callExistingIDAC = makeVC("call", "rsID1", Arrays.asList("A", "C"));
|
||||
|
||||
final VariantContext dbSNP_AC = makeVC("DBSNP", "rsID1", Arrays.asList("A", "C"));
|
||||
final VariantContext dbSNP_AT = makeVC("DBSNP", "rsID2", Arrays.asList("A", "T"));
|
||||
final VariantContext dbSNP_AG = makeVC("DBSNP", "rsID3", Arrays.asList("A", "G"));
|
||||
final VariantContext dbSNP_AC_AT = makeVC("DBSNP", "rsID1;rsID2", Arrays.asList("A", "C", "T"));
|
||||
final VariantContext dbSNP_AC_AG = makeVC("DBSNP", "rsID1;rsID3", Arrays.asList("A", "C", "G"));
|
||||
|
||||
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC), dbSNP_AC.getID(), true});
|
||||
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AT), VCFConstants.EMPTY_ID_FIELD, false});
|
||||
tests.add(new Object[]{callIDAC, Arrays.asList(dbSNP_AC), "foo" + ";" + dbSNP_AC.getID(), true});
|
||||
tests.add(new Object[]{callIDAC, Arrays.asList(dbSNP_AT), "foo", false});
|
||||
tests.add(new Object[]{callExistingIDAC, Arrays.asList(dbSNP_AC), "rsID1", true});
|
||||
tests.add(new Object[]{callExistingIDAC, Arrays.asList(dbSNP_AT), "rsID1", false});
|
||||
|
||||
final VariantContext callNoIDACT = makeVC("call", VCFConstants.EMPTY_ID_FIELD, Arrays.asList("A", "C", "T"));
|
||||
tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AC), dbSNP_AC.getID(), true});
|
||||
tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AT), dbSNP_AT.getID(), true});
|
||||
tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AG), VCFConstants.EMPTY_ID_FIELD, false});
|
||||
tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AC_AT), dbSNP_AC_AT.getID(), true});
|
||||
tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AC_AG), dbSNP_AC_AG.getID(), true});
|
||||
|
||||
// multiple options
|
||||
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC, dbSNP_AT), "rsID1", true});
|
||||
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AT, dbSNP_AC), "rsID1", true});
|
||||
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC_AT), "rsID1;rsID2", true});
|
||||
tests.add(new Object[]{callNoIDAT, Arrays.asList(dbSNP_AC_AT), "rsID1;rsID2", true});
|
||||
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC_AG), "rsID1;rsID3", true});
|
||||
tests.add(new Object[]{callNoIDAT, Arrays.asList(dbSNP_AC_AG), VCFConstants.EMPTY_ID_FIELD, false});
|
||||
|
||||
final VariantContext dbSNP_AC_FAIL = new VariantContextBuilder(makeVC("DBSNP", "rsID1", Arrays.asList("A", "C"))).filter("FAIL").make();
|
||||
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC_FAIL), VCFConstants.EMPTY_ID_FIELD, false});
|
||||
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "AnnotateRsIDData")
|
||||
public void testAnnotateRsID(final VariantContext toAnnotate, final List<VariantContext> dbSNPRecords, final String expectedID, final boolean expectOverlap) throws Exception {
|
||||
final VariantOverlapAnnotator annotator = makeAnnotator("dbnsp");
|
||||
final VariantContext annotated = annotator.annotateRsID(dbSNPRecords, toAnnotate);
|
||||
Assert.assertNotNull(annotated);
|
||||
Assert.assertEquals(annotated.getID(), expectedID);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "AnnotateRsIDData")
|
||||
public void testAnnotateOverlaps(final VariantContext toAnnotate, final List<VariantContext> records, final String expectedID, final boolean expectOverlap) throws Exception {
|
||||
final String name = "binding";
|
||||
final VariantOverlapAnnotator annotator = makeAnnotator(null, name);
|
||||
final VariantContext annotated = annotator.annotateOverlap(records, name, toAnnotate);
|
||||
Assert.assertNotNull(annotated);
|
||||
Assert.assertEquals(annotated.getID(), toAnnotate.getID(), "Shouldn't modify annotation");
|
||||
Assert.assertEquals(annotated.hasAttribute(name), expectOverlap);
|
||||
if ( expectOverlap ) {
|
||||
Assert.assertEquals(annotated.getAttribute(name), true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -62,7 +62,7 @@ public class BeagleIntegrationTest extends WalkerTest {
|
|||
"--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " +
|
||||
"--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " +
|
||||
"--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " +
|
||||
"-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("c5522304abf0633041c7772dd7dafcea"));
|
||||
"-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("989449fa3e262b88ba126867fa3ad9fb"));
|
||||
spec.disableShadowBCF();
|
||||
executeTest("test BeagleOutputToVCF", spec);
|
||||
}
|
||||
|
|
@ -96,7 +96,7 @@ public class BeagleIntegrationTest extends WalkerTest {
|
|||
"--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+
|
||||
"--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+
|
||||
"--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+
|
||||
"-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("d8906b67c7f9fdb5b37b8e9e050982d3"));
|
||||
"-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("e036636fcd6a748ede4a70ea47941d47"));
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testBeagleChangesSitesToRef",spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,362 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.*;
|
||||
|
||||
import static org.testng.Assert.assertTrue;
|
||||
|
||||
/**
|
||||
* Tests Analyze Covariates.
|
||||
* <p/>
|
||||
* Notice that since the PDF reports generated by R differ every time this program
|
||||
* is executed, their content is not tested; the tests only verify that the file has a healthy size.
|
||||
*
|
||||
*/
|
||||
public class AnalyzeCovariatesIntegrationTest extends WalkerTest {
|
||||
|
||||
private static final String TOOL_NAME = AnalyzeCovariates.class.getSimpleName();
|
||||
|
||||
/**
|
||||
* Directory where the testdata is located.
|
||||
*/
|
||||
private static final File TEST_DATA_DIR = new File(privateTestDir,"AnalyzeCovariates");
|
||||
|
||||
/**
|
||||
* File containing the before report for normal testing.
|
||||
*/
|
||||
private static final File BEFORE_FILE = new File(TEST_DATA_DIR,"before.table");
|
||||
|
||||
/**
|
||||
* File containing the after report for normal testing.
|
||||
*/
|
||||
private static final File AFTER_FILE = new File(TEST_DATA_DIR,"after.table");
|
||||
|
||||
|
||||
/**
|
||||
* File containing the bqsr report for normal testing.
|
||||
*/
|
||||
private static final File BQSR_FILE = new File(TEST_DATA_DIR,"bqsr.table");
|
||||
|
||||
/**
|
||||
* Test the content of the generated csv file.
|
||||
*
|
||||
* @throws IOException should never happen. It would be an indicator of a
|
||||
* problem with the testing environment.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testCsvGeneration()
|
||||
throws IOException {
|
||||
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(
|
||||
buildCommandLine("%s",null,true,true,true),
|
||||
Collections.singletonList("106709d32e6f0a0a9dd6a6340ec246ab"));
|
||||
executeTest("testCsvGeneration",spec);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Test the size of the generated pdf.
|
||||
* <p/>
|
||||
* Unfortunately we cannot test the content as it changes slightly
|
||||
* every time the tool is run.
|
||||
*
|
||||
* @throws IOException should never happen. It would be an
|
||||
* indicator of a problem with the testing environment.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testPdfGeneration()
|
||||
throws IOException {
|
||||
final File pdfFile = File.createTempFile("ACTest",".pdf");
|
||||
pdfFile.delete();
|
||||
pdfFile.deleteOnExit();
|
||||
|
||||
final List<String> md5 = Collections.emptyList();
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(
|
||||
buildCommandLine(null,pdfFile.toString(),true,true,true),md5);
|
||||
executeTest("testPdfGeneration",spec);
|
||||
assertTrue(pdfFile.exists(),"the pdf file was not created");
|
||||
assertTrue(pdfFile.length() > 260000,"the pdf file size does"
|
||||
+ " not reach the minimum of 260Kb");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the effect of changing some recalibration parameters.
|
||||
* @param afterFileName name of the alternative after recalibration file.
|
||||
* @param description describes what has been changed.
|
||||
* @throws IOException should never happen. It would be an
|
||||
* indicator of a problem with the testing environment.
|
||||
*/
|
||||
@Test(enabled = true, dataProvider="alternativeAfterFileProvider")
|
||||
public void testParameterChangeException(final String afterFileName,
|
||||
final String description)
|
||||
throws IOException {
|
||||
|
||||
final File pdfFile = File.createTempFile("ACTest",".pdf");
|
||||
pdfFile.deleteOnExit();
|
||||
final List<String> md5 = Collections.emptyList();
|
||||
final File afterFile = new File(TEST_DATA_DIR,afterFileName);
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(
|
||||
buildCommandLine(null,"%s",true,true,afterFile),
|
||||
1,UserException.IncompatibleRecalibrationTableParameters.class);
|
||||
executeTest("testParameterChangeException - " + description, spec);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Test combinations of input and output inclusion exclusion of the command
|
||||
* line that cause an exception to be thrown.
|
||||
*
|
||||
* @param useCsvFile whether to include the output csv file.
|
||||
* @param usePdfFile whether to include the output pdf file.
|
||||
* @param useBQSRFile whether to include the -BQSR input file.
|
||||
* @param useBeforeFile whether to include the -before input file.
|
||||
* @param useAfterFile whether to include the -after input file.
|
||||
* @throws IOException never thrown, unless there is a problem with the testing environment.
|
||||
*/
|
||||
@Test(enabled = true, dataProvider="alternativeInOutAbsenceCombinations")
|
||||
public void testInOutAbsenceException(final boolean useCsvFile, final boolean usePdfFile,
|
||||
final boolean useBQSRFile, final boolean useBeforeFile, final boolean useAfterFile)
|
||||
throws IOException {
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(buildCommandLine(useCsvFile,usePdfFile,
|
||||
useBQSRFile,useBeforeFile,useAfterFile),0,UserException.class);
|
||||
executeTest("testInOutAbsencePresenceException", spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test combinations of input and output inclusion exclusion of the
|
||||
* command line that won't cause an exception.
|
||||
*
|
||||
* @param useCsvFile whether to include the output csv file.
|
||||
* @param usePdfFile whether to include the output pdf file.
|
||||
* @param useBQSRFile whether to include the -BQSR input file.
|
||||
* @param useBeforeFile whether to include the -before input file.
|
||||
* @param useAfterFile whether to include the -after input file.
|
||||
* @throws IOException never thrown, unless there is a problem with the testing environment.
|
||||
*/
|
||||
@Test(enabled = true, dataProvider="alternativeInOutAbsenceCombinations")
|
||||
public void testInOutAbsence(final boolean useCsvFile, final boolean usePdfFile,
|
||||
final boolean useBQSRFile, final boolean useBeforeFile, final boolean useAfterFile)
|
||||
throws IOException {
|
||||
final List<String> md5 = Collections.emptyList();
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(buildCommandLine(useCsvFile,usePdfFile,
|
||||
useBQSRFile,useBeforeFile,useAfterFile),md5);
|
||||
executeTest("testInOutAbsencePresence", spec);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@DataProvider
|
||||
public Iterator<Object[]> alternativeInOutAbsenceCombinations(Method m) {
|
||||
List<Object[]> result = new LinkedList<Object[]>();
|
||||
if (m.getName().endsWith("Exception")) {
|
||||
result.add(new Object[] { false, false, true, true, true });
|
||||
result.add(new Object[] { true, true, false, false ,false});
|
||||
}
|
||||
else {
|
||||
result.add(new Object[] { true, true, true, false, false });
|
||||
result.add(new Object[] { true, true, false, true, false });
|
||||
result.add(new Object[] { true, true, false, false, true });
|
||||
result.add(new Object[] { true, false,false, true, false });
|
||||
result.add(new Object[] { false, true, true, false, false });
|
||||
|
||||
}
|
||||
return result.iterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Provide recalibration parameter change data to relevant tests.
|
||||
* @param m target test method.
|
||||
* @return never <code>null</code>.
|
||||
*/
|
||||
@DataProvider
|
||||
public Iterator<Object[]> alternativeAfterFileProvider (Method m) {
|
||||
final boolean expectsException = m.getName().endsWith("Exception");
|
||||
final List<Object[]> result = new LinkedList<Object[]>();
|
||||
for (final Object[] data : DIFFERENT_PARAMETERS_AFTER_FILES) {
|
||||
if (data[1].equals(expectsException)) {
|
||||
result.add(new Object[] { data[0], data[2] });
|
||||
}
|
||||
}
|
||||
return result.iterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Triplets < after-grp-file, whether it should fail, what is different >
|
||||
*/
|
||||
private final Object[][] DIFFERENT_PARAMETERS_AFTER_FILES = {
|
||||
{"after-cov.table", true, "Adds additional covariate: repeat-length" },
|
||||
{"after-dpSOLID.table", true, "Change the default platform to SOLID" },
|
||||
{"after-noDp.table",true, "Unset the default platform" },
|
||||
{"after-mcs4.table", true, "Changed -mcs parameter from 2 to 4" }
|
||||
};
|
||||
|
||||
/**
|
||||
* Build the AC command line given what combinations of input and output files should be included.
|
||||
*
|
||||
* @param useCsvFile whether to include the output csv file.
|
||||
* @param usePdfFile whether to include the output pdf file.
|
||||
* @param useBQSRFile whether to include the -BQSR input file.
|
||||
* @param useBeforeFile whether to include the -before input file.
|
||||
* @param useAfterFile whether to include the -after input file.
|
||||
* @return never <code>null</code>.
|
||||
* @throws IOException never thrown, unless there is a problem with the testing environment.
|
||||
*/
|
||||
private String buildCommandLine(final boolean useCsvFile, final boolean usePdfFile,
|
||||
final boolean useBQSRFile, final boolean useBeforeFile, final boolean useAfterFile)
|
||||
throws IOException {
|
||||
|
||||
final File csvFile = useCsvFile ? File.createTempFile("ACTest",".csv") : null;
|
||||
final File pdfFile = usePdfFile ? File.createTempFile("ACTest",".pdf") : null;
|
||||
|
||||
if (csvFile != null) {
|
||||
csvFile.deleteOnExit();
|
||||
}
|
||||
|
||||
if (pdfFile != null) {
|
||||
pdfFile.deleteOnExit();
|
||||
}
|
||||
|
||||
return buildCommandLine(csvFile == null ? null : csvFile.toString(),
|
||||
pdfFile == null ? null : pdfFile.toString(),
|
||||
useBQSRFile,useBeforeFile,useAfterFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the AC command line given the output file names explicitly and what test input files to use.
|
||||
* <p/>
|
||||
*
|
||||
* @param csvFileName the csv output file, <code>null</code> if none should be provided.
|
||||
* @param pdfFileName the plots output file, <code>null</code> if none should be provided.
|
||||
* @param useBQSRFile whether to include the -BQSR input file.
|
||||
* @param useBeforeFile whether to include the -before input file.
|
||||
* @param useAfterFile whether to include the -after input file.
|
||||
*
|
||||
* @return never <code>null</code>.
|
||||
*/
|
||||
private String buildCommandLine(final String csvFileName, final String pdfFileName, final boolean useBQSRFile,
|
||||
final boolean useBeforeFile, final boolean useAfterFile) {
|
||||
return buildCommandLine(csvFileName,pdfFileName,useBQSRFile ? BQSR_FILE : null,
|
||||
useBeforeFile ? BEFORE_FILE : null,
|
||||
useAfterFile ? AFTER_FILE : null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the AC command line given the output file names and the after file name explicitly and what other
|
||||
* test input files to use.
|
||||
* <p/>
|
||||
*
|
||||
* @param csvFileName the csv output file, <code>null</code> if none should be provided.
|
||||
* @param pdfFileName the plots output file, <code>null</code> if none should be provided.
|
||||
* @param useBQSRFile whether to include the -BQSR input file.
|
||||
* @param useBeforeFile whether to include the -before input file.
|
||||
* @param afterFile the after input report file, <code>null</code> if none should be provided.
|
||||
*
|
||||
* @return never <code>null</code>.
|
||||
*/
|
||||
private String buildCommandLine(final String csvFileName, final String pdfFileName, final boolean useBQSRFile,
|
||||
final boolean useBeforeFile, final File afterFile) {
|
||||
return buildCommandLine(csvFileName,pdfFileName,useBQSRFile ? BQSR_FILE : null,
|
||||
useBeforeFile ? BEFORE_FILE : null,
|
||||
afterFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the AC command line given the output file names and the after file name explicitly and what other
|
||||
* test input files to use.
|
||||
* <p/>
|
||||
*
|
||||
* @param csvFileName the csv output file, <code>null</code> if none should be provided.
|
||||
* @param pdfFileName the plots output file, <code>null</code> if none should be provided.
|
||||
* @param bqsrFile the BQSR input report file, <code>null</code> if none should be provided.
|
||||
* @param beforeFile the before input report file, <code>null</code> if non should be provided.
|
||||
* @param afterFile the after input report file, <code>null</code> if none should be provided.
|
||||
*
|
||||
* @return never <code>null</code>.
|
||||
*/
|
||||
private String buildCommandLine(final String csvFileName, final String pdfFileName, final File bqsrFile,
|
||||
final File beforeFile, final File afterFile) {
|
||||
|
||||
final List<String> args = new LinkedList<String>();
|
||||
args.add("-T");
|
||||
args.add(TOOL_NAME);
|
||||
args.add("-R");
|
||||
args.add(hg19Reference);
|
||||
args.add("-ignoreLMT");
|
||||
|
||||
if (csvFileName != null) {
|
||||
args.add("-" + AnalyzeCovariates.CSV_ARG_SHORT_NAME);
|
||||
args.add("'" + csvFileName + "'");
|
||||
}
|
||||
if (pdfFileName != null) {
|
||||
args.add("-" + AnalyzeCovariates.PDF_ARG_SHORT_NAME);
|
||||
args.add("'" + pdfFileName + "'");
|
||||
}
|
||||
if (bqsrFile != null) {
|
||||
args.add("-BQSR");
|
||||
args.add("'" + bqsrFile.getAbsoluteFile().toString() + "'");
|
||||
}
|
||||
if (beforeFile != null) {
|
||||
args.add("-" + AnalyzeCovariates.BEFORE_ARG_SHORT_NAME);
|
||||
args.add("'" + beforeFile.getAbsolutePath().toString() + "'");
|
||||
}
|
||||
if (afterFile != null) {
|
||||
args.add("-" + AnalyzeCovariates.AFTER_ARG_SHORT_NAME);
|
||||
args.add("'" + afterFile.getAbsolutePath().toString() + "'");
|
||||
}
|
||||
return Utils.join(" ", args);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -100,23 +100,23 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
@DataProvider(name = "BQSRTest")
|
||||
public Object[][] createBQSRTestData() {
|
||||
return new Object[][]{
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "61fd466b5e94d2d67e116f6f67c9f939")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "e08b5bcdb64f4beea03730e5631a14ca")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "448a45dc154c95d1387cb5cdddb67071")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "c1e7999e445d51bbe2e775dac5325643")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "a57c16918cdfe12d55a89c21bf195279")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "836dccacf48ccda6b2843d07e8f1ef4d")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "0fb2aedc2f8d66b5821cb570f15a8c4d")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "c9953f020a65c1603a6d71aeeb1b95f3")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "85a120b7d86b61597b86b9e93decbdfc")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "5248dc49aec0323c74b496bb4928c73c")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "cb52f267e0010f849f50b0bf1de474a1")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "1425a5063ee757dbfc013df24e65a67a")},
|
||||
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "c1c3cda8caceed619d3d439c3990cd26")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "c9953f020a65c1603a6d71aeeb1b95f3")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "5bfff0c699345cca12a9b33acf95588f")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "f805a0020eea987b79f314fa99913806")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "86075d3856eb06816a0dd81af55e421f")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "155802237e1fc7a001398b8f4bcf4b72")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "38c7916cc019fe8d134df67639422b42")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "b74e75f3c5aa90bd21af1e20f2ac8c40")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "e564505aea11464de8ed72890d9ea89a")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "380d8be121ffaddd3461ee0ac3d1a76f")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "0b5a8e259e997e4c7b5836d4c28e6f4d")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "281682124584ab384f23359934df0c3b")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "0a92fdff5fd26227c29d34eda5a32f49")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "90d8c24077e8ae9a0037a9aad5f09e31")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "c41ef02c640ef1fed4bfc03b9b33b616")},
|
||||
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "b577cd1d529425f66db49620db09fdca")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "0b5a8e259e997e4c7b5836d4c28e6f4d")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "9ad49269c0156f8ab1173261bf23e600")},
|
||||
// make sure we work with ION torrent bam
|
||||
{new BQSRTest(b37KGReference, privateTestDir + "iontorrent.bam", "20:10,000,000-10,200,000", "", "7375c7b692e76b651c278a9fb478fa1c")},
|
||||
{new BQSRTest(b37KGReference, privateTestDir + "iontorrent.bam", "20:10,000,000-10,200,000", "", "04bfa4760767022e7f5252e6e4432cc1")},
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -141,22 +141,6 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
executeTest("testBQSRFailWithoutDBSNP", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBQSRCSV() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
" -T BaseRecalibrator" +
|
||||
" -R " + b36KGReference +
|
||||
" -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" +
|
||||
" -knownSites " + b36dbSNP129 +
|
||||
" -L 1:10,000,000-10,200,000" +
|
||||
" -o /dev/null" +
|
||||
" -sortAllCols" +
|
||||
" --plot_pdf_file /dev/null" +
|
||||
" --intermediate_csv_file %s",
|
||||
Arrays.asList("90ad19143024684e3c4410dc8fd2bd9d"));
|
||||
executeTest("testBQSR-CSVfile", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBQSRFailWithSolidNoCall() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -260,7 +260,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
public void testDivideByZero() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
|
||||
// we expect to lose coverage due to the downsampling so don't run the systematic tests
|
||||
executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("c459a6153a17c2cbf8441e1918fda9c8")));
|
||||
executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("4f0ef477c0417d1eb602b323474ef377")));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -89,6 +89,38 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
return variantRegionBitset;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
//// Test for leading softclips immediately followed by an insertion in the CIGAR ////
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLeadingSoftClipThenInsertion() {
|
||||
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 10);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', 10));
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, 10));
|
||||
read.setMappingQuality(30);
|
||||
read.setCigarString("2S2I6M");
|
||||
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 1);
|
||||
slidingWindow.addRead(read);
|
||||
slidingWindow.close(null);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLeadingHardClipThenInsertion() {
|
||||
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 8);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', 8));
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, 8));
|
||||
read.setMappingQuality(30);
|
||||
read.setCigarString("2H2I6M");
|
||||
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
slidingWindow.addRead(read);
|
||||
slidingWindow.close(null);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
//// This section tests the findVariantRegions() method and related functionality ////
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
@ -221,6 +253,33 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
return count;
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMarkingRegionInCancerMode() {
|
||||
|
||||
final int contextSize = 10;
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, contextSize, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
slidingWindow.addRead(createSimpleRead("1", 0, 34, 75));
|
||||
slidingWindow.addRead(createSimpleRead("2", 0, 97, 73));
|
||||
slidingWindow.addRead(createSimpleRead("3", 0, 98, 75));
|
||||
slidingWindow.addRead(createSimpleRead("4", 0, 98, 75));
|
||||
slidingWindow.addRead(createSimpleRead("5", 0, 98, 75));
|
||||
|
||||
final CompressionStash regions = new CompressionStash();
|
||||
regions.add(new FinishedGenomeLoc("1", 0, 89, 109, true));
|
||||
|
||||
slidingWindow.closeVariantRegions(regions, null, false);
|
||||
Assert.assertEquals(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet().length, 76 + contextSize);
|
||||
}
|
||||
|
||||
private GATKSAMRecord createSimpleRead(final String name, final int refIndex, final int alignmentStart, final int length) {
|
||||
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', length));
|
||||
read.setBaseQualities(Utils.dupBytes((byte) 30, length));
|
||||
read.setMappingQuality(60);
|
||||
return read;
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
//// This section tests the consensus creation functionality ////
|
||||
|
|
|
|||
|
|
@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testSingleSample() {
|
||||
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "850304909477afa8c2a8f128d6eedde9");
|
||||
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "1771e95aed2b3b240dc353f84e19847d");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultiSample() {
|
||||
DTTest("testMultiSample ", "-I " + multiSample, "bedd19bcf21d1a779f6706c0351c9d26");
|
||||
DTTest("testMultiSample ", "-I " + multiSample, "c7f1691dbe5f121c4a79be823d3057e5");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -74,11 +74,11 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_GGA_Pools() {
|
||||
executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "3f7d763c654f1d708323f369ea4a099b");
|
||||
executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "ceb105e3db0f2b993e3d725b0d60b6a3");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
|
||||
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "66a5a3eb657fac5c621bc0c228ea9caf");
|
||||
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "98f4d78aad745c6e853b81b2e4e207b4");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
|
||||
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","5eabc12fc7b4f9749e6d1be0f5b45d14");
|
||||
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","25902d7a6a0c00c60c2d5845dfaa1a4c");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
|
|
|
|||
|
|
@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("19f77f557150905ef3fa4713f611a1b9"));
|
||||
Arrays.asList("ef8151aa699da3272c1ae0986d16ca21"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX"), spec);
|
||||
}
|
||||
|
|
@ -88,7 +88,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
" -minIndelCnt 1" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("d9572a227ccb13a6baa6dc4fb65bc1e5"));
|
||||
Arrays.asList("7f88229ccefb74513efb199b61183cb8"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
|
||||
}
|
||||
|
|
@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("bb3dbad9666ebf38d338f0c9c211a42e"));
|
||||
Arrays.asList("1928ad48bcd0ca180e046bc235cfb3f4"));
|
||||
|
||||
executeTest(String.format("test indel calling, multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -111,7 +111,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("8052390ca2b6a57c3ddf379a51225d64"));
|
||||
Arrays.asList("6663e434a7b549f23bfd52db90e53a1a"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
|
||||
}
|
||||
|
||||
|
|
@ -121,7 +121,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
|
||||
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("b6b9dba97fbabaeeb458a41051983e7b"));
|
||||
Arrays.asList("581c552664e536df6d0f102fb0d10e5a"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
|
||||
}
|
||||
|
||||
|
|
@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
|
||||
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
|
||||
Arrays.asList("38730c7030271f5d0ca0b59365d57814"));
|
||||
Arrays.asList("5596851d19582dd1af3901b7d703ae0a"));
|
||||
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
public void testMinIndelFraction0() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
|
||||
Arrays.asList("264325878b988acc11d8e5d9d2ba0b7f"));
|
||||
Arrays.asList("862d82c8aa35f1da4f9e67b5b48dfe52"));
|
||||
executeTest("test minIndelFraction 0.0", spec);
|
||||
}
|
||||
|
||||
|
|
@ -184,7 +184,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
public void testMinIndelFraction25() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
|
||||
Arrays.asList("98abcfb0a008050eba8b9c285a25b2a0"));
|
||||
Arrays.asList("8d9fc96be07db791737ac18135de4d63"));
|
||||
executeTest("test minIndelFraction 0.25", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -46,11 +46,15 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
|
||||
|
|
@ -156,6 +160,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void emitPLsAtAllSites() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1,
|
||||
Arrays.asList("7cc55db8693759e059a05bc4398f6f69"));
|
||||
executeTest("test all site PLs 1", spec1);
|
||||
|
||||
}
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing heterozygosity
|
||||
|
|
@ -288,9 +300,24 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testNsInCigar() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1,
|
||||
Arrays.asList("2ae3fd39c53a6954d32faed8703adfe8"));
|
||||
UserException.UnsupportedCigarOperatorException.class);
|
||||
|
||||
executeTest("test calling on reads with Ns in CIGAR", spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testCompressedVCFOutputWithNT() throws Exception {
|
||||
WalkerTestSpec spec = new WalkerTestSpec("-T UnifiedGenotyper -R " + b37KGReference + " -I "
|
||||
+ privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"
|
||||
+ " -o %s -L 20:10,000,000-10,100,000 -nt 4",
|
||||
1, Arrays.asList("vcf.gz"), Arrays.asList(""));
|
||||
final File vcf = executeTest("testCompressedVCFOutputWithNT", spec).first.get(0);
|
||||
final AsciiLineReader reader = new AsciiLineReader(new BlockCompressedInputStream(vcf));
|
||||
int nLines = 0;
|
||||
while ( reader.readLine() != null )
|
||||
nLines++;
|
||||
Assert.assertTrue(nLines > 0);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testMultiSamplePilot1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
|
||||
Arrays.asList("5e8f1fa88dc93320cc0e75e9fe6e153b"));
|
||||
Arrays.asList("a9466c1e3ce1fc4bac83086b25a6df54"));
|
||||
executeTest("test MultiSample Pilot1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -80,7 +80,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testWithAllelesPassedIn2() {
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
|
||||
Arrays.asList("698e54aeae3130779d246b9480a4052c"));
|
||||
Arrays.asList("3e646003c5b93da80c7d8e5d0ff2ee4e"));
|
||||
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("1ab95513a3abb5b760578831c61ef94b"));
|
||||
Arrays.asList("06c85e8eab08b67244cf38fc785aca22"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -112,7 +112,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testReverseTrim() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
|
||||
Arrays.asList("314b99eb146de1fdafed872ecbe1cfc2"));
|
||||
Arrays.asList("f3da1ff1e49a831af055ca52d6d07dd7"));
|
||||
executeTest("test reverse trim", spec);
|
||||
}
|
||||
|
||||
|
|
@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testMismatchedPLs() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
|
||||
Arrays.asList("94bfccbd06043e90ae1b1c66fc3afe07"));
|
||||
Arrays.asList("20ff311f363c51b7385a76f6f296759c"));
|
||||
executeTest("test mismatched PLs", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -63,18 +63,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
|
|||
public void testReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("e6565060b44a7804935973efcd56e596"));
|
||||
Arrays.asList("ffde0d5e23523e4bd9e7e18f62d37d0f"));
|
||||
executeTest("test calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamSNPs() {
|
||||
testReducedCalling("SNP", "ab776d74c41ce2b859e2b2466a76204a");
|
||||
testReducedCalling("SNP", "e8de8c523751ad2fa2ee20185ba5dea7");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamINDELs() {
|
||||
testReducedCalling("INDEL", "19bc6a74250ec19efc4e1b4ee6515ac0");
|
||||
testReducedCalling("INDEL", "4b4902327fb132f9aaab3dd5ace934e1");
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -73,8 +73,8 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
|
|||
public void testReferenceCycleGraph() {
|
||||
String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC";
|
||||
String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC";
|
||||
final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true));
|
||||
final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true));
|
||||
final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(refCycle.getBytes(), true), Collections.<Haplotype>emptyList());
|
||||
final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList<GATKSAMRecord>(), 10, new Haplotype(noCycle.getBytes(), true), Collections.<Haplotype>emptyList());
|
||||
|
||||
Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation.");
|
||||
Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation.");
|
||||
|
|
@ -153,4 +153,47 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
|
|||
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "AddGGAKmersToGraph")
|
||||
public Object[][] makeAddGGAKmersToGraphData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||
final String bases = "ACGTAACCGGTTAAACCCGGGTTT";
|
||||
final int readLen = bases.length();
|
||||
final List<Integer> allBadStarts = new ArrayList<Integer>(readLen);
|
||||
for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i);
|
||||
|
||||
for ( final int kmerSize : Arrays.asList(3, 4, 5) ) {
|
||||
tests.add(new Object[]{bases, kmerSize});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "AddGGAKmersToGraph", enabled = ! DEBUG)
|
||||
public void testAddGGAKmersToGraph(final String bases, final int kmerSize) {
|
||||
final int readLen = bases.length();
|
||||
final DeBruijnAssembler assembler = new DeBruijnAssembler();
|
||||
final MockBuilder builder = new MockBuilder(kmerSize);
|
||||
|
||||
final Set<String> expectedBases = new HashSet<String>();
|
||||
final Set<Integer> expectedStarts = new LinkedHashSet<Integer>();
|
||||
for ( int i = 0; i < readLen; i++) {
|
||||
boolean good = true;
|
||||
for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing
|
||||
good &= i + j < readLen;
|
||||
}
|
||||
if ( good ) {
|
||||
expectedStarts.add(i);
|
||||
expectedBases.add(bases.substring(i, i + kmerSize + 1));
|
||||
}
|
||||
}
|
||||
|
||||
assembler.addGGAKmersToGraph(builder, Arrays.asList(new Haplotype(bases.getBytes())));
|
||||
Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
|
||||
for ( final Kmer addedKmer : builder.addedPairs ) {
|
||||
Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleComplex1() {
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fc11b553fbf16beac0da04a69f419365");
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "12ed9d67139e7a94d67e9e6c06ac6e16");
|
||||
}
|
||||
|
||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||
|
|
@ -80,7 +80,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
}
|
||||
|
||||
private void HCTestComplexGGA(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec);
|
||||
}
|
||||
|
|
@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAComplex() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
|
||||
"90cbcc7e959eb591fb7c5e12d65e0e40");
|
||||
"b7a01525c00d02b3373513a668a43c6a");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||
"50894abb9d156bf480881cb5cb2a8a7d");
|
||||
"a2a42055b068334f415efb07d6bb9acd");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,88 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
|
||||
@DataProvider(name = "MyDataProvider")
|
||||
public Object[][] makeMyDataProvider() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000";
|
||||
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
|
||||
|
||||
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "2b54e4e948144030a829175bcd295e47"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ba1bb72caa06c1962a202b2012c266cb"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a841d9e94fb832066a04f13bdc62b101"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "6cc95c47368a568fb9e1eb8578f96b0b"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2703f1c0c27b3c977689604b5f78b61f"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b54e36bbb4dc6c3b786349fa267d1f6c"});
|
||||
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
/**
|
||||
* Example testng test using MyDataProvider
|
||||
*/
|
||||
@Test(dataProvider = "MyDataProvider")
|
||||
public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) {
|
||||
final String commandLine = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s %s -ERC %s --no_cmdline_in_header",
|
||||
b37KGReference, bam, intervals, mode);
|
||||
final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode;
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5));
|
||||
executeTest(name, spec);
|
||||
}
|
||||
}
|
||||
|
|
@ -47,15 +47,12 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broad.tribble.TribbleIndexedFeatureReader;
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -69,6 +66,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
final static String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam";
|
||||
final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
|
||||
final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam";
|
||||
final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam";
|
||||
final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam";
|
||||
final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
|
||||
|
||||
|
|
@ -80,12 +78,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "37e462379de17bc6c8aeeed6e9735dd3");
|
||||
HCTest(CEUTRIO_BAM, "", "baa5a2eedc8f06ce9f8f98411ee09f8a");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "983a0d122714d4aa0ff7af20cc686703");
|
||||
HCTest(NA12878_BAM, "", "f09e03d41238697b23f95716a12667cb");
|
||||
}
|
||||
|
||||
@Test(enabled = false) // can't annotate the rsID's yet
|
||||
|
|
@ -95,13 +93,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
||||
"dbbc884a975587d8e7255ce47b58f438");
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
||||
"130d36448faeb7b8d4bce4be12dacd3a");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerInsertionOnEdgeOfContig() {
|
||||
HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae");
|
||||
HCTest(CEUTRIO_MT_TEST_BAM, "-L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae");
|
||||
}
|
||||
|
||||
private void HCTestIndelQualityScores(String bam, String args, String md5) {
|
||||
|
|
@ -112,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "ce602282e80cca6d4272f940e20e90c3");
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "7c20aa62633f4ce8ebf12950fbf05ec0");
|
||||
}
|
||||
|
||||
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
|
||||
|
|
@ -149,7 +147,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerNearbySmallIntervals() {
|
||||
HCTestNearbySmallIntervals(NA12878_BAM, "", "09335c01d2e90714af7f4c91156da0b1");
|
||||
HCTestNearbySmallIntervals(NA12878_BAM, "", "0ddc56f0a0fbcfefda79aa20b2ecf603");
|
||||
}
|
||||
|
||||
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
|
||||
|
|
@ -159,14 +157,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b34ddc93a7b9919e05da499508f44dd9"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestStructuralIndels() {
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("98a78b9f58ab197b827ef2ce3ab043d3"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320"));
|
||||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
||||
|
|
@ -188,7 +186,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("6e6ef6e0326bee6d20d9fd37349fdb8c"));
|
||||
Arrays.asList("5fe9310addf881bed4fde2354e59ce34"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -196,7 +194,30 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void testReducedBamWithReadsNotFullySpanningDeletion() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
|
||||
Arrays.asList("5e535983b2f7e5fb6c84fecffa092324"));
|
||||
Arrays.asList("26a9917f6707536636451266de0116c3"));
|
||||
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// test dbSNP annotation
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
public void HCTestDBSNPAnnotationWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
|
||||
Arrays.asList("c5c63d03e1c4bbe32f06902acd4a10f9"));
|
||||
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestDBSNPAnnotationWEx() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
|
||||
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
|
||||
Arrays.asList("f0b2a96040429908cce17327442eec29"));
|
||||
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
|
|||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final int nct : Arrays.asList(1, 2, 4) ) {
|
||||
tests.add(new Object[]{nct, "c277fd65365d59b734260dd8423313bb"});
|
||||
tests.add(new Object[]{nct, "e800f6bb3a820da5c6b29f0195480796"});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
|
|
|
|||
|
|
@ -47,13 +47,12 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
public class KmerUnitTest extends BaseTest {
|
||||
@DataProvider(name = "KMerCreationData")
|
||||
|
|
@ -130,4 +129,40 @@ public class KmerUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDifferingPositions() {
|
||||
final String bases = "ACGTCAGACGTACGTTTGACGTCAGACGTACGT";
|
||||
final Kmer baseKmer = new Kmer(bases.getBytes());
|
||||
|
||||
|
||||
final int NUM_TEST_CASES = 30;
|
||||
|
||||
for (int test = 0; test < NUM_TEST_CASES; test++) {
|
||||
|
||||
final int numBasesToChange = test % bases.length();
|
||||
|
||||
// changes numBasesToChange bases - spread regularly through read string
|
||||
final int step = (numBasesToChange > 0?Math.min(bases.length() / numBasesToChange,1) : 1);
|
||||
|
||||
final byte[] newBases = bases.getBytes().clone();
|
||||
int actualChangedBases =0; // could be different from numBasesToChange due to roundoff
|
||||
for (int idx=0; idx < numBasesToChange; idx+=step) {
|
||||
// now change given positions
|
||||
newBases[idx] = (newBases[idx] == (byte)'A'? (byte)'T':(byte)'A');
|
||||
actualChangedBases++;
|
||||
}
|
||||
|
||||
// compute changed positions
|
||||
final int[] differingIndices = new int[newBases.length];
|
||||
final byte[] differingBases = new byte[newBases.length];
|
||||
final int numDiffs = baseKmer.getDifferingPositions(new Kmer(newBases),newBases.length,differingIndices,differingBases);
|
||||
Assert.assertEquals(numDiffs,actualChangedBases);
|
||||
for (int k=0; k < numDiffs; k++) {
|
||||
final int idx = differingIndices[k];
|
||||
Assert.assertTrue(newBases[idx] != bases.getBytes()[idx]);
|
||||
Assert.assertEquals(differingBases[idx],newBases[idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,6 +47,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
|
||||
|
|
@ -216,11 +219,15 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
|
|||
|
||||
private List<Haplotype> assemble(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final List<GATKSAMRecord> reads) {
|
||||
final Haplotype refHaplotype = new Haplotype(refBases, true);
|
||||
final Cigar c = new Cigar();
|
||||
c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
|
||||
refHaplotype.setCigar(c);
|
||||
|
||||
final ActiveRegion activeRegion = new ActiveRegion(loc, null, true, genomeLocParser, 0);
|
||||
activeRegion.addAll(reads);
|
||||
final LocalAssemblyEngine engine = createAssembler(assembler);
|
||||
// logger.warn("Assembling " + activeRegion + " with " + engine);
|
||||
return engine.runLocalAssembly(activeRegion, refHaplotype, refBases, loc, Collections.<VariantContext>emptyList());
|
||||
return engine.runLocalAssembly(activeRegion, refHaplotype, refBases, loc, Collections.<VariantContext>emptyList(), null);
|
||||
}
|
||||
|
||||
@DataProvider(name = "SimpleAssemblyTestData")
|
||||
|
|
@ -244,7 +251,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
|
|||
for ( int snpPos = 0; snpPos < windowSize; snpPos++) {
|
||||
if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) {
|
||||
final byte[] altBases = ref.getBytes();
|
||||
altBases[snpPos] = 'N';
|
||||
altBases[snpPos] = altBases[snpPos] == 'A' ? (byte)'C' : (byte)'A';
|
||||
final String alt = new String(altBases);
|
||||
tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,190 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class ReadErrorCorrectorUnitTest {
|
||||
private static final boolean debug = true;
|
||||
final String refChunk = "GCATAAACATGGCTCACTGC";
|
||||
final String refChunkHard = "AGCCTTGAACTCCTGGGCTCAAGTGATCCTCCTGCCTCAGTTTCCCATGTAGCTGGGACCACAGGTGGGGGCTCCACCCCTGGCTGATTTTTTTTTTTTTTTTTTTTTGAGATAGGGT";
|
||||
|
||||
@Test
|
||||
public void TestBasicCorrectionSet() {
|
||||
|
||||
final byte[] trueBases = refChunk.getBytes();
|
||||
final int numCorrections = 50;
|
||||
final ReadErrorCorrector.CorrectionSet correctionSet = new ReadErrorCorrector.CorrectionSet(trueBases.length);
|
||||
|
||||
int offset = 2;
|
||||
for (int k=0; k < numCorrections; k++) {
|
||||
// introduce one correction at a random offset in array. To make testing easier, we will replicate corrrection
|
||||
final byte base = trueBases[offset];
|
||||
correctionSet.add(offset, base);
|
||||
// skip to some other offset
|
||||
offset += 7;
|
||||
if (offset >= trueBases.length)
|
||||
offset -= trueBases.length;
|
||||
}
|
||||
|
||||
for (int k=0; k < trueBases.length; k++) {
|
||||
final byte corr = correctionSet.getConsensusCorrection(k);
|
||||
Assert.assertEquals(corr, trueBases[k]);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestExtendedCorrectionSet() {
|
||||
|
||||
final byte[] trueBases = refChunk.getBytes();
|
||||
final int numCorrections = 50;
|
||||
final ReadErrorCorrector.CorrectionSet correctionSet = new ReadErrorCorrector.CorrectionSet(trueBases.length);
|
||||
|
||||
for (int offset=0; offset < trueBases.length; offset++) {
|
||||
// insert k corrections at offset k and make sure we get exactly k bases back
|
||||
for (int k=0; k < offset; k++)
|
||||
correctionSet.add(offset,trueBases[offset]);
|
||||
|
||||
}
|
||||
|
||||
for (int offset=0; offset < trueBases.length; offset++) {
|
||||
Assert.assertEquals(correctionSet.get(offset).size(),offset);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestAddReadsToKmers() {
|
||||
final int NUM_GOOD_READS = 500;
|
||||
|
||||
final String bases = "AAAAAAAAAAAAAAA";
|
||||
final int READ_LENGTH = bases.length();
|
||||
final int kmerLengthForReadErrorCorrection = READ_LENGTH;
|
||||
final List<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>(NUM_GOOD_READS);
|
||||
int offset = 0;
|
||||
final byte[] quals = new byte[READ_LENGTH];
|
||||
|
||||
Arrays.fill(quals,(byte)30);
|
||||
|
||||
for (int k=0; k < NUM_GOOD_READS; k++) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases.getBytes(), quals,READ_LENGTH+"M");
|
||||
finalizedReadList.add(read);
|
||||
}
|
||||
|
||||
ReadErrorCorrector readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection,(byte)6,10, debug,refChunkHard.getBytes());
|
||||
readErrorCorrector.addReadsToKmers(finalizedReadList);
|
||||
|
||||
// special trivial case: kmer length is equal to read length.
|
||||
// K-mer counter should hold then exactly one kmer
|
||||
Assert.assertEquals(readErrorCorrector.countsByKMer.getCountedKmers().size(), 1);
|
||||
for (final KMerCounter.CountedKmer kmer : readErrorCorrector.countsByKMer.getCountedKmers()) {
|
||||
Assert.assertTrue(Arrays.equals( kmer.getKmer().bases(),bases.getBytes()));
|
||||
Assert.assertEquals(kmer.getCount(),NUM_GOOD_READS);
|
||||
}
|
||||
|
||||
// special case 2: kmers are all the same but length < read length.
|
||||
// Each kmer is added then readLength-kmerLength+1 times
|
||||
final int KMER_LENGTH = 10;
|
||||
readErrorCorrector = new ReadErrorCorrector(KMER_LENGTH,(byte)6,10, debug,refChunkHard.getBytes());
|
||||
readErrorCorrector.addReadsToKmers(finalizedReadList);
|
||||
Assert.assertEquals(readErrorCorrector.countsByKMer.getCountedKmers().size(), 1);
|
||||
for (final KMerCounter.CountedKmer kmer : readErrorCorrector.countsByKMer.getCountedKmers()) {
|
||||
Assert.assertEquals(kmer.getCount(),NUM_GOOD_READS*(READ_LENGTH-KMER_LENGTH+1));
|
||||
}
|
||||
|
||||
}
|
||||
@Test
|
||||
public void TestBasicErrorCorrection() {
|
||||
final int NUM_GOOD_READS = 500;
|
||||
final int NUM_BAD_READS = 10;
|
||||
final int READ_LENGTH = 15;
|
||||
final int kmerLengthForReadErrorCorrection = 10;
|
||||
final List<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>(NUM_GOOD_READS);
|
||||
int offset = 0;
|
||||
final byte[] quals = new byte[READ_LENGTH];
|
||||
|
||||
Arrays.fill(quals,(byte)30);
|
||||
|
||||
for (int k=0; k < NUM_GOOD_READS; k++) {
|
||||
final byte[] bases = Arrays.copyOfRange(refChunk.getBytes(),offset,offset+READ_LENGTH);
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals,READ_LENGTH+"M");
|
||||
finalizedReadList.add(read);
|
||||
offset++;
|
||||
if (offset >= refChunk.length()-READ_LENGTH)
|
||||
offset = 0;
|
||||
}
|
||||
offset = 2;
|
||||
// coverage profile is now perfectly triangular with "good" bases. Inject now bad bases with errors in them.
|
||||
for (int k=0; k < NUM_BAD_READS; k++) {
|
||||
final byte[] bases = finalizedReadList.get(k).getReadBases().clone();
|
||||
bases[offset] = 'N';
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, READ_LENGTH + "M");
|
||||
finalizedReadList.add(read);
|
||||
offset += 7;
|
||||
if (offset >= READ_LENGTH)
|
||||
offset = 4; // just some randomly circulating offset for error position
|
||||
}
|
||||
|
||||
// now correct all reads
|
||||
final ReadErrorCorrector readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection,(byte)6,10, debug,refChunkHard.getBytes());
|
||||
readErrorCorrector.addReadsToKmers(finalizedReadList);
|
||||
readErrorCorrector.correctReads(finalizedReadList);
|
||||
|
||||
// check that corrected reads have exactly same content as original reads
|
||||
for (int k=0; k < NUM_BAD_READS; k++) {
|
||||
final byte[] badBases = finalizedReadList.get(k).getReadBases();
|
||||
final byte[] originalBases = finalizedReadList.get(k).getReadBases();
|
||||
Assert.assertTrue(Arrays.equals(badBases,originalBases));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,408 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypeType;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class ReferenceConfidenceModelUnitTest extends BaseTest {
|
||||
GenomeLocParser parser;
|
||||
final String RGID = "ID1";
|
||||
GATKSAMReadGroupRecord rg;
|
||||
final String sample = "NA12878";
|
||||
final Set<String> samples = Collections.singleton(sample);
|
||||
SAMFileHeader header;
|
||||
ReferenceConfidenceModel model;
|
||||
|
||||
@BeforeClass
|
||||
public void setUp() throws Exception {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
|
||||
rg = new GATKSAMReadGroupRecord(RGID);
|
||||
rg.setSample(sample);
|
||||
header.addReadGroup(rg);
|
||||
parser = new GenomeLocParser(header.getSequenceDictionary());
|
||||
}
|
||||
|
||||
@BeforeMethod
|
||||
public void setupModel() {
|
||||
model = new ReferenceConfidenceModel(parser, samples, header, 10);
|
||||
}
|
||||
|
||||
@DataProvider(name = "CalcNIndelInformativeReadsData")
|
||||
public Object[][] makeMyDataProvider() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
{ // very basic testing
|
||||
final String ref = "ACGT";
|
||||
final String read = "ACGT";
|
||||
tests.add(new Object[]{read, ref, 1, Arrays.asList(1, 1, 1, 0)});
|
||||
tests.add(new Object[]{read, ref, 2, Arrays.asList(1, 1, 0, 0)});
|
||||
tests.add(new Object[]{read, ref, 3, Arrays.asList(1, 0, 0, 0)});
|
||||
tests.add(new Object[]{read, ref, 4, Arrays.asList(0, 0, 0, 0)});
|
||||
}
|
||||
|
||||
{ // actually interesting case where some sites aren't informative
|
||||
final String ref = "NNAAAANN";
|
||||
final String read1 = "NNA";
|
||||
final String read2 = "NNAA";
|
||||
final String read3 = "NNAAA";
|
||||
final String read4 = "NNAAAA";
|
||||
final String read5 = "NNAAAAN";
|
||||
tests.add(new Object[]{read1, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)});
|
||||
tests.add(new Object[]{read2, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)});
|
||||
tests.add(new Object[]{read3, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)});
|
||||
tests.add(new Object[]{read4, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)});
|
||||
tests.add(new Object[]{read5, ref, 1, Arrays.asList(1, 1, 1, 1, 1, 1, 0, 0)});
|
||||
}
|
||||
|
||||
{
|
||||
for ( final String repeatUnit : Arrays.asList("A", "CA", "TAG", "TAGC", "TCAGA")) {
|
||||
final String anchor = Utils.dupString("N", repeatUnit.length());
|
||||
for ( int nUnits = 1; nUnits < 10; nUnits++ ) {
|
||||
final String repeat = Utils.dupString(repeatUnit, nUnits);
|
||||
final String ref = anchor + repeat + anchor;
|
||||
for ( int readLen = repeatUnit.length(); readLen < repeat.length(); readLen++ ) {
|
||||
final String read = anchor + repeat.substring(0, readLen);
|
||||
final List<Integer> expected = new LinkedList<>();
|
||||
for ( int i = 0; i < anchor.length(); i++ ) expected.add(1);
|
||||
for ( int i = 0; i < repeat.length(); i++ ) expected.add(readLen == repeat.length() ? 1 : 0);
|
||||
for ( int i = 0; i < anchor.length(); i++ ) expected.add(0);
|
||||
tests.add(new Object[]{read, ref, repeatUnit.length(), expected});
|
||||
|
||||
final List<Integer> result = new ArrayList<>(Collections.nCopies(ref.length() - anchor.length(), 1));
|
||||
result.addAll(Collections.nCopies(anchor.length(), 0));
|
||||
tests.add(new Object[]{ref, ref, repeatUnit.length(), result});
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CalcNIndelInformativeReadsData")
|
||||
public void testCalcNIndelInformativeReads(final String readBases, final String ref, final int maxIndelSize, final List<Integer> expected ) {
|
||||
final byte qual = (byte)30;
|
||||
final byte[] quals = Utils.dupBytes(qual, readBases.length());
|
||||
|
||||
for ( int i = 0; i < readBases.getBytes().length; i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(), quals, readBases.length() + "M");
|
||||
final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, i, i);
|
||||
final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, Collections.singletonList(read), i);
|
||||
final int actual = model.calcNIndelInformativeReads(pileup, i, ref.getBytes(), maxIndelSize);
|
||||
Assert.assertEquals(actual, (int)expected.get(i), "failed at position " + i);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testClose() {
|
||||
model.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWorstGL() {
|
||||
final GenotypeLikelihoods gq10 = GenotypeLikelihoods.fromPLField("0,10,100");
|
||||
final GenotypeLikelihoods gq20 = GenotypeLikelihoods.fromPLField("0,20,200");
|
||||
final GenotypeLikelihoods gq0 = GenotypeLikelihoods.fromPLField("20,0,200");
|
||||
|
||||
Assert.assertSame(model.getGLwithWorstGQ(gq10, gq20), gq10);
|
||||
Assert.assertSame(model.getGLwithWorstGQ(gq20, gq10), gq10);
|
||||
Assert.assertSame(model.getGLwithWorstGQ(gq10, gq0), gq0);
|
||||
Assert.assertSame(model.getGLwithWorstGQ(gq0, gq10), gq0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIndelLikelihoods() {
|
||||
GenotypeLikelihoods prev = model.getIndelPLs(0);
|
||||
Assert.assertEquals(prev.getAsPLs(), new int[]{0, 0, 0});
|
||||
Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0);
|
||||
|
||||
for ( int i = 1; i < 10000; i++ ) {
|
||||
final GenotypeLikelihoods current = model.getIndelPLs(i);
|
||||
final double prevGQ = -10 * prev.getLog10GQ(GenotypeType.HOM_REF);
|
||||
final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF);
|
||||
Assert.assertTrue(prevGQ < currGQ, "GQ Failed with prev " + prev + " curr " + current + " at " + i);
|
||||
Assert.assertTrue(prev.getAsPLs()[1] < current.getAsPLs()[1], "het PL failed with prev " + prev + " curr " + current + " at " + i);
|
||||
Assert.assertTrue(prev.getAsPLs()[2] < current.getAsPLs()[2], "hom-var PL Failed with prev " + prev + " curr " + current + " at " + i);
|
||||
// logger.warn("result at " + i + " is " + current);
|
||||
prev = current;
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOverlappingVariantContext() {
|
||||
final VariantContext vc10 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 10, Arrays.asList("A", "C"));
|
||||
final VariantContext vc13 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 13, Arrays.asList("A", "C"));
|
||||
final VariantContext vc12_15 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 12, Arrays.asList("ACAT", "A"));
|
||||
final VariantContext vc18 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 18, Arrays.asList("A", "ACAT"));
|
||||
|
||||
final List<VariantContext> calls = Arrays.asList(vc13, vc12_15, vc18, vc10);
|
||||
|
||||
checkOverlapping(8, calls, null);
|
||||
checkOverlapping(9, calls, null);
|
||||
checkOverlapping(10, calls, vc10);
|
||||
checkOverlapping(11, calls, null);
|
||||
checkOverlapping(12, calls, vc12_15);
|
||||
checkOverlapping(13, calls, vc13);
|
||||
checkOverlapping(14, calls, vc12_15);
|
||||
checkOverlapping(15, calls, vc12_15);
|
||||
checkOverlapping(16, calls, null);
|
||||
checkOverlapping(17, calls, null);
|
||||
checkOverlapping(18, calls, vc18);
|
||||
checkOverlapping(19, calls, null);
|
||||
checkOverlapping(20, calls, null);
|
||||
}
|
||||
|
||||
private void checkOverlapping(final int pos, Collection<VariantContext> calls, final VariantContext expected) {
|
||||
final GenomeLoc loc = parser.createGenomeLoc(parser.getContigs().getSequences().get(0).getSequenceName(), pos, pos);
|
||||
final VariantContext actual = model.getOverlappingVariantContext(loc, calls);
|
||||
Assert.assertEquals(actual, expected);
|
||||
}
|
||||
|
||||
//
|
||||
// test reference calculation
|
||||
//
|
||||
private class RefConfData {
|
||||
final String ref;
|
||||
final int extension;
|
||||
final Haplotype refHap;
|
||||
final GenomeLoc refLoc, paddedRefLoc;
|
||||
final ActiveRegion region;
|
||||
int readCounter = 0;
|
||||
|
||||
private RefConfData(String ref, int extension) {
|
||||
this.ref = ref;
|
||||
this.extension = extension;
|
||||
|
||||
refLoc = parser.createGenomeLoc("chr1", getStart(), getEnd());
|
||||
paddedRefLoc = parser.createGenomeLoc("chr1", getStart() - extension, getEnd() + extension);
|
||||
region = new ActiveRegion(getRefLoc(), parser, extension);
|
||||
final String pad = Utils.dupString("N", extension);
|
||||
refHap = ReferenceConfidenceModel.createReferenceHaplotype(getActiveRegion(), (pad + ref + pad).getBytes(), getPaddedRefLoc());
|
||||
}
|
||||
|
||||
public GenomeLoc getRefLoc() { return refLoc; }
|
||||
public GenomeLoc getPaddedRefLoc() { return paddedRefLoc; }
|
||||
public ActiveRegion getActiveRegion() { return region; }
|
||||
public Haplotype getRefHap() { return refHap; }
|
||||
public int getStart() { return 100; }
|
||||
public int getEnd() { return getStart() + getRefLength() - 1; }
|
||||
public byte[] getRefBases() { return ref.getBytes(); }
|
||||
public int getRefLength() { return ref.length(); }
|
||||
|
||||
public GATKSAMRecord makeRead(final int start, final int length) {
|
||||
final byte[] quals = Utils.dupBytes((byte)30, length);
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read " + readCounter++, 0, start + getStart(), ref.substring(start, start + length).getBytes(), quals, length + "M");
|
||||
read.setReadGroup(rg);
|
||||
return read;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@DataProvider(name = "RefConfidenceData")
|
||||
public Object[][] makeRefConfidenceData() {
|
||||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
for ( int i = 0; i < 10; i++ ) {
|
||||
for ( final int extension : Arrays.asList(0, 10) ) {
|
||||
tests.add(new Object[]{i, extension});
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "RefConfidenceData")
|
||||
public void testRefConfidenceBasic(final int nReads, final int extension) {
|
||||
final RefConfData data = new RefConfData("ACGTAACCGGTT", extension);
|
||||
final List<Haplotype> haplotypes = Arrays.asList(data.getRefHap());
|
||||
final List<VariantContext> calls = Collections.emptyList();
|
||||
|
||||
for ( int i = 0; i < nReads; i++ ) {
|
||||
data.getActiveRegion().add(data.makeRead(0, data.getRefLength()));
|
||||
}
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion());
|
||||
|
||||
final List<Integer> expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads);
|
||||
final List<VariantContext> contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls);
|
||||
checkReferenceModelResult(data, contexts, expectedDPs, calls);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRefConfidencePartialReads() {
|
||||
final String ref = "ACGTAACCGGTT";
|
||||
for ( int readLen = 3; readLen < ref.length(); readLen++ ) {
|
||||
for ( int start = 0; start < ref.length() - readLen; start++ ) {
|
||||
final RefConfData data = new RefConfData(ref, 0);
|
||||
final List<Haplotype> haplotypes = Arrays.asList(data.getRefHap());
|
||||
final List<VariantContext> calls = Collections.emptyList();
|
||||
|
||||
data.getActiveRegion().add(data.makeRead(start, readLen));
|
||||
final Map<String, PerReadAlleleLikelihoodMap> likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion());
|
||||
|
||||
final List<Integer> expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0));
|
||||
for ( int i = start; i < readLen + start; i++ ) expectedDPs.set(i, 1);
|
||||
final List<VariantContext> contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls);
|
||||
checkReferenceModelResult(data, contexts, expectedDPs, calls);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRefConfidenceWithCalls() {
|
||||
final RefConfData xxxdata = new RefConfData("ACGTAACCGGTT", 0);
|
||||
final int start = xxxdata.getStart();
|
||||
final int stop = xxxdata.getEnd();
|
||||
|
||||
for ( int nReads = 0; nReads < 2; nReads++ ) {
|
||||
|
||||
final VariantContext vcStart = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start, Arrays.asList("A", "C"));
|
||||
final VariantContext vcEnd = GATKVariantContextUtils.makeFromAlleles("test", "chr1", stop, Arrays.asList("A", "C"));
|
||||
final VariantContext vcMiddle = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 2, Arrays.asList("A", "C"));
|
||||
final VariantContext vcDel = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 4, Arrays.asList("ACG", "A"));
|
||||
final VariantContext vcIns = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 8, Arrays.asList("A", "ACG"));
|
||||
|
||||
final List<VariantContext> allCalls = Arrays.asList(vcStart, vcEnd, vcMiddle, vcDel, vcIns);
|
||||
|
||||
for ( int n = 1; n <= allCalls.size(); n++ ) {
|
||||
for ( final List<VariantContext> calls : Utils.makePermutations(allCalls, n, false) ) {
|
||||
// logger.warn("Executing " + n + " " + calls.size());
|
||||
final RefConfData data = new RefConfData("ACGTAACCGGTT", 0);
|
||||
final List<Haplotype> haplotypes = Arrays.asList(data.getRefHap());
|
||||
for ( int i = 0; i < nReads; i++ ) {
|
||||
data.getActiveRegion().add(data.makeRead(0, data.getRefLength()));
|
||||
}
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion());
|
||||
|
||||
final List<Integer> expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads);
|
||||
final List<VariantContext> contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls);
|
||||
checkReferenceModelResult(data, contexts, expectedDPs, calls);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void checkReferenceModelResult(final RefConfData data, final List<VariantContext> contexts, final List<Integer> expectedDPs, final List<VariantContext> calls) {
|
||||
Assert.assertNotNull(contexts);
|
||||
|
||||
final GenomeLoc loc = data.getActiveRegion().getExtendedLoc();
|
||||
final List<Boolean> seenBP = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), false));
|
||||
|
||||
for ( int i = 0; i < loc.size(); i++ ) {
|
||||
final GenomeLoc curPos = parser.createGenomeLoc(loc.getContig(), loc.getStart() + i);
|
||||
final VariantContext call = model.getOverlappingVariantContext(curPos, calls);
|
||||
final VariantContext refModel = model.getOverlappingVariantContext(curPos, contexts);
|
||||
|
||||
if ( ! data.getActiveRegion().getLocation().containsP(curPos) ) {
|
||||
// part of the extended interval, but not the full interval
|
||||
Assert.assertNull(refModel);
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( call != null ) {
|
||||
Assert.assertEquals(refModel, call, "Should have found call " + call + " but found " + refModel + " instead");
|
||||
} else {
|
||||
final int expectedDP = expectedDPs.get(curPos.getStart() - data.getActiveRegion().getLocation().getStart());
|
||||
Assert.assertEquals(refModel.getStart(), loc.getStart() + i);
|
||||
Assert.assertEquals(refModel.getEnd(), loc.getStart() + i);
|
||||
Assert.assertFalse(refModel.hasLog10PError());
|
||||
Assert.assertEquals(refModel.getAlternateAlleles().size(), 1);
|
||||
Assert.assertEquals(refModel.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE);
|
||||
Assert.assertTrue(refModel.hasGenotype(sample));
|
||||
|
||||
final Genotype g = refModel.getGenotype(sample);
|
||||
Assert.assertTrue(g.hasAD());
|
||||
Assert.assertTrue(g.hasDP());
|
||||
Assert.assertEquals(g.getDP(), expectedDP);
|
||||
Assert.assertTrue(g.hasGQ());
|
||||
Assert.assertTrue(g.hasPL());
|
||||
Assert.assertTrue(g.hasExtendedAttribute(ReferenceConfidenceModel.INDEL_INFORMATIVE_DEPTH));
|
||||
}
|
||||
|
||||
final VariantContext vc = call == null ? refModel : call;
|
||||
if ( curPos.getStart() == vc.getStart() ) {
|
||||
for ( int pos = vc.getStart(); pos <= vc.getEnd(); pos++ ) {
|
||||
final int j = pos - data.getActiveRegion().getLocation().getStart();
|
||||
Assert.assertFalse(seenBP.get(j));
|
||||
seenBP.set(j, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for ( int i = 0; i < seenBP.size(); i++ ) {
|
||||
Assert.assertEquals((boolean)seenBP.get(i), true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -312,4 +312,19 @@ public class BaseGraphUnitTest extends BaseTest {
|
|||
|
||||
Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testGetBases() {
|
||||
|
||||
final int kmerSize = 4;
|
||||
final String testString = "AATGGGGGCAATACTA";
|
||||
|
||||
final List<DeBruijnVertex> vertexes = new ArrayList<>();
|
||||
for ( int i = 0; i <= testString.length() - kmerSize; i++ ) {
|
||||
vertexes.add(new DeBruijnVertex(testString.substring(i, i + kmerSize)));
|
||||
}
|
||||
|
||||
final String result = new String(new DeBruijnGraph().getBasesForPath(vertexes));
|
||||
Assert.assertEquals(result, testString.substring(kmerSize - 1));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -166,4 +166,20 @@ public class CommonSuffixMergerUnitTest extends BaseTest {
|
|||
splitter.merge(data.graph, data.v);
|
||||
assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoesntMergeSourceNodes() {
|
||||
final SeqGraph g = new SeqGraph();
|
||||
final SeqVertex v1 = new SeqVertex("A");
|
||||
final SeqVertex v2 = new SeqVertex("A");
|
||||
final SeqVertex v3 = new SeqVertex("A");
|
||||
final SeqVertex top = new SeqVertex("T");
|
||||
final SeqVertex b = new SeqVertex("C");
|
||||
g.addVertices(top, v1, v2, v3, top, b);
|
||||
g.addEdges(top, v1, b);
|
||||
g.addEdges(v2, b); // v2 doesn't have previous node, cannot be merged
|
||||
g.addEdges(top, v3, b);
|
||||
final SharedSequenceMerger merger = new SharedSequenceMerger();
|
||||
Assert.assertFalse(merger.merge(g, b), "Shouldn't be able to merge shared vertices, when one is a source");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -54,19 +54,29 @@ import org.testng.Assert;
|
|||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
public class MultiSampleEdgeUnitTest extends BaseTest {
|
||||
|
||||
private class MultiplicityTestProvider {
|
||||
final List<Integer> countsPerSample;
|
||||
final int numSamplesPruning;
|
||||
public MultiplicityTestProvider(final List<Integer> countsPerSample, final int numSamplesPruning) {
|
||||
this.countsPerSample = countsPerSample;
|
||||
this.numSamplesPruning = numSamplesPruning;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "MultiplicityData")
|
||||
public Object[][] makeMultiplicityData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
final List<Integer> countsPerSample = Arrays.asList(0, 1, 2, 3, 4, 5);
|
||||
for ( final int nSamples : Arrays.asList(1, 2, 3, 4, 5)) {
|
||||
for ( final List<Integer> perm : Utils.makePermutations(countsPerSample, nSamples, false) ) {
|
||||
tests.add(new Object[]{perm});
|
||||
for ( final int numSamplesPruning : Arrays.asList(1, 2, 3) ) {
|
||||
for ( final int nSamples : Arrays.asList(1, 2, 3, 4, 5)) {
|
||||
for ( final List<Integer> perm : Utils.makePermutations(countsPerSample, nSamples, false) ) {
|
||||
tests.add(new Object[]{new MultiplicityTestProvider(perm, numSamplesPruning)});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -77,15 +87,15 @@ public class MultiSampleEdgeUnitTest extends BaseTest {
|
|||
* Example testng test using MyDataProvider
|
||||
*/
|
||||
@Test(dataProvider = "MultiplicityData")
|
||||
public void testMultiplicity(final List<Integer> countsPerSample) {
|
||||
final MultiSampleEdge edge = new MultiSampleEdge(false, 0);
|
||||
public void testMultiplicity(final MultiplicityTestProvider cfg) {
|
||||
final MultiSampleEdge edge = new MultiSampleEdge(false, 0, cfg.numSamplesPruning);
|
||||
Assert.assertEquals(edge.getMultiplicity(), 0);
|
||||
Assert.assertEquals(edge.getPruningMultiplicity(), 0);
|
||||
|
||||
int total = 0;
|
||||
for ( int i = 0; i < countsPerSample.size(); i++ ) {
|
||||
for ( int i = 0; i < cfg.countsPerSample.size(); i++ ) {
|
||||
int countForSample = 0;
|
||||
for ( int count = 0; count < countsPerSample.get(i); count++ ) {
|
||||
for ( int count = 0; count < cfg.countsPerSample.get(i); count++ ) {
|
||||
edge.incMultiplicity(1);
|
||||
total++;
|
||||
countForSample++;
|
||||
|
|
@ -95,9 +105,11 @@ public class MultiSampleEdgeUnitTest extends BaseTest {
|
|||
edge.flushSingleSampleMultiplicity();
|
||||
}
|
||||
|
||||
final int max = MathUtils.arrayMax(ArrayUtils.toPrimitive(countsPerSample.toArray(new Integer[countsPerSample.size()])));
|
||||
ArrayList<Integer> counts = new ArrayList<>(cfg.countsPerSample);
|
||||
counts.add(0);
|
||||
Collections.sort(counts);
|
||||
final int prune = counts.get(Math.max(counts.size() - cfg.numSamplesPruning, 0));
|
||||
Assert.assertEquals(edge.getMultiplicity(), total);
|
||||
Assert.assertEquals(edge.getPruningMultiplicity(), max);
|
||||
Assert.assertEquals(edge.getMaxSingleSampleMultiplicity(), max);
|
||||
Assert.assertEquals(edge.getPruningMultiplicity(), prune);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -83,9 +83,10 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
public SeqGraph assemble() {
|
||||
assembler.removePathsNotConnectedToRef = false; // need to pass some of the tests
|
||||
assembler.removePathsNotConnectedToRef = false; // needed to pass some of the tests
|
||||
assembler.setRecoverDanglingTails(false); // needed to pass some of the tests
|
||||
assembler.setDebugGraphTransformations(true);
|
||||
final SeqGraph graph = assembler.assemble(reads, refHaplotype).get(0);
|
||||
final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.<Haplotype>emptyList()).get(0).getGraph();
|
||||
if ( DEBUG ) graph.printGraph(new File("test.dot"), 0);
|
||||
return graph;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,8 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
|
|||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -145,7 +149,136 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO -- update to use determineKmerSizeAndNonUniques directly
|
||||
@Test(enabled = !DEBUG)
|
||||
public void testCyclesInGraph() {
|
||||
|
||||
// b37 20:12655200-12655850
|
||||
final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
|
||||
|
||||
// SNP at 20:12655528 creates a cycle for small kmers
|
||||
final String alt = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
|
||||
|
||||
final List<GATKSAMRecord> reads = new ArrayList<>();
|
||||
for ( int index = 0; index < alt.length() - 100; index += 20 )
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M"));
|
||||
|
||||
// test that there are cycles detected for small kmer
|
||||
final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25);
|
||||
rtgraph25.addSequence("ref", ref.getBytes(), null, true);
|
||||
for ( final GATKSAMRecord read : reads )
|
||||
rtgraph25.addRead(read);
|
||||
rtgraph25.buildGraphIfNecessary();
|
||||
Assert.assertTrue(rtgraph25.hasCycles());
|
||||
|
||||
// test that there are no cycles detected for large kmer
|
||||
final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75);
|
||||
rtgraph75.addSequence("ref", ref.getBytes(), null, true);
|
||||
for ( final GATKSAMRecord read : reads )
|
||||
rtgraph75.addRead(read);
|
||||
rtgraph75.buildGraphIfNecessary();
|
||||
Assert.assertFalse(rtgraph75.hasCycles());
|
||||
}
|
||||
|
||||
@Test(enabled = !DEBUG)
|
||||
public void testNsInReadsAreNotUsedForGraph() {
|
||||
|
||||
final int length = 100;
|
||||
final byte[] ref = Utils.dupBytes((byte)'A', length);
|
||||
|
||||
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(25);
|
||||
rtgraph.addSequence("ref", ref, null, true);
|
||||
|
||||
// add reads with Ns at any position
|
||||
for ( int i = 0; i < length; i++ ) {
|
||||
final byte[] bases = ref.clone();
|
||||
bases[i] = 'N';
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte) 30, length), length + "M");
|
||||
rtgraph.addRead(read);
|
||||
}
|
||||
rtgraph.buildGraphIfNecessary();
|
||||
|
||||
final SeqGraph graph = rtgraph.convertToSequenceGraph();
|
||||
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<>(false);
|
||||
Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1);
|
||||
}
|
||||
|
||||
@DataProvider(name = "DanglingTails")
|
||||
public Object[][] makeDanglingTailsData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
// add 1M to the expected CIGAR because it includes the previous (common) base too
|
||||
tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3}); // incomplete haplotype
|
||||
tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10}); // insertion
|
||||
tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion
|
||||
tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp
|
||||
tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // several snps
|
||||
tests.add(new Object[]{"AAAAA", "C", "1M4D1M", true, -1}); // funky SW alignment
|
||||
tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", true, 1}); // very little data
|
||||
tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch
|
||||
tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "DanglingTails", enabled = !DEBUG)
|
||||
// Builds a ReadThreadingGraph (kmer size 15) from a reference sequence and a single
// alternate read that share a 40-base prefix, then verifies, in order:
//   1. exactly one non-reference sink vertex exists,
//   2. generateCigarAgainstReferencePath() yields the expected cigar string for it,
//   3. cigarIsOkayToMerge() agrees with cigarIsGood,
//   4. when cigarIsGood, mergeDanglingTail() behaves as expected.
//
// refEnd                     - suffix appended to the common prefix to form the reference
// altEnd                     - suffix appended to the common prefix to form the alternate read
// cigar                      - expected string form of the cigar computed for the dangling tail
// cigarIsGood                - expected return value of cigarIsOkayToMerge() for that cigar
// mergePointDistanceFromSink - number of incoming edges to walk back from the alt sink before
//                              a vertex with out-degree > 1 is expected; -1 disables that walk
//                              and also lifts the requirement that mergeDanglingTail() returns 1
public void testDanglingTails(final String refEnd,
|
||||
final String altEnd,
|
||||
final String cigar,
|
||||
final boolean cigarIsGood,
|
||||
final int mergePointDistanceFromSink) {
|
||||
|
||||
final int kmerSize = 15;
|
||||
|
||||
// construct the haplotypes
|
||||
final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT";
|
||||
final String ref = commonPrefix + refEnd;
|
||||
final String alt = commonPrefix + altEnd;
|
||||
|
||||
// create the graph and populate it
|
||||
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize);
|
||||
// NOTE(review): the trailing 'true' presumably marks this sequence as the reference path -- confirm against addSequence()
rtgraph.addSequence("ref", ref.getBytes(), null, true);
|
||||
// the alt read is given a full-length match cigar ("<len>M") and a quality array filled with 30s
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M");
|
||||
rtgraph.addRead(read);
|
||||
rtgraph.buildGraphIfNecessary();
|
||||
|
||||
// confirm that we have just a single dangling tail
|
||||
MultiDeBruijnVertex altSink = null;
|
||||
for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) {
|
||||
if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) {
|
||||
Assert.assertTrue(altSink == null, "We found more than one non-reference sink");
|
||||
altSink = v;
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertTrue(altSink != null, "We did not find a non-reference sink");
|
||||
|
||||
// confirm that the SW alignment agrees with our expectations
|
||||
final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink);
|
||||
Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString());
|
||||
|
||||
// confirm that the goodness of the cigar agrees with our expectations
|
||||
Assert.assertEquals(rtgraph.cigarIsOkayToMerge(result.cigar), cigarIsGood);
|
||||
|
||||
// confirm that the tail merging works as expected
|
||||
if ( cigarIsGood ) {
|
||||
final int mergeResult = rtgraph.mergeDanglingTail(result);
|
||||
// a return of 1 is required unless the test case passes -1 as the merge distance
// NOTE(review): -1 looks like a sentinel for "no merge expected" -- confirm with the data provider
Assert.assertTrue(mergeResult == 1 || mergePointDistanceFromSink == -1);
|
||||
|
||||
// confirm that we created the appropriate edge
|
||||
if ( mergePointDistanceFromSink >= 0 ) {
|
||||
MultiDeBruijnVertex v = altSink;
|
||||
// walk backwards from the alt sink along its single incoming edge, one vertex per step
for ( int i = 0; i < mergePointDistanceFromSink; i++ ) {
|
||||
if ( rtgraph.inDegreeOf(v) != 1 )
|
||||
Assert.fail("Encountered vertex with multiple sources");
|
||||
v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v));
|
||||
}
|
||||
// the vertex reached after the walk must branch, i.e. have more than one outgoing edge
Assert.assertTrue(rtgraph.outDegreeOf(v) > 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// TODO -- update to use determineKmerSizeAndNonUniques directly
|
||||
// @DataProvider(name = "KmerSizeData")
|
||||
// public Object[][] makeKmerSizeDataProvider() {
|
||||
// List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
|
|
|||
|
|
@ -66,9 +66,10 @@ public class ConstrainedMateFixingManagerUnitTest extends BaseTest {
|
|||
|
||||
/**
 * One-time fixture setup: creates the artificial SAM header and builds the
 * GenomeLocParser from that header's sequence dictionary.
 */
@BeforeClass
|
||||
public void beforeClass() {
|
||||
// NOTE(review): header is assigned twice in a row, so only the second value (10000) is in
// effect when genomeLocParser is built. This looks like the old and new sides of a changed
// diff line rendered together -- confirm which line belongs in the file.
header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 100);
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000);
|
||||
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSecondaryAlignmentsDoNotInterfere() {
|
||||
final List<GATKSAMRecord> properReads = ArtificialSAMUtils.createPair(header, "foo", 1, 10, 30, true, false);
|
||||
|
|
@ -105,4 +106,29 @@ public class ConstrainedMateFixingManagerUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Adds a non-primary alignment of the second mate to the manager, then the first mate,
 * then EMIT_FREQUENCY filler reads, and asserts that the read name "foo" is still
 * present in the manager's forMateMatching map afterwards.
 */
@Test
|
||||
public void testSecondaryAlignmentsDoNotCauseAccidentalRemovalOfMate() {
|
||||
final List<GATKSAMRecord> properReads = ArtificialSAMUtils.createPair(header, "foo", 1, 530, 1594, true, false);
|
||||
final GATKSAMRecord read1 = properReads.get(0);
|
||||
read1.setFlags(99); // first in proper pair, mate negative strand
|
||||
|
||||
final GATKSAMRecord read2Primary = properReads.get(1);
|
||||
read2Primary.setFlags(147); // second in proper pair, on reverse strand
|
||||
read2Primary.setAlignmentStart(1596); // move the read
|
||||
|
||||
// the non-primary record starts as a copy of the primary second mate and is then re-flagged and repositioned
final GATKSAMRecord read2NonPrimary = new GATKSAMRecord(read2Primary);
|
||||
read2NonPrimary.setReadName("foo");
|
||||
read2NonPrimary.setFlags(393); // second in pair, mate unmapped, not primary alignment
|
||||
read2NonPrimary.setAlignmentStart(451);
|
||||
read2NonPrimary.setMateAlignmentStart(451);
|
||||
|
||||
final ConstrainedMateFixingManager manager = new ConstrainedMateFixingManager(null, genomeLocParser, 10000, 200, 10000);
|
||||
// the non-primary alignment is added before the first mate; read2Primary itself is never added
manager.addRead(read2NonPrimary, false, false);
|
||||
manager.addRead(read1, false, false);
|
||||
|
||||
// add EMIT_FREQUENCY more reads with distinct names ("foo0", "foo1", ...)
// NOTE(review): presumably this is enough to trigger the manager's periodic emit pass -- confirm
for ( int i = 0; i < ConstrainedMateFixingManager.EMIT_FREQUENCY; i++ )
|
||||
manager.addRead(ArtificialSAMUtils.createArtificialRead(header, "foo" + i, 0, 1500, 10), false, false);
|
||||
|
||||
Assert.assertTrue(manager.forMateMatching.containsKey("foo"));
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue