Merge pull request #264 from broadinstitute/md_dbsnp

Make HaplotypeCaller annotate dbSNP rsIDs
This commit is contained in:
Mark DePristo 2013-06-10 13:38:20 -07:00
commit c7836ec746
25 changed files with 773 additions and 316 deletions

View File

@ -49,6 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
@ -146,6 +147,7 @@ public class GenotypingEngine {
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final RefMetaDataTracker tracker,
final List<VariantContext> activeAllelesToGenotype ) {
// sanity check input arguments
if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine);
@ -204,7 +206,7 @@ public class GenotypingEngine {
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) );
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );
VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call);
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call);
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);

View File

@ -441,7 +441,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
private final static int MIN_READ_LENGTH = 10;
private List<String> samplesList = new ArrayList<String>();
private final List<VariantContext> allelesToGenotype = new ArrayList<VariantContext>();
private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
private final static Allele FAKE_ALT_ALLELE = Allele.create("<FAKE_ALT>", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
@ -596,7 +595,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
final VariantContext vcFromAllelesRod = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), false, logger, UG_engine.getUAC().alleles);
if( vcFromAllelesRod != null ) {
allelesToGenotype.add(vcFromAllelesRod); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object
return new ActivityProfileState(ref.getLocus(), 1.0);
}
}
@ -664,12 +662,11 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
final List<VariantContext> activeAllelesToGenotype = new ArrayList<>();
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
for( final VariantContext vc : allelesToGenotype ) {
if( originalActiveRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) {
for ( final VariantContext vc : metaDataTracker.getValues(UG_engine.getUAC().alleles) ) {
if ( vc.isNotFiltered() ) {
activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode
}
}
allelesToGenotype.removeAll( activeAllelesToGenotype );
// No alleles found in this region so nothing to do!
if ( activeAllelesToGenotype.isEmpty() ) { return NO_CALLS; }
} else {
@ -704,6 +701,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
assemblyResult.paddedReferenceLoc,
assemblyResult.regionForGenotyping.getLocation(),
getToolkit().getGenomeLocParser(),
metaDataTracker,
activeAllelesToGenotype );
// TODO -- must disable if we are doing NCT, or set the output type of ! presorted
@ -890,8 +888,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Override
public Integer reduce(List<VariantContext> callsInRegion, Integer numCalledRegions) {
for( final VariantContext call : callsInRegion ) {
// TODO -- uncomment this line once ART-based walkers have a proper RefMetaDataTracker.
// annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call);
vcfWriter.add( call );
}
return (callsInRegion.isEmpty() ? 0 : 1) + numCalledRegions;

View File

@ -0,0 +1,164 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
 * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
 * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
/**
 * Unit tests for VariantOverlapAnnotator: verifies dbSNP rsID annotation (writing the
 * matching rsID into the call's ID field) and generic overlap annotation (setting a
 * boolean INFO attribute named after the overlap binding).
 */
public class VariantOverlapAnnotatorUnitTest extends BaseTest {
// Parser for building GenomeLocs against the b37 reference; initialized once in setup().
private GenomeLocParser genomeLocParser;
// Reference sequence backing the parser.
private IndexedFastaSequenceFile seq;
@BeforeClass
public void setup() throws FileNotFoundException {
// sequence
seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
genomeLocParser = new GenomeLocParser(seq);
}
/**
 * Build a VariantContext at a fixed site (contig "20", position 10) with the given
 * source label, ID field, and allele strings (first allele is the reference).
 */
private VariantContext makeVC(final String source, final String id, final List<String> alleles) {
final VariantContext vc = GATKVariantContextUtils.makeFromAlleles(source, "20", 10, alleles);
return new VariantContextBuilder(vc).id(id).make();
}
/**
 * Build a VariantOverlapAnnotator with an optional dbSNP RodBinding (null when dbSNP
 * is null) plus one overlap binding per name in overlaps, each keyed to itself.
 * LinkedHashMap preserves the caller's binding order.
 */
private VariantOverlapAnnotator makeAnnotator(final String dbSNP, final String ... overlaps) {
final RodBinding<VariantContext> dbSNPBinding = dbSNP == null ? null : new RodBinding<>(VariantContext.class, dbSNP);
final Map<RodBinding<VariantContext>, String> overlapBinding = new LinkedHashMap<>();
for ( final String overlap : overlaps ) overlapBinding.put(new RodBinding<>(VariantContext.class, overlap), overlap);
return new VariantOverlapAnnotator(dbSNPBinding, overlapBinding, genomeLocParser);
}
// The annotator must report overlap names (the map values), not the RodBinding names
// (which here carry a "Binding" suffix), and must preserve insertion order.
@Test
public void testCreateWithSpecialNames() {
final List<String> names = Arrays.asList("X", "Y", "Z");
final Map<RodBinding<VariantContext>, String> overlapBinding = new LinkedHashMap<>();
for ( final String overlap : names ) overlapBinding.put(new RodBinding<>(VariantContext.class, overlap + "Binding"), overlap);
final VariantOverlapAnnotator annotator = new VariantOverlapAnnotator(null, overlapBinding, genomeLocParser);
Assert.assertEquals(annotator.getOverlapNames(), names);
}
/**
 * Each row: { call VC to annotate, dbSNP records at the site, expected resulting ID
 * string, whether an overlap is expected }. Covers: no-match, match with empty/existing
 * IDs, multi-allelic calls and records, multiple candidate records, and filtered
 * records (which must be ignored).
 */
@DataProvider(name = "AnnotateRsIDData")
public Object[][] makeAnnotateRsIDData() {
List<Object[]> tests = new ArrayList<>();
// this functionality can be adapted to provide input data for whatever you might want in your data
final VariantContext callNoIDAC = makeVC("call", VCFConstants.EMPTY_ID_FIELD, Arrays.asList("A", "C"));
final VariantContext callNoIDAT = makeVC("call", VCFConstants.EMPTY_ID_FIELD, Arrays.asList("A", "T"));
final VariantContext callIDAC = makeVC("call", "foo", Arrays.asList("A", "C"));
final VariantContext callExistingIDAC = makeVC("call", "rsID1", Arrays.asList("A", "C"));
final VariantContext dbSNP_AC = makeVC("DBSNP", "rsID1", Arrays.asList("A", "C"));
final VariantContext dbSNP_AT = makeVC("DBSNP", "rsID2", Arrays.asList("A", "T"));
final VariantContext dbSNP_AG = makeVC("DBSNP", "rsID3", Arrays.asList("A", "G"));
// Multi-allelic dbSNP records carry semicolon-joined IDs, mirroring VCF convention.
final VariantContext dbSNP_AC_AT = makeVC("DBSNP", "rsID1;rsID2", Arrays.asList("A", "C", "T"));
final VariantContext dbSNP_AC_AG = makeVC("DBSNP", "rsID1;rsID3", Arrays.asList("A", "C", "G"));
// Matching alleles: take the dbSNP ID. Non-matching: ID stays empty.
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC), dbSNP_AC.getID(), true});
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AT), VCFConstants.EMPTY_ID_FIELD, false});
// Pre-existing non-rs ID: the matched rsID is appended with ';'.
tests.add(new Object[]{callIDAC, Arrays.asList(dbSNP_AC), "foo" + ";" + dbSNP_AC.getID(), true});
tests.add(new Object[]{callIDAC, Arrays.asList(dbSNP_AT), "foo", false});
// Call already carries the matching rsID: no duplication, ID unchanged either way.
tests.add(new Object[]{callExistingIDAC, Arrays.asList(dbSNP_AC), "rsID1", true});
tests.add(new Object[]{callExistingIDAC, Arrays.asList(dbSNP_AT), "rsID1", false});
// Multi-allelic call: overlap counts if ANY alt allele matches the record.
final VariantContext callNoIDACT = makeVC("call", VCFConstants.EMPTY_ID_FIELD, Arrays.asList("A", "C", "T"));
tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AC), dbSNP_AC.getID(), true});
tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AT), dbSNP_AT.getID(), true});
tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AG), VCFConstants.EMPTY_ID_FIELD, false});
tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AC_AT), dbSNP_AC_AT.getID(), true});
tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AC_AG), dbSNP_AC_AG.getID(), true});
// multiple options
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC, dbSNP_AT), "rsID1", true});
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AT, dbSNP_AC), "rsID1", true});
// A multi-allelic record's full (joined) ID is used even when only one alt matches.
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC_AT), "rsID1;rsID2", true});
tests.add(new Object[]{callNoIDAT, Arrays.asList(dbSNP_AC_AT), "rsID1;rsID2", true});
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC_AG), "rsID1;rsID3", true});
tests.add(new Object[]{callNoIDAT, Arrays.asList(dbSNP_AC_AG), VCFConstants.EMPTY_ID_FIELD, false});
// Filtered dbSNP records must be ignored entirely.
final VariantContext dbSNP_AC_FAIL = new VariantContextBuilder(makeVC("DBSNP", "rsID1", Arrays.asList("A", "C"))).filter("FAIL").make();
tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC_FAIL), VCFConstants.EMPTY_ID_FIELD, false});
return tests.toArray(new Object[][]{});
}
// annotateRsID must produce exactly the expected ID for every provider row.
// NOTE(review): the binding name "dbnsp" looks like a typo for "dbSNP"; it appears
// harmless here because the records are passed to annotateRsID directly rather than
// looked up by binding name — confirm against VariantOverlapAnnotator.
@Test(dataProvider = "AnnotateRsIDData")
public void testAnnotateRsID(final VariantContext toAnnotate, final List<VariantContext> dbSNPRecords, final String expectedID, final boolean expectOverlap) throws Exception {
final VariantOverlapAnnotator annotator = makeAnnotator("dbnsp");
final VariantContext annotated = annotator.annotateRsID(dbSNPRecords, toAnnotate);
Assert.assertNotNull(annotated);
Assert.assertEquals(annotated.getID(), expectedID);
}
// Overlap annotation reuses the same provider rows: the ID field must be untouched,
// and a boolean INFO attribute named after the binding is set iff an overlap exists.
@Test(dataProvider = "AnnotateRsIDData")
public void testAnnotateOverlaps(final VariantContext toAnnotate, final List<VariantContext> records, final String expectedID, final boolean expectOverlap) throws Exception {
final String name = "binding";
final VariantOverlapAnnotator annotator = makeAnnotator(null, name);
final VariantContext annotated = annotator.annotateOverlap(records, name, toAnnotate);
Assert.assertNotNull(annotated);
Assert.assertEquals(annotated.getID(), toAnnotate.getID(), "Shouldn't modify annotation");
Assert.assertEquals(annotated.hasAttribute(name), expectOverlap);
if ( expectOverlap ) {
Assert.assertEquals(annotated.getAttribute(name), true);
}
}
}

View File

@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("19f77f557150905ef3fa4713f611a1b9"));
Arrays.asList("14ad6eeed46e9b6f4757370267b1a1cc"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("bb3dbad9666ebf38d338f0c9c211a42e"));
Arrays.asList("cd184a2a5a1932dcf3e8f0424652176b"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -111,7 +111,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("8052390ca2b6a57c3ddf379a51225d64"));
Arrays.asList("e8d98996eb81ece8cfb52437920ae2e0"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -121,7 +121,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("b6b9dba97fbabaeeb458a41051983e7b"));
Arrays.asList("23a78c16f64bffe1dea3a5587fcabdad"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("38730c7030271f5d0ca0b59365d57814"));
Arrays.asList("294183823d678d3668f4fa98b4de6e06"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("264325878b988acc11d8e5d9d2ba0b7f"));
Arrays.asList("e90256acfc360fc4bf377094732a673a"));
executeTest("test minIndelFraction 0.0", spec);
}

View File

@ -80,7 +80,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testWithAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("698e54aeae3130779d246b9480a4052c"));
Arrays.asList("60115af273fde49c76d4df6c9c0f6501"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}

View File

@ -47,15 +47,12 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broad.tribble.TribbleIndexedFeatureReader;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.vcf.VCFCodec;
import org.testng.annotations.Test;
import java.io.File;
@ -69,6 +66,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam";
final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam";
final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam";
final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam";
final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
@ -199,4 +197,27 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
Arrays.asList("86bdd07a3ac4f6ce239c30efea8bf5ba"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// test dbSNP annotation
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("7b23a288a31cafca3946f14f2381e7cb"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@Test
public void HCTestDBSNPAnnotationWEx() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("9587029b702bb59bd4dfec69eac4c210"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
}

View File

@ -72,8 +72,6 @@ class IntervalOverlappingRODsFromStream {
/**
* Get the list of RODs overlapping loc from this stream of RODs.
*
* Sequential calls to this function must obey the rule that loc2.getStart >= loc1.getStart
*
* @param loc the interval to query
* @return a non-null RODRecordList containing the overlapping RODs, which may be empty
*/
@ -84,7 +82,6 @@ class IntervalOverlappingRODsFromStream {
if ( lastQuery != null && loc.getStart() < lastQuery.getStart() )
throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery));
trimCurrentFeaturesToLoc(loc);
readOverlappingFutureFeatures(loc);
return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc);
}
@ -128,11 +125,14 @@ class IntervalOverlappingRODsFromStream {
/**
* Update function. Remove all elements of currentFeatures that end before loc
*
* Must be called by clients periodically when they know they will never ask for data before
* loc, so that the running cache of RODs doesn't grow out of control.
*
* @param loc the location to use
*/
@Requires("loc != null")
@Ensures("currentFeatures.size() <= old(currentFeatures.size())")
private void trimCurrentFeaturesToLoc(final GenomeLoc loc) {
public void trimCurrentFeaturesToLoc(final GenomeLoc loc) {
final ListIterator<GATKFeature> it = currentFeatures.listIterator();
while ( it.hasNext() ) {
final GATKFeature feature = it.next();

View File

@ -0,0 +1,184 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.providers;
import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.reads.ReadShard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* a ROD view that allows for requests for RODs that overlap intervals on the genome to produce a RefMetaDataTracker
*/
public class IntervalReferenceOrderedView implements ReferenceOrderedView {
/** a list of the RMDDataState (location->iterators) */
private final List<RMDDataState> states = new ArrayList<>(1);
/**
* Used to get genome locs for reads
*/
protected final GenomeLocParser genomeLocParser;
/**
* The total extent of all reads in this span. We create iterators from our RODs
* from the start of this span, to the end.
*/
private final GenomeLoc shardSpan;
/**
* Create a new IntervalReferenceOrderedView taking data from provider and capable of
* servicing ROD overlap requests within the genomic interval span
*
* @param provider a ShardDataProvider to give us data
* @param span a GenomeLoc span, or null indicating take the entire genome
*/
public IntervalReferenceOrderedView(final ShardDataProvider provider, final GenomeLoc span) {
if ( provider == null ) throw new IllegalArgumentException("provider cannot be null");
if ( provider.hasReferenceOrderedData() && span == null ) throw new IllegalArgumentException("span cannot be null when provider has reference ordered data");
this.genomeLocParser = provider.getGenomeLocParser();
this.shardSpan = span;
provider.register(this);
// conditional to optimize the case where we don't have any ROD data
if ( provider.hasReferenceOrderedData() && ! shardSpan.isUnmapped() ) {
for (final ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan)));
}
}
/**
* Testing constructor
*/
protected IntervalReferenceOrderedView(final GenomeLocParser genomeLocParser,
final GenomeLoc shardSpan,
final List<String> names,
final List<PeekableIterator<RODRecordList>> featureSources) {
this.genomeLocParser = genomeLocParser;
this.shardSpan = shardSpan;
for ( int i = 0; i < names.size(); i++ )
states.add(new RMDDataState(names.get(i), featureSources.get(i)));
}
public Collection<Class<? extends View>> getConflictingViews() {
List<Class<? extends View>> classes = new ArrayList<>();
classes.add(ManagingReferenceOrderedView.class);
return classes;
}
/**
* Get a RefMetaDataTracker containing bindings for all RODs overlapping the start position of loc
* @param loc a GenomeLoc of size == 1
* @return a non-null RefMetaDataTracker
*/
@Override
public RefMetaDataTracker getReferenceOrderedDataAtLocus(GenomeLoc loc) {
if ( loc == null ) throw new IllegalArgumentException("loc cannot be null");
if ( loc.size() != 1 ) throw new IllegalArgumentException("GenomeLoc must have size == 1 but got " + loc);
return getReferenceOrderedDataForInterval(loc);
}
/**
* Get a RefMetaDataTracker containing bindings for all RODs overlapping interval
*
* @param interval a non-null interval
* @return a non-null RefMetaDataTracker
*/
public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) {
if ( interval == null ) throw new IllegalArgumentException("Interval cannot be null");
if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers)
return RefMetaDataTracker.EMPTY_TRACKER;
else {
final List<RODRecordList> bindings = new ArrayList<>(states.size());
for ( final RMDDataState state : states )
bindings.add(state.stream.getOverlapping(interval));
return new RefMetaDataTracker(bindings);
}
}
/**
 * Trim down all of the ROD managers so that they only hold ROD bindings with
 * start >= startOfDataToKeep.getStart().
 *
 * Features entirely before this position will never be queried again, so each
 * lane's stream may discard them.
 *
 * @param startOfDataToKeep a non-null genome loc
 */
public void trimCurrentFeaturesToLoc(final GenomeLoc startOfDataToKeep) {
    if ( startOfDataToKeep == null ) throw new IllegalArgumentException("startOfDataToKeep cannot be null");

    for ( final RMDDataState laneState : states ) {
        laneState.stream.trimCurrentFeaturesToLoc(startOfDataToKeep);
    }
}
/**
 * Closes the current view, releasing every underlying ROD iterator.
 */
public void close() {
    for ( final RMDDataState laneState : states ) {
        laneState.close();
    }
    // Clear out the existing data so that post-close() accesses to this data will fail-fast.
    states.clear();
}
/**
 * Models the traversal state of a given ROD lane.
 *
 * Pairs a ROD data source with an interval-overlap stream built over its record
 * iterator; close() hands the iterator back to the data source.
 */
private static class RMDDataState {
// originating data source; null when built via the testing constructor
public final ReferenceOrderedDataSource dataSource;
// answers getOverlapping() interval queries for this lane
public final IntervalOverlappingRODsFromStream stream;
// kept only so close() can return it to dataSource
private final LocationAwareSeekableRODIterator iterator;
public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) {
this.dataSource = dataSource;
this.iterator = iterator;
this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator<>(iterator));
}
/**
 * For testing
 *
 * NOTE(review): the incoming iterator is already a PeekableIterator, so wrapping it in
 * a second PeekableIterator looks redundant — confirm whether the extra wrapper (and its
 * one-element buffer) is intentional.
 */
public RMDDataState(final String name, final PeekableIterator<RODRecordList> iterator) {
this.dataSource = null;
this.iterator = null;
this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator<>(iterator));
}
// returns the iterator to the data source; no-op for test-constructed states
public void close() {
if ( dataSource != null )
dataSource.close( iterator );
}
}
}

View File

@ -76,7 +76,8 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView {
* @param loc Locus at which to track.
* @return A tracker containing information about this locus.
*/
public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) {
@Override
public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) {
if ( states.isEmpty() )
return RefMetaDataTracker.EMPTY_TRACKER;
else {

View File

@ -42,52 +42,9 @@ import java.util.Collection;
import java.util.List;
/** a ROD view for reads. This provides the Read traversals a way of getting a RefMetaDataTracker */
public class ReadBasedReferenceOrderedView implements View {
// a list of the RMDDataState (location->iterators)
private final List<RMDDataState> states = new ArrayList<RMDDataState>(1);
private final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker();
/**
* Used to get genome locs for reads
*/
private final GenomeLocParser genomeLocParser;
/**
* The total extent of all reads in this span. We create iterators from our RODs
* from the start of this span, to the end.
*/
private final GenomeLoc shardSpan;
public class ReadBasedReferenceOrderedView extends IntervalReferenceOrderedView {
public ReadBasedReferenceOrderedView(final ShardDataProvider provider) {
this.genomeLocParser = provider.getGenomeLocParser();
// conditional to optimize the case where we don't have any ROD data
this.shardSpan = provider.getReferenceOrderedData() != null ? ((ReadShard)provider.getShard()).getReadsSpan() : null;
provider.register(this);
if ( provider.getReferenceOrderedData() != null && ! shardSpan.isUnmapped() ) {
for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan)));
}
}
/**
* Testing constructor
*/
protected ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser,
final GenomeLoc shardSpan,
final List<String> names,
final List<PeekableIterator<RODRecordList>> featureSources) {
this.genomeLocParser = genomeLocParser;
this.shardSpan = shardSpan;
for ( int i = 0; i < names.size(); i++ )
states.add(new RMDDataState(names.get(i), featureSources.get(i)));
}
public Collection<Class<? extends View>> getConflictingViews() {
List<Class<? extends View>> classes = new ArrayList<Class<? extends View>>();
classes.add(ManagingReferenceOrderedView.class);
return classes;
super(provider, provider.hasReferenceOrderedData() ? ((ReadShard)provider.getShard()).getReadsSpan() : null);
}
/**
@ -101,60 +58,11 @@ public class ReadBasedReferenceOrderedView implements View {
@Ensures("result != null")
public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) {
if ( rec.getReadUnmappedFlag() )
// empty RODs for unmapped reads
return new RefMetaDataTracker();
else
return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec));
}
@Requires({"interval != null", "shardSpan == null || shardSpan.isUnmapped() || shardSpan.containsP(interval)"})
@Ensures("result != null")
public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) {
if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers)
return EMPTY_TRACKER;
return RefMetaDataTracker.EMPTY_TRACKER;
else {
final List<RODRecordList> bindings = new ArrayList<RODRecordList>(states.size());
for ( final RMDDataState state : states )
bindings.add(state.stream.getOverlapping(interval));
return new RefMetaDataTracker(bindings);
}
}
/**
* Closes the current view.
*/
public void close() {
for (final RMDDataState state : states)
state.close();
// Clear out the existing data so that post-close() accesses to this data will fail-fast.
states.clear();
}
/** Models the traversal state of a given ROD lane. */
private static class RMDDataState {
public final ReferenceOrderedDataSource dataSource;
public final IntervalOverlappingRODsFromStream stream;
private final LocationAwareSeekableRODIterator iterator;
public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) {
this.dataSource = dataSource;
this.iterator = iterator;
this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator<RODRecordList>(iterator));
}
/**
* For testing
*/
public RMDDataState(final String name, final PeekableIterator<RODRecordList> iterator) {
this.dataSource = null;
this.iterator = null;
this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator<RODRecordList>(iterator));
}
public void close() {
if ( dataSource != null )
dataSource.close( iterator );
final GenomeLoc readSpan = genomeLocParser.createGenomeLoc(rec);
trimCurrentFeaturesToLoc(readSpan);
return getReferenceOrderedDataForInterval(readSpan);
}
}
}

View File

@ -25,10 +25,9 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
public interface ReferenceOrderedView extends View {
RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext refContext );
RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc );
}

View File

@ -98,7 +98,8 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
rodQueue = new RODMergingIterator(iterators);
}
public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) {
@Override
public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) {
// special case the interval again -- add it into the ROD
if ( interval != null ) { allTracksHere.add(interval); }
return new RefMetaDataTracker(allTracksHere);

View File

@ -37,7 +37,6 @@ import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
@ -114,12 +113,6 @@ public class LinearMicroScheduler extends MicroScheduler {
done = walker.isDone();
}
// Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine
if( traversalEngine instanceof TraverseActiveRegions) {
final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit());
accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator
}
Object result = accumulator.finishTraversal();
outputTracker.close();

View File

@ -29,14 +29,12 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionTraversalParameters;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
@ -92,12 +90,26 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
private TAROrderedReadCache myReads = null;
private GenomeLoc lastRegionProcessed = null;
private GenomeLoc spanOfLastReadSeen = null;
private ActivityProfile activityProfile = null;
int maxReadsInMemory = 0;
ActiveRegionWalker<M, T> walker;
final NanoScheduler<ActiveRegion, M, T> nanoScheduler;
final NanoScheduler<MapData, M, T> nanoScheduler;
/**
* Data to use in the ActiveRegionWalker.map function produced by the NanoScheduler input iterator
*/
private static class MapData {
public ActiveRegion activeRegion;
public RefMetaDataTracker tracker;
private MapData(ActiveRegion activeRegion, RefMetaDataTracker tracker) {
this.activeRegion = activeRegion;
this.tracker = tracker;
}
}
/**
* Create a single threaded active region traverser
@ -112,12 +124,12 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
*/
public TraverseActiveRegions(final int nThreads) {
nanoScheduler = new NanoScheduler<>(nThreads);
nanoScheduler.setProgressFunction(new NSProgressFunction<ActiveRegion>() {
nanoScheduler.setProgressFunction(new NSProgressFunction<MapData>() {
@Override
public void progress(ActiveRegion lastActiveRegion) {
public void progress(MapData lastActiveRegion) {
if ( lastActiveRegion != null )
// note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon
printProgress(lastActiveRegion.getLocation().getStopLocation());
printProgress(lastActiveRegion.activeRegion.getLocation().getStopLocation());
}
});
}
@ -149,13 +161,6 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser(), engine.getIntervals(), BandPassActivityProfile.MAX_FILTER_SIZE, bandPassSigma);
if ( walkerHasPresetRegions ) {
// we load all of the preset locations into the workQueue
for ( final GenomeLoc loc : this.walker.getPresetActiveRegions()) {
workQueue.add(new ActiveRegion(loc, null, true, engine.getGenomeLocParser(), getActiveRegionExtension()));
}
}
final int maxReadsAcrossSamples = annotation.maxReadsToHoldInMemoryPerSample() * SampleUtils.getSAMFileSamples(engine).size();
final int maxReadsToHoldInMemory = Math.min(maxReadsAcrossSamples, annotation.maxReadsToHoldTotal());
myReads = new TAROrderedReadCache(maxReadsToHoldInMemory);
@ -167,6 +172,24 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
//
// -------------------------------------------------------------------------------------
/**
* Load in the preset regions for contig into workQueue
*
* Should be called before starting to process work on contig
*
* Can only be called when walkerHasPresetRegions is true or an IllegalStateException will be thrown
*
* @param contig the contig we are about to process
*/
protected void loadPresetRegionsForContigToWorkQueue(final String contig) {
if ( ! walkerHasPresetRegions ) throw new IllegalStateException("only appropriate to call when walker has preset regions");
final GenomeLoc contigSpan = engine.getGenomeLocParser().createOverEntireContig(contig);
for ( final GenomeLoc loc : this.walker.getPresetActiveRegions().getOverlapping(contigSpan) ) {
workQueue.add(new ActiveRegion(loc, null, true, engine.getGenomeLocParser(), getActiveRegionExtension()));
}
}
protected int getActiveRegionExtension() {
return activeRegionExtension;
}
@ -198,16 +221,6 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc);
}
protected ReferenceOrderedView getReferenceOrderedView(final ActiveRegionWalker<M, T> walker,
final LocusShardDataProvider dataProvider,
final LocusView locusView) {
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
return new ManagingReferenceOrderedView( dataProvider );
else
return (RodLocusView)locusView;
}
// -------------------------------------------------------------------------------------
//
// Actual traverse function
@ -254,7 +267,7 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
logger.info(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
nanoScheduler.setDebug(false);
final Iterator<ActiveRegion> activeRegionIterator = new ActiveRegionIterator(dataProvider);
final Iterator<MapData> activeRegionIterator = new ActiveRegionIterator(dataProvider);
final TraverseActiveRegionMap myMap = new TraverseActiveRegionMap();
final TraverseActiveRegionReduce myReduce = new TraverseActiveRegionReduce();
final T result = nanoScheduler.execute(activeRegionIterator, myMap, sum, myReduce);
@ -262,29 +275,53 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
return result;
}
private class ActiveRegionIterator implements Iterator<ActiveRegion> {
private class ActiveRegionIterator implements Iterator<MapData> {
private final LocusShardDataProvider dataProvider;
private LinkedList<ActiveRegion> readyActiveRegions = new LinkedList<ActiveRegion>();
private LinkedList<MapData> readyActiveRegions = new LinkedList<>();
private boolean done = false;
private final LocusView locusView;
private final LocusReferenceView referenceView;
private final ReferenceOrderedView referenceOrderedDataView;
private final GenomeLoc locOfLastReadAtTraversalStart;
private final IntervalReferenceOrderedView referenceOrderedDataView;
private final GenomeLoc currentWindow;
private final boolean processRemainingActiveRegions;
public ActiveRegionIterator( final LocusShardDataProvider dataProvider ) {
this.dataProvider = dataProvider;
locusView = new AllLocusView(dataProvider);
referenceView = new LocusReferenceView( walker, dataProvider );
referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
// The data shard may carry a number of locations to process (due to being indexed together).
// This value is just the interval we are processing within the entire provider
currentWindow = dataProvider.getLocus();
final int currentWindowPos = dataProvider.getShard().getGenomeLocs().indexOf(currentWindow);
if ( currentWindowPos == -1 ) throw new IllegalStateException("Data provider " + dataProvider + " didn't have our current window in it " + currentWindow);
processRemainingActiveRegions = currentWindowPos == dataProvider.getShard().getGenomeLocs().size() - 1;
// the rodSpan covers all of the bases in the activity profile, including all of the bases
// through the current window interval. This is because we may issue a query to get data for an
// active region spanning before the current interval as far back as the start of the current profile,
// if we have pending work to do that finalizes in this interval.
final GenomeLoc rodSpan = activityProfile.getSpan() == null ? currentWindow : activityProfile.getSpan().endpointSpan(currentWindow);
if ( ! dataProvider.getShard().getLocation().containsP(rodSpan) ) throw new IllegalStateException("Rod span " + rodSpan + " isn't contained within the data shard " + dataProvider.getShard().getLocation() + ", meaning we wouldn't get all of the data we need");
referenceOrderedDataView = new IntervalReferenceOrderedView( dataProvider, rodSpan );
// We keep processing while the next reference location is within the interval
locOfLastReadAtTraversalStart = spanOfLastSeenRead();
// load in the workQueue the present regions that span the current contig, if it's different from the last one
if ( walkerHasPresetRegions && ( lastRegionProcessed == null || ! currentWindow.onSameContig(lastRegionProcessed)) ) {
loadPresetRegionsForContigToWorkQueue(currentWindow.getContig());
}
// remember the last region we processed for sanity checking later
lastRegionProcessed = currentWindow;
}
@Override public void remove() { throw new UnsupportedOperationException("Cannot remove from ActiveRegionIterator"); }
@Override
public ActiveRegion next() {
public MapData next() {
return readyActiveRegions.pop();
}
@Override
@ -326,7 +363,7 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
final boolean flushProfile = ! activityProfile.isEmpty()
&& ( activityProfile.getContigIndex() != location.getContigIndex()
|| location.getStart() != activityProfile.getStop() + 1);
final List<ActiveRegion> newActiveRegions = prepActiveRegionsForProcessing(walker, flushProfile, false);
final List<MapData> newActiveRegions = prepActiveRegionsForProcessing(walker, flushProfile, false, referenceOrderedDataView);
dataProvider.getShard().getReadMetrics().incrementNumIterations();
@ -335,7 +372,7 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
final ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation());
// Call the walkers isActive function for this locus and add them to the list to be integrated later
addIsActiveResult(walker, tracker, refContext, locus);
@ -346,29 +383,25 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
if ( ! newActiveRegions.isEmpty() ) {
readyActiveRegions.addAll(newActiveRegions);
if ( DEBUG )
for ( final ActiveRegion region : newActiveRegions )
logger.info("Adding region to queue for processing " + region);
for ( final MapData region : newActiveRegions )
logger.info("Adding region to queue for processing " + region.activeRegion);
return true;
}
}
return false;
if ( processRemainingActiveRegions ) {
// we've run out of stuff to process, and since shards now span entire contig boundaries
// we should finalized our regions. This allows us to continue to use our referenceOrderedDataView
// which would otherwise be shutdown. Only followed when the microschedule says that we're
// inside of the last window in the current shard
readyActiveRegions.addAll(prepActiveRegionsForProcessing(walker, true, true, referenceOrderedDataView));
}
return ! readyActiveRegions.isEmpty();
}
}
}
/**
* Special function called in LinearMicroScheduler to empty out the work queue.
* Ugly for now but will be cleaned up when we push this functionality more into the engine
*/
public T endTraversal(final Walker<M, T> walker, T sum) {
for ( final ActiveRegion region : prepActiveRegionsForProcessing((ActiveRegionWalker<M, T>)walker, true, true) ) {
final M x = ((ActiveRegionWalker<M, T>) walker).map(region, null);
sum = walker.reduce( x, sum );
}
return sum;
}
// -------------------------------------------------------------------------------------
//
// Functions to manage and interact with the live / dead zone
@ -594,7 +627,10 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
* add these blocks of work to the work queue
* band-pass filter the list of isActive probabilities and turn into active regions
*/
private List<ActiveRegion> prepActiveRegionsForProcessing(final ActiveRegionWalker<M, T> walker, final boolean flushActivityProfile, final boolean forceAllRegionsToBeActive) {
private List<MapData> prepActiveRegionsForProcessing(final ActiveRegionWalker<M, T> walker,
final boolean flushActivityProfile,
final boolean forceAllRegionsToBeActive,
final IntervalReferenceOrderedView referenceOrderedDataView) {
if ( ! walkerHasPresetRegions ) {
// We don't have preset regions, so we get our regions from the activity profile
final Collection<ActiveRegion> activeRegions = activityProfile.popReadyActiveRegions(getActiveRegionExtension(), getMinRegionSize(), getMaxRegionSize(), flushActivityProfile);
@ -603,13 +639,13 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
}
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
final LinkedList<ActiveRegion> readyRegions = new LinkedList<ActiveRegion>();
final LinkedList<MapData> readyRegions = new LinkedList<>();
while( workQueue.peek() != null ) {
final ActiveRegion activeRegion = workQueue.peek();
if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) {
writeActivityProfile(activeRegion.getSupportingStates());
writeActiveRegion(activeRegion);
readyRegions.add(prepActiveRegionForProcessing(workQueue.remove(), walker));
readyRegions.add(prepActiveRegionForProcessing(workQueue.remove(), walker, referenceOrderedDataView));
} else {
break;
}
@ -619,8 +655,10 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
}
private ActiveRegion prepActiveRegionForProcessing(final ActiveRegion activeRegion, final ActiveRegionWalker<M, T> walker) {
final List<GATKSAMRecord> stillLive = new LinkedList<GATKSAMRecord>();
private MapData prepActiveRegionForProcessing(final ActiveRegion activeRegion,
final ActiveRegionWalker<M, T> walker,
final IntervalReferenceOrderedView referenceOrderedDataView) {
final List<GATKSAMRecord> stillLive = new LinkedList<>();
for ( final GATKSAMRecord read : myReads.popCurrentReads() ) {
boolean killed = false;
final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
@ -653,14 +691,21 @@ public final class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Activ
logger.info(String.format("Processing region %20s span=%3d active?=%5b with %4d reads. Overall max reads carried is %s",
activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive(), activeRegion.size(), maxReadsInMemory));
return activeRegion;
// prepare the RefMetaDataTracker information
final GenomeLoc loc = activeRegion.getLocation();
// get all of the RODs that cover the active region (without extension)
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataForInterval(loc);
// trim away all of the features that occurred before this location, as we will not need them in the future
referenceOrderedDataView.trimCurrentFeaturesToLoc(loc);
return new MapData(activeRegion, tracker);
}
private class TraverseActiveRegionMap implements NSMapFunction<ActiveRegion, M> {
private class TraverseActiveRegionMap implements NSMapFunction<MapData, M> {
@Override
public M apply(final ActiveRegion activeRegion) {
if ( DEBUG ) logger.info("Executing walker.map for " + activeRegion + " in thread " + Thread.currentThread().getName());
return walker.map(activeRegion, null);
public M apply(final MapData mapData) {
if ( DEBUG ) logger.info("Executing walker.map for " + mapData.activeRegion + " in thread " + Thread.currentThread().getName());
return walker.map(mapData.activeRegion, mapData.tracker);
}
}

View File

@ -179,7 +179,7 @@ public class TraverseLociNano<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,
final ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location, refContext);
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location);
numIterations++;
return new MapData(locus, refContext, tracker);

View File

@ -180,9 +180,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality threshold in order to annotate mendelian violation ratio")
public double minGenotypeQualityP = 0.0;
@Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp tracks that exactly match both reference and alternate alleles will be counted as concordant", required=false)
protected boolean requireStrictAlleleMatch = false;
private VariantAnnotatorEngine engine;
/**
@ -204,7 +201,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
else
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit());
engine.initializeExpressions(expressionsToUse);
engine.setRequireStrictAlleleMatch(requireStrictAlleleMatch);
// setup the header fields
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones

View File

@ -34,26 +34,23 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.variant.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.variant.variantcontext.*;
import org.broadinstitute.variant.vcf.*;
import java.util.*;
public class VariantAnnotatorEngine {
private List<InfoFieldAnnotation> requestedInfoAnnotations = Collections.emptyList();
private List<GenotypeAnnotation> requestedGenotypeAnnotations = Collections.emptyList();
private List<VAExpression> requestedExpressions = new ArrayList<VAExpression>();
private List<VAExpression> requestedExpressions = new ArrayList<>();
private final HashMap<RodBinding<VariantContext>, String> dbAnnotations = new HashMap<RodBinding<VariantContext>, String>();
private final AnnotatorCompatible walker;
private final GenomeAnalysisEngine toolkit;
private boolean requireStrictAlleleMatch = false;
VariantOverlapAnnotator variantOverlapAnnotator = null;
protected static class VAExpression {
@ -85,7 +82,7 @@ public class VariantAnnotatorEngine {
requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations();
requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations();
excludeAnnotations(annotationsToExclude);
initializeDBs();
initializeDBs(toolkit);
}
// use this constructor if you want to select specific annotations (and/or interfaces)
@ -93,14 +90,7 @@ public class VariantAnnotatorEngine {
this.walker = walker;
this.toolkit = toolkit;
initializeAnnotations(annotationGroupsToUse, annotationsToUse, annotationsToExclude);
initializeDBs();
}
// experimental constructor for active region traversal
public VariantAnnotatorEngine(GenomeAnalysisEngine toolkit) {
this.walker = null;
this.toolkit = toolkit;
requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(Arrays.asList("ActiveRegionBasedAnnotation"), Collections.<String>emptyList());
initializeDBs(toolkit);
}
// select specific expressions to use
@ -138,16 +128,19 @@ public class VariantAnnotatorEngine {
requestedGenotypeAnnotations = tempRequestedGenotypeAnnotations;
}
private void initializeDBs() {
private void initializeDBs(final GenomeAnalysisEngine engine) {
// check to see whether comp rods were included
final RodBinding<VariantContext> dbsnp = walker.getDbsnpRodBinding();
if ( dbsnp != null && dbsnp.isBound() )
dbAnnotations.put(dbsnp, VCFConstants.DBSNP_KEY);
RodBinding<VariantContext> dbSNPBinding = walker.getDbsnpRodBinding();
if ( dbSNPBinding != null && ! dbSNPBinding.isBound() )
dbSNPBinding = null;
final List<RodBinding<VariantContext>> comps = walker.getCompRodBindings();
for ( RodBinding<VariantContext> rod : comps )
dbAnnotations.put(rod, rod.getName());
final Map<RodBinding<VariantContext>, String> overlapBindings = new LinkedHashMap<>();
for ( final RodBinding<VariantContext> b : walker.getCompRodBindings())
if ( b.isBound() ) overlapBindings.put(b, b.getName());
if ( dbSNPBinding != null && ! overlapBindings.keySet().contains(VCFConstants.DBSNP_KEY) )
overlapBindings.put(dbSNPBinding, VCFConstants.DBSNP_KEY); // add overlap detection with DBSNP by default
variantOverlapAnnotator = new VariantOverlapAnnotator(dbSNPBinding, overlapBindings, engine.getGenomeLocParser());
}
public void invokeAnnotationInitializationMethods( Set<VCFHeaderLine> headerLines ) {
@ -161,14 +154,13 @@ public class VariantAnnotatorEngine {
}
public Set<VCFHeaderLine> getVCFAnnotationDescriptions() {
Set<VCFHeaderLine> descriptions = new HashSet<VCFHeaderLine>();
for ( InfoFieldAnnotation annotation : requestedInfoAnnotations )
descriptions.addAll(annotation.getDescriptions());
for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations )
descriptions.addAll(annotation.getDescriptions());
for ( String db : dbAnnotations.values() ) {
for ( String db : variantOverlapAnnotator.getOverlapNames() ) {
if ( VCFStandardHeaderLines.getInfoLine(db, false) != null )
descriptions.add(VCFStandardHeaderLines.getInfoLine(db));
else
@ -178,10 +170,6 @@ public class VariantAnnotatorEngine {
return descriptions;
}
public void setRequireStrictAlleleMatch( final boolean requireStrictAlleleMatch ) {
this.requireStrictAlleleMatch = requireStrictAlleleMatch;
}
public VariantContext annotateContext(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
@ -192,13 +180,10 @@ public class VariantAnnotatorEngine {
public VariantContext annotateContext(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
VariantContext vc,
final VariantContext vc,
final Map<String,PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
// annotate db occurrences
vc = annotateDBs(tracker, ref.getLocus(), vc, infoAnnotations);
// annotate expressions where available
annotateExpressions(tracker, ref.getLocus(), infoAnnotations);
@ -213,11 +198,16 @@ public class VariantAnnotatorEngine {
VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations);
// annotate genotypes, creating another new VC in the process
return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap)).make();
final VariantContext annotated = builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap)).make();
// annotate db occurrences
return annotateDBs(tracker, annotated);
}
public VariantContext annotateContext(final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap, VariantContext vc) {
Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
public VariantContext annotateContextForActiveRegion(final RefMetaDataTracker tracker,
final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
final VariantContext vc) {
final Map<String, Object> infoAnnotations = new LinkedHashMap<>(vc.getAttributes());
// go through all the requested info annotationTypes
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
@ -231,76 +221,26 @@ public class VariantAnnotatorEngine {
}
// generate a new annotated VC
VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations);
final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations);
// annotate genotypes, creating another new VC in the process
return builder.genotypes(annotateGenotypes(null, null, null, vc, perReadAlleleLikelihoodMap)).make();
final VariantContext annotated = builder.genotypes(annotateGenotypes(null, null, null, vc, perReadAlleleLikelihoodMap)).make();
// annotate db occurrences
return annotateDBs(tracker, annotated);
}
/**
* Annotate the ID field and other DBs for the given Variant Context
*
* @param tracker ref meta data tracker (cannot be null)
* @param loc location of the vc
* @param vc variant context to annotate
* @return non-null annotated version of vc
*/
@Requires({"tracker != null && loc != null && vc != null"})
@Ensures("result != null")
public VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc) {
final Map<String, Object> newInfoAnnotations = new HashMap<String, Object>(0);
vc = annotateDBs(tracker, loc, vc, newInfoAnnotations);
if ( !newInfoAnnotations.isEmpty() ) {
final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(newInfoAnnotations);
vc = builder.make();
}
return vc;
}
/**
* Annotate the ID field and other DBs for the given Variant Context
*
* @param tracker ref meta data tracker (cannot be null)
* @param loc location of the vc
* @param vc variant context to annotate
* @param infoAnnotations info annotation map to populate
* @return non-null annotated version of vc
*/
@Requires({"tracker != null && loc != null && vc != null && infoAnnotations != null"})
@Ensures("result != null")
private VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc, final Map<String, Object> infoAnnotations) {
for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
final String rsID = GATKVCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), loc), vc.getType());
// add the ID if appropriate
if ( rsID != null ) {
// put the DB key into the INFO field
infoAnnotations.put(VCFConstants.DBSNP_KEY, true);
if ( vc.emptyID() ) {
vc = new VariantContextBuilder(vc).id(rsID).make();
} else if ( walker.alwaysAppendDbsnpId() && vc.getID().indexOf(rsID) == -1 ) {
final String newRsID = vc.getID() + VCFConstants.ID_FIELD_SEPARATOR + rsID;
vc = new VariantContextBuilder(vc).id(newRsID).make();
}
}
} else {
boolean overlapsComp = false;
for ( VariantContext comp : tracker.getValues(dbSet.getKey(), loc) ) {
if ( !comp.isFiltered() && ( !requireStrictAlleleMatch || comp.getAlleles().equals(vc.getAlleles()) ) ) {
overlapsComp = true;
break;
}
}
if ( overlapsComp )
infoAnnotations.put(dbSet.getValue(), overlapsComp);
}
}
return vc;
/**
 * Annotate the ID field and overlap FLAG attributes of vc via the VariantOverlapAnnotator.
 *
 * Delegates to annotateRsID first (updates the VCF ID field from the dbSNP binding)
 * and then annotateOverlaps (adds name => true attributes for overlapping comp tracks).
 *
 * @param tracker ref meta data tracker providing the dbSNP / comp records at vc's location
 * @param vc the variant context to annotate
 * @return a VariantContext (possibly == vc) with ID and overlap annotations applied
 */
private VariantContext annotateDBs(final RefMetaDataTracker tracker, VariantContext vc) {
return variantOverlapAnnotator.annotateOverlaps(tracker, variantOverlapAnnotator.annotateRsID(tracker, vc));
}
private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map<String, Object> infoAnnotations) {

View File

@ -0,0 +1,224 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
import org.broadinstitute.variant.vcf.VCFConstants;
import java.util.*;
/**
 * Annotate the ID field and attribute overlap FLAGs for a VariantContext against a RefMetaDataTracker or a list
 * of VariantContexts
 */
public final class VariantOverlapAnnotator {
    // binding used to fetch rsID source records for ID-field updates; null disables rsID annotation
    final RodBinding<VariantContext> dbSNPBinding;

    // maps each overlap source binding to the INFO attribute key set to true when an overlap is found
    final Map<RodBinding<VariantContext>, String> overlapBindings;

    // used to convert a VariantContext into a GenomeLoc for tracker queries
    final GenomeLocParser genomeLocParser;

    /**
     * Create a new VariantOverlapAnnotator without overlap bindings
     *
     * @param dbSNPBinding the RodBinding to use for updating ID field values, or null if that behavior isn't desired
     * @param genomeLocParser the genome loc parser we'll use to create GenomeLocs for VariantContexts
     * @see #VariantOverlapAnnotator(org.broadinstitute.sting.commandline.RodBinding, java.util.Map, org.broadinstitute.sting.utils.GenomeLocParser)
     */
    public VariantOverlapAnnotator(RodBinding<VariantContext> dbSNPBinding, GenomeLocParser genomeLocParser) {
        this(dbSNPBinding, Collections.<RodBinding<VariantContext>, String>emptyMap(), genomeLocParser);
    }

    /**
     * Create a new VariantOverlapAnnotator
     *
     * @param dbSNPBinding the RodBinding to use for updating ID field values, or null if that behavior isn't desired
     * @param overlapBindings a map of RodBindings / name to use for overlap annotation.  Each binding will be used to
     *                        add name => true for variants that overlap with variants found to a
     *                        RefMetaDataTracker at each location.  Can be empty but not null
     * @param genomeLocParser the genome loc parser we'll use to create GenomeLocs for VariantContexts
     */
    public VariantOverlapAnnotator(RodBinding<VariantContext> dbSNPBinding, Map<RodBinding<VariantContext>, String> overlapBindings, GenomeLocParser genomeLocParser) {
        if ( overlapBindings == null ) throw new IllegalArgumentException("overlapBindings cannot be null");
        if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null");

        this.dbSNPBinding = dbSNPBinding;
        this.overlapBindings = overlapBindings;
        this.genomeLocParser = genomeLocParser;
    }

    /**
     * Update rsID in vcToAnnotate with rsIDs from dbSNPBinding fetched from tracker
     * @see #annotateRsID(java.util.List, org.broadinstitute.variant.variantcontext.VariantContext)
     *
     * @param tracker non-null tracker, which we will use to update the rsID of vcToAnnotate
     *                for VariantContexts bound to dbSNPBinding that start at vcToAnnotate
     * @param vcToAnnotate a variant context to annotate
     * @return a VariantContext (may be == to vcToAnnotate) with updated rsID value
     */
    public VariantContext annotateRsID(final RefMetaDataTracker tracker, final VariantContext vcToAnnotate) {
        if ( dbSNPBinding != null ) {
            final GenomeLoc loc = getLoc(vcToAnnotate);
            return annotateRsID(tracker.getValues(dbSNPBinding, loc), vcToAnnotate);
        } else {
            // no dbSNP binding was provided, so ID annotation is disabled
            return vcToAnnotate;
        }
    }

    /**
     * Update rsID of vcToAnnotate with rsID match found in vcsAtLoc, if one exists
     *
     * @param vcsAtLoc a list of variant contexts starting at this location to use as sources for rsID values
     * @param vcToAnnotate a variant context to annotate
     * @return a VariantContext (may be == to vcToAnnotate) with updated rsID value
     */
    public VariantContext annotateRsID(final List<VariantContext> vcsAtLoc, final VariantContext vcToAnnotate ) {
        final String rsID = getRsID(vcsAtLoc, vcToAnnotate);

        // add the ID if appropriate
        if ( rsID != null ) {
            final VariantContextBuilder vcb = new VariantContextBuilder(vcToAnnotate);

            if ( ! vcToAnnotate.hasID() ) {
                return vcb.id(rsID).make();
            } else if ( ! vcToAnnotate.getID().contains(rsID) ) {
                // append the new rsID to any IDs already present
                return vcb.id(vcToAnnotate.getID() + VCFConstants.ID_FIELD_SEPARATOR + rsID).make();
            } // falling through to return VC lower down
        }

        // nothing to do, just return vc
        return vcToAnnotate;
    }

    /**
     * Create a GenomeLoc spanning vc using our genomeLocParser
     */
    private GenomeLoc getLoc(final VariantContext vc) {
        return genomeLocParser.createGenomeLoc(vc);
    }

    /**
     * Add overlap attributes to vcToAnnotate against all overlapBindings in tracker
     *
     * @see #annotateOverlap(java.util.List, String, org.broadinstitute.variant.variantcontext.VariantContext)
     *      for more information
     *
     * @param tracker non-null tracker, which we will use to update the rsID of vcToAnnotate
     *                for VariantContexts bound to dbSNPBinding that start at vcToAnnotate
     * @param vcToAnnotate a variant context to annotate
     * @return a VariantContext (may be == to vcToAnnotate) with updated overlaps update fields value
     */
    public VariantContext annotateOverlaps(final RefMetaDataTracker tracker, final VariantContext vcToAnnotate) {
        if ( overlapBindings.isEmpty() ) return vcToAnnotate;

        VariantContext annotated = vcToAnnotate;
        final GenomeLoc loc = getLoc(vcToAnnotate);
        for ( final Map.Entry<RodBinding<VariantContext>, String> overlapBinding : overlapBindings.entrySet() ) {
            // chain the accumulated result through each binding so that attributes added for
            // earlier bindings are preserved (passing vcToAnnotate here would keep only the
            // last binding's flag)
            annotated = annotateOverlap(tracker.getValues(overlapBinding.getKey(), loc), overlapBinding.getValue(), annotated);
        }

        return annotated;
    }

    /**
     * Add overlaps flag attributes to vcToAnnotate binding overlapTestVCs.getSource() => true if
     * an overlapping variant context can be found in overlapTestVCs with vcToAnnotate
     *
     * Overlaps here means that the reference alleles are the same and at least one alt
     * allele in vcToAnnotate is equals to one of the alt alleles in overlapTestVCs
     *
     * @param overlapTestVCs a non-null list of potential overlaps that start at vcToAnnotate
     * @param attributeKey the key to set to true in the attribute map for vcToAnnotate if it overlaps
     * @param vcToAnnotate a non-null VariantContext to annotate
     * @return a VariantContext (may be == to vcToAnnotate) with the attributeKey => true attribute
     *         added if an overlap was found
     */
    public VariantContext annotateOverlap(final List<VariantContext> overlapTestVCs, final String attributeKey, VariantContext vcToAnnotate) {
        // NOTE(review): this guard makes direct calls no-ops when no overlap bindings were
        // configured, even though overlapTestVCs is passed in explicitly; kept for compatibility
        if ( overlapBindings.isEmpty() ) return vcToAnnotate;

        final boolean overlaps = overlaps(overlapTestVCs, vcToAnnotate);
        if ( overlaps ) {
            return new VariantContextBuilder(vcToAnnotate).attribute(attributeKey, true).make();
        } else {
            return vcToAnnotate;
        }
    }

    /**
     * Returns the ID field of the first VariantContext in rsIDSourceVCs that has the same reference allele
     * as vcToAnnotate and at least one alternative allele in common with vcToAnnotate.
     *
     * Doesn't require vcToAnnotate to be a complete match, so
     *
     * A/C/G in VC in rsIDSourceVCs
     *
     * would match the a VC with A/C but not A/T.  Also we don't require all alleles to match
     * so we would also match A/C/T to A/C/G.
     *
     * Will only match rsIDSourceVCs that aren't failing filters.
     *
     * @param rsIDSourceVCs a non-null list of potential overlaps that start at vcToAnnotate
     * @param vcToAnnotate a non-null VariantContext to annotate
     * @return a String to use for the rsID from rsIDSourceVCs if one matches, or null if none matches
     */
    private String getRsID(final List<VariantContext> rsIDSourceVCs, final VariantContext vcToAnnotate) {
        if ( rsIDSourceVCs == null ) throw new IllegalArgumentException("rsIDSourceVCs cannot be null");
        if ( vcToAnnotate == null ) throw new IllegalArgumentException("vcToAnnotate cannot be null");

        for ( final VariantContext vcComp : rsIDSourceVCs ) {
            if ( vcComp.isFiltered() ) continue; // don't process any failed VCs

            if ( ! vcComp.getChr().equals(vcToAnnotate.getChr()) || vcComp.getStart() != vcToAnnotate.getStart() )
                throw new IllegalArgumentException("source rsID VariantContext " + vcComp + " doesn't start at the same position as vcToAnnotate " + vcToAnnotate);

            if ( vcToAnnotate.getReference().equals(vcComp.getReference()) ) {
                for ( final Allele allele : vcToAnnotate.getAlternateAlleles() ) {
                    if ( vcComp.getAlternateAlleles().contains(allele) )
                        return vcComp.getID();
                }
            }
        }

        return null;
    }

    /**
     * Does vcToAnnotate overlap with any of the records in potentialOverlaps?
     *
     * @param potentialOverlaps a non-null list of potential overlaps that start at vcToAnnotate
     * @param vcToAnnotate a non-null VariantContext to annotate
     * @return true if vcToAnnotate overlaps (position and at least one alt allele) with some variant in potentialOverlaps
     */
    private boolean overlaps(final List<VariantContext> potentialOverlaps, final VariantContext vcToAnnotate) {
        return getRsID(potentialOverlaps, vcToAnnotate) != null;
    }

    /**
     * Get the collection of the RodBinding names for those being used for overlap detection
     * @return a non-null collection of Strings
     */
    public Collection<String> getOverlapNames() {
        return overlapBindings.values();
    }
}

View File

@ -39,6 +39,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantOverlapAnnotator;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
@ -112,24 +113,21 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
// for dealing with indels in hapmap
CloseableIterator<GATKFeature> dbsnpIterator = null;
VariantOverlapAnnotator variantOverlapAnnotator = null;
public void initialize() {
vcfwriter = VariantContextWriterFactory.sortOnTheFly(baseWriter, 40, false);
variantOverlapAnnotator = new VariantOverlapAnnotator(dbsnp.dbsnp, getToolkit().getGenomeLocParser());
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) )
return 0;
String rsID = dbsnp == null ? null : GATKVCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbsnp.dbsnp, context.getLocation()), VariantContext.Type.SNP);
Collection<VariantContext> contexts = getVariantContexts(tracker, ref);
for ( VariantContext vc : contexts ) {
VariantContextBuilder builder = new VariantContextBuilder(vc);
if ( rsID != null && vc.emptyID() ) {
builder.id(rsID).make();
}
// set the appropriate sample name if necessary
if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(variants.getName()) ) {
@ -137,7 +135,8 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
builder.genotypes(g);
}
writeRecord(builder.make(), tracker, ref.getLocus());
final VariantContext withID = variantOverlapAnnotator.annotateRsID(tracker, builder.make());
writeRecord(withID, tracker, ref.getLocus());
}
return 1;

View File

@ -149,21 +149,6 @@ public class GATKVCFUtils {
return VCFUtils.withUpdatedContigs(header, engine.getArguments().referenceFile, engine.getMasterSequenceDictionary());
}
/**
 * Return the ID field of the first VariantContext in VCs whose type equals the
 * requested type.
 *
 * @param VCs the variant contexts to search; may be null
 * @param type the VariantContext.Type a record must have for its ID to be returned
 * @return the ID of the first matching record, or null if VCs is null or no record matches
 */
public static String rsIDOfFirstRealVariant(List<VariantContext> VCs, VariantContext.Type type) {
    if ( VCs == null )
        return null;

    // return the ID of the first record of the requested type, if any
    for ( final VariantContext vc : VCs ) {
        if ( vc.getType() == type )
            return vc.getID();
    }

    return null;
}
/**
* Utility class to read all of the VC records from a file
*

View File

@ -256,7 +256,6 @@ public class ReadMetricsUnitTest extends BaseTest {
}
windowMaker.close();
}
traverseActiveRegions.endTraversal(walker, 0);
Assert.assertEquals(engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig);
Assert.assertEquals(engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig);

View File

@ -49,7 +49,7 @@ import java.util.*;
/**
* @author depristo
*/
public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest {
public class IntervalReferenceOrderedViewUnitTest extends BaseTest {
private static int startingChr = 1;
private static int endingChr = 2;
private static int readCount = 100;
@ -285,7 +285,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest {
Collections.sort(intervals);
final GenomeLoc span = span(intervals);
final ReadBasedReferenceOrderedView view = new ReadBasedReferenceOrderedView(genomeLocParser, span, names, iterators);
final IntervalReferenceOrderedView view = new IntervalReferenceOrderedView(genomeLocParser, span, names, iterators);
if ( testStateless ) {
// test each tracker is well formed, as each is created

View File

@ -97,7 +97,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.<ReferenceOrderedDataSource>emptyList());
ReferenceOrderedView view = new ManagingReferenceOrderedView( provider );
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",10), null);
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",10));
Assert.assertEquals(tracker.getValues(Feature.class).size(), 0, "The tracker should not have produced any data");
}
@ -115,7 +115,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.singletonList(dataSource));
ReferenceOrderedView view = new ManagingReferenceOrderedView( provider );
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null);
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20));
TableFeature datum = tracker.getFirstValue(new RodBinding<TableFeature>(TableFeature.class, "tableTest"));
Assert.assertEquals(datum.get("COL1"),"C","datum parameter for COL1 is incorrect");
@ -141,7 +141,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Arrays.asList(dataSource1,dataSource2));
ReferenceOrderedView view = new ManagingReferenceOrderedView( provider );
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null);
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20));
TableFeature datum1 = tracker.getFirstValue(new RodBinding<TableFeature>(TableFeature.class, "tableTest1"));
Assert.assertEquals(datum1.get("COL1"),"C","datum1 parameter for COL1 is incorrect");

View File

@ -405,8 +405,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam))
t.traverse(walker, dataProvider, 0);
t.endTraversal(walker, 0);
return walker.mappedActiveRegions;
}

View File

@ -32,6 +32,7 @@ import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.qc.CountReads;
@ -47,6 +48,7 @@ import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import static org.testng.Assert.fail;
@ -146,7 +148,7 @@ public class TraverseReadsUnitTest extends BaseTest {
fail("Shard == null");
}
ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null);
ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null, Collections.<ReferenceOrderedDataSource>emptyList());
accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator);
dataProvider.close();
}