diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index cbcba28fd..04173b64f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -49,6 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; @@ -146,6 +147,7 @@ public class GenotypingEngine { final GenomeLoc refLoc, final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser, + final RefMetaDataTracker tracker, final List activeAllelesToGenotype ) { // sanity check input arguments if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); @@ -204,7 +206,7 @@ public class GenotypingEngine { convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) ); final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); - VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call); + VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call); if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 182e59493..e55413649 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -441,7 +441,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private final static int MIN_READ_LENGTH = 10; private List samplesList = new ArrayList(); - private final List allelesToGenotype = new ArrayList(); private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file @@ -596,7 +595,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { final VariantContext vcFromAllelesRod = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), false, logger, UG_engine.getUAC().alleles); if( vcFromAllelesRod != null ) { - allelesToGenotype.add(vcFromAllelesRod); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object return new ActivityProfileState(ref.getLocus(), 1.0); } } @@ -664,12 +662,11 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final List activeAllelesToGenotype = new ArrayList<>(); if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - for( final VariantContext vc : allelesToGenotype ) { - if( originalActiveRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) { + for ( final VariantContext vc : metaDataTracker.getValues(UG_engine.getUAC().alleles) ) { + if ( vc.isNotFiltered() ) { activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode } } - allelesToGenotype.removeAll( activeAllelesToGenotype ); // No alleles found in this region so nothing to do! if ( activeAllelesToGenotype.isEmpty() ) { return NO_CALLS; } } else { @@ -704,6 +701,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In assemblyResult.paddedReferenceLoc, assemblyResult.regionForGenotyping.getLocation(), getToolkit().getGenomeLocParser(), + metaDataTracker, activeAllelesToGenotype ); // TODO -- must disable if we are doing NCT, or set the output type of ! presorted @@ -890,8 +888,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Override public Integer reduce(List callsInRegion, Integer numCalledRegions) { for( final VariantContext call : callsInRegion ) { - // TODO -- uncomment this line once ART-based walkers have a proper RefMetaDataTracker. - // annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call); vcfWriter.add( call ); } return (callsInRegion.isEmpty() ? 0 : 1) + numCalledRegions; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotatorUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotatorUnitTest.java new file mode 100644 index 000000000..6d6761f1c --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotatorUnitTest.java @@ -0,0 +1,164 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +public class VariantOverlapAnnotatorUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + private IndexedFastaSequenceFile seq; + + @BeforeClass + public void setup() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + genomeLocParser = new GenomeLocParser(seq); + } + + private VariantContext makeVC(final String source, final String id, final List alleles) { + final VariantContext vc = GATKVariantContextUtils.makeFromAlleles(source, "20", 10, alleles); + return new VariantContextBuilder(vc).id(id).make(); + } + + private VariantOverlapAnnotator makeAnnotator(final String dbSNP, final String ... overlaps) { + final RodBinding dbSNPBinding = dbSNP == null ? null : new RodBinding<>(VariantContext.class, dbSNP); + final Map, String> overlapBinding = new LinkedHashMap<>(); + for ( final String overlap : overlaps ) overlapBinding.put(new RodBinding<>(VariantContext.class, overlap), overlap); + return new VariantOverlapAnnotator(dbSNPBinding, overlapBinding, genomeLocParser); + } + + @Test + public void testCreateWithSpecialNames() { + final List names = Arrays.asList("X", "Y", "Z"); + final Map, String> overlapBinding = new LinkedHashMap<>(); + for ( final String overlap : names ) overlapBinding.put(new RodBinding<>(VariantContext.class, overlap + "Binding"), overlap); + final VariantOverlapAnnotator annotator = new VariantOverlapAnnotator(null, overlapBinding, genomeLocParser); + Assert.assertEquals(annotator.getOverlapNames(), names); + } + + @DataProvider(name = "AnnotateRsIDData") + public Object[][] makeAnnotateRsIDData() { + List tests = new ArrayList<>(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final VariantContext callNoIDAC = makeVC("call", VCFConstants.EMPTY_ID_FIELD, Arrays.asList("A", "C")); + final VariantContext callNoIDAT = makeVC("call", VCFConstants.EMPTY_ID_FIELD, Arrays.asList("A", "T")); + final VariantContext callIDAC = makeVC("call", "foo", Arrays.asList("A", "C")); + final VariantContext callExistingIDAC = makeVC("call", "rsID1", Arrays.asList("A", "C")); + + final VariantContext dbSNP_AC = makeVC("DBSNP", "rsID1", Arrays.asList("A", "C")); + final VariantContext dbSNP_AT = makeVC("DBSNP", "rsID2", Arrays.asList("A", "T")); + final VariantContext dbSNP_AG = makeVC("DBSNP", "rsID3", Arrays.asList("A", "G")); + final VariantContext dbSNP_AC_AT = makeVC("DBSNP", "rsID1;rsID2", Arrays.asList("A", "C", "T")); + final VariantContext dbSNP_AC_AG = makeVC("DBSNP", "rsID1;rsID3", Arrays.asList("A", "C", "G")); + + tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC), dbSNP_AC.getID(), true}); + tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AT), VCFConstants.EMPTY_ID_FIELD, false}); + tests.add(new Object[]{callIDAC, Arrays.asList(dbSNP_AC), "foo" + ";" + dbSNP_AC.getID(), true}); + tests.add(new Object[]{callIDAC, Arrays.asList(dbSNP_AT), "foo", false}); + tests.add(new Object[]{callExistingIDAC, Arrays.asList(dbSNP_AC), "rsID1", true}); + tests.add(new Object[]{callExistingIDAC, Arrays.asList(dbSNP_AT), "rsID1", false}); + + final VariantContext callNoIDACT = makeVC("call", VCFConstants.EMPTY_ID_FIELD, Arrays.asList("A", "C", "T")); + tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AC), dbSNP_AC.getID(), true}); + tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AT), dbSNP_AT.getID(), true}); + tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AG), VCFConstants.EMPTY_ID_FIELD, false}); + tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AC_AT), dbSNP_AC_AT.getID(), true}); + tests.add(new Object[]{callNoIDACT, Arrays.asList(dbSNP_AC_AG), dbSNP_AC_AG.getID(), true}); + + // multiple options + tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC, dbSNP_AT), "rsID1", true}); + tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AT, dbSNP_AC), "rsID1", true}); + tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC_AT), "rsID1;rsID2", true}); + tests.add(new Object[]{callNoIDAT, Arrays.asList(dbSNP_AC_AT), "rsID1;rsID2", true}); + tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC_AG), "rsID1;rsID3", true}); + tests.add(new Object[]{callNoIDAT, Arrays.asList(dbSNP_AC_AG), VCFConstants.EMPTY_ID_FIELD, false}); + + final VariantContext dbSNP_AC_FAIL = new VariantContextBuilder(makeVC("DBSNP", "rsID1", Arrays.asList("A", "C"))).filter("FAIL").make(); + tests.add(new Object[]{callNoIDAC, Arrays.asList(dbSNP_AC_FAIL), VCFConstants.EMPTY_ID_FIELD, false}); + + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AnnotateRsIDData") + public void testAnnotateRsID(final VariantContext toAnnotate, final List dbSNPRecords, final String expectedID, final boolean expectOverlap) throws Exception { + final VariantOverlapAnnotator annotator = makeAnnotator("dbnsp"); + final VariantContext annotated = annotator.annotateRsID(dbSNPRecords, toAnnotate); + Assert.assertNotNull(annotated); + Assert.assertEquals(annotated.getID(), expectedID); + } + + @Test(dataProvider = "AnnotateRsIDData") + public void testAnnotateOverlaps(final VariantContext toAnnotate, final List records, final String expectedID, final boolean expectOverlap) throws Exception { + final String name = "binding"; + final VariantOverlapAnnotator annotator = makeAnnotator(null, name); + final VariantContext annotated = annotator.annotateOverlap(records, name, toAnnotate); + Assert.assertNotNull(annotated); + Assert.assertEquals(annotated.getID(), toAnnotate.getID(), "Shouldn't modify annotation"); + Assert.assertEquals(annotated.hasAttribute(name), expectOverlap); + if ( expectOverlap ) { + Assert.assertEquals(annotated.getAttribute(name), true); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 856e97ebe..98a482c6f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("19f77f557150905ef3fa4713f611a1b9")); + Arrays.asList("14ad6eeed46e9b6f4757370267b1a1cc")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("bb3dbad9666ebf38d338f0c9c211a42e")); + Arrays.asList("cd184a2a5a1932dcf3e8f0424652176b")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -111,7 +111,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("8052390ca2b6a57c3ddf379a51225d64")); + Arrays.asList("e8d98996eb81ece8cfb52437920ae2e0")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -121,7 +121,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("b6b9dba97fbabaeeb458a41051983e7b")); + Arrays.asList("23a78c16f64bffe1dea3a5587fcabdad")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("38730c7030271f5d0ca0b59365d57814")); + Arrays.asList("294183823d678d3668f4fa98b4de6e06")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("264325878b988acc11d8e5d9d2ba0b7f")); + Arrays.asList("e90256acfc360fc4bf377094732a673a")); executeTest("test minIndelFraction 0.0", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index a52176a08..bf4316415 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -80,7 +80,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("698e54aeae3130779d246b9480a4052c")); + Arrays.asList("60115af273fde49c76d4df6c9c0f6501")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 77be9fba2..904f15728 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -47,15 +47,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broad.tribble.TribbleIndexedFeatureReader; import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFCodec; import org.testng.annotations.Test; import java.io.File; @@ -69,6 +66,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; + final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam"; final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; @@ -199,4 +197,27 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { Arrays.asList("86bdd07a3ac4f6ce239c30efea8bf5ba")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // test dbSNP annotation + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestDBSNPAnnotationWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, + Arrays.asList("7b23a288a31cafca3946f14f2381e7cb")); + executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); + } + + @Test + public void HCTestDBSNPAnnotationWEx() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + + " -L " + hg19Intervals + " -isr INTERSECTION", 1, + Arrays.asList("9587029b702bb59bd4dfec69eac4c210")); + executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java index fe3a0c6ce..3aff745fa 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java @@ -72,8 +72,6 @@ class IntervalOverlappingRODsFromStream { /** * Get the list of RODs overlapping loc from this stream of RODs. * - * Sequential calls to this function must obey the rule that loc2.getStart >= loc1.getStart - * * @param loc the interval to query * @return a non-null RODRecordList containing the overlapping RODs, which may be empty */ @@ -84,7 +82,6 @@ class IntervalOverlappingRODsFromStream { if ( lastQuery != null && loc.getStart() < lastQuery.getStart() ) throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery)); - trimCurrentFeaturesToLoc(loc); readOverlappingFutureFeatures(loc); return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc); } @@ -128,11 +125,14 @@ class IntervalOverlappingRODsFromStream { /** * Update function. Remove all elements of currentFeatures that end before loc * + * Must be called by clients periodically when they know they they will never ask for data before + * loc, so that the running cache of RODs doesn't grow out of control. + * * @param loc the location to use */ @Requires("loc != null") @Ensures("currentFeatures.size() <= old(currentFeatures.size())") - private void trimCurrentFeaturesToLoc(final GenomeLoc loc) { + public void trimCurrentFeaturesToLoc(final GenomeLoc loc) { final ListIterator it = currentFeatures.listIterator(); while ( it.hasNext() ) { final GATKFeature feature = it.next(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedView.java new file mode 100644 index 000000000..5e884ce53 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedView.java @@ -0,0 +1,184 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.datasources.providers; + +import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * a ROD view that allows for requests for RODs that overlap intervals on the genome to produce a RefMetaDataTracker + */ +public class IntervalReferenceOrderedView implements ReferenceOrderedView { + /** a list of the RMDDataState (location->iterators) */ + private final List states = new ArrayList<>(1); + + /** + * Used to get genome locs for reads + */ + protected final GenomeLocParser genomeLocParser; + + /** + * The total extent of all reads in this span. We create iterators from our RODs + * from the start of this span, to the end. + */ + private final GenomeLoc shardSpan; + + /** + * Create a new IntervalReferenceOrderedView taking data from provider and capable of + * servicing ROD overlap requests within the genomic interval span + * + * @param provider a ShardDataProvider to give us data + * @param span a GenomeLoc span, or null indicating take the entire genome + */ + public IntervalReferenceOrderedView(final ShardDataProvider provider, final GenomeLoc span) { + if ( provider == null ) throw new IllegalArgumentException("provider cannot be null"); + if ( provider.hasReferenceOrderedData() && span == null ) throw new IllegalArgumentException("span cannot be null when provider has reference ordered data"); + + this.genomeLocParser = provider.getGenomeLocParser(); + this.shardSpan = span; + provider.register(this); + + // conditional to optimize the case where we don't have any ROD data + if ( provider.hasReferenceOrderedData() && ! shardSpan.isUnmapped() ) { + for (final ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData()) + states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan))); + } + } + + /** + * Testing constructor + */ + protected IntervalReferenceOrderedView(final GenomeLocParser genomeLocParser, + final GenomeLoc shardSpan, + final List names, + final List> featureSources) { + this.genomeLocParser = genomeLocParser; + this.shardSpan = shardSpan; + for ( int i = 0; i < names.size(); i++ ) + states.add(new RMDDataState(names.get(i), featureSources.get(i))); + } + + public Collection> getConflictingViews() { + List> classes = new ArrayList<>(); + classes.add(ManagingReferenceOrderedView.class); + return classes; + } + + /** + * Get a RefMetaDataTracker containing bindings for all RODs overlapping the start position of loc + * @param loc a GenomeLoc of size == 1 + * @return a non-null RefMetaDataTracker + */ + @Override + public RefMetaDataTracker getReferenceOrderedDataAtLocus(GenomeLoc loc) { + if ( loc == null ) throw new IllegalArgumentException("loc cannot be null"); + if ( loc.size() != 1 ) throw new IllegalArgumentException("GenomeLoc must have size == 1 but got " + loc); + return getReferenceOrderedDataForInterval(loc); + } + + /** + * Get a RefMetaDataTracker containing bindings for all RODs overlapping interval + * + * @param interval a non=null interval + * @return a non-null RefMetaDataTracker + */ + public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { + if ( interval == null ) throw new IllegalArgumentException("Interval cannot be null"); + + if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers) + return RefMetaDataTracker.EMPTY_TRACKER; + else { + final List bindings = new ArrayList<>(states.size()); + for ( final RMDDataState state : states ) + bindings.add(state.stream.getOverlapping(interval)); + return new RefMetaDataTracker(bindings); + } + } + + /** + * Trim down all of the ROD managers so that they only hold ROD bindings wit start >= startOfDataToKeep.getStart() + * + * @param startOfDataToKeep a non-null genome loc + */ + public void trimCurrentFeaturesToLoc(final GenomeLoc startOfDataToKeep) { + if ( startOfDataToKeep == null ) throw new IllegalArgumentException("startOfDataToKeep cannot be null"); + + for ( final RMDDataState state : states ) + state.stream.trimCurrentFeaturesToLoc(startOfDataToKeep); + } + + /** + * Closes the current view. + */ + public void close() { + for (final RMDDataState state : states) + state.close(); + + // Clear out the existing data so that post-close() accesses to this data will fail-fast. + states.clear(); + } + + /** + * Models the traversal state of a given ROD lane. + */ + private static class RMDDataState { + public final ReferenceOrderedDataSource dataSource; + public final IntervalOverlappingRODsFromStream stream; + private final LocationAwareSeekableRODIterator iterator; + + public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) { + this.dataSource = dataSource; + this.iterator = iterator; + this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator<>(iterator)); + } + + /** + * For testing + */ + public RMDDataState(final String name, final PeekableIterator iterator) { + this.dataSource = null; + this.iterator = null; + this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator<>(iterator)); + } + + public void close() { + if ( dataSource != null ) + dataSource.close( iterator ); + } + } +} + diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java index 09b72f5eb..50f2369cb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java @@ -76,7 +76,8 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView { * @param loc Locus at which to track. * @return A tracker containing information about this locus. */ - public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) { + @Override + public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { if ( states.isEmpty() ) return RefMetaDataTracker.EMPTY_TRACKER; else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java index 52f490972..84e27c953 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java @@ -42,52 +42,9 @@ import java.util.Collection; import java.util.List; /** a ROD view for reads. This provides the Read traversals a way of getting a RefMetaDataTracker */ -public class ReadBasedReferenceOrderedView implements View { - // a list of the RMDDataState (location->iterators) - private final List states = new ArrayList(1); - private final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker(); - - /** - * Used to get genome locs for reads - */ - private final GenomeLocParser genomeLocParser; - - /** - * The total extent of all reads in this span. We create iterators from our RODs - * from the start of this span, to the end. - */ - private final GenomeLoc shardSpan; - +public class ReadBasedReferenceOrderedView extends IntervalReferenceOrderedView { public ReadBasedReferenceOrderedView(final ShardDataProvider provider) { - this.genomeLocParser = provider.getGenomeLocParser(); - // conditional to optimize the case where we don't have any ROD data - this.shardSpan = provider.getReferenceOrderedData() != null ? ((ReadShard)provider.getShard()).getReadsSpan() : null; - provider.register(this); - - if ( provider.getReferenceOrderedData() != null && ! shardSpan.isUnmapped() ) { - for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData()) - states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan))); - } - } - - - /** - * Testing constructor - */ - protected ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser, - final GenomeLoc shardSpan, - final List names, - final List> featureSources) { - this.genomeLocParser = genomeLocParser; - this.shardSpan = shardSpan; - for ( int i = 0; i < names.size(); i++ ) - states.add(new RMDDataState(names.get(i), featureSources.get(i))); - } - - public Collection> getConflictingViews() { - List> classes = new ArrayList>(); - classes.add(ManagingReferenceOrderedView.class); - return classes; + super(provider, provider.hasReferenceOrderedData() ? ((ReadShard)provider.getShard()).getReadsSpan() : null); } /** @@ -101,60 +58,11 @@ public class ReadBasedReferenceOrderedView implements View { @Ensures("result != null") public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) { if ( rec.getReadUnmappedFlag() ) - // empty RODs for unmapped reads - return new RefMetaDataTracker(); - else - return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec)); - } - - @Requires({"interval != null", "shardSpan == null || shardSpan.isUnmapped() || shardSpan.containsP(interval)"}) - @Ensures("result != null") - public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { - if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers) - return EMPTY_TRACKER; + return RefMetaDataTracker.EMPTY_TRACKER; else { - final List bindings = new ArrayList(states.size()); - for ( final RMDDataState state : states ) - bindings.add(state.stream.getOverlapping(interval)); - return new RefMetaDataTracker(bindings); - } - } - - /** - * Closes the current view. - */ - public void close() { - for (final RMDDataState state : states) - state.close(); - - // Clear out the existing data so that post-close() accesses to this data will fail-fast. - states.clear(); - } - - /** Models the traversal state of a given ROD lane. */ - private static class RMDDataState { - public final ReferenceOrderedDataSource dataSource; - public final IntervalOverlappingRODsFromStream stream; - private final LocationAwareSeekableRODIterator iterator; - - public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) { - this.dataSource = dataSource; - this.iterator = iterator; - this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator(iterator)); - } - - /** - * For testing - */ - public RMDDataState(final String name, final PeekableIterator iterator) { - this.dataSource = null; - this.iterator = null; - this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator(iterator)); - } - - public void close() { - if ( dataSource != null ) - dataSource.close( iterator ); + final GenomeLoc readSpan = genomeLocParser.createGenomeLoc(rec); + trimCurrentFeaturesToLoc(readSpan); + return getReferenceOrderedDataForInterval(readSpan); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java index fa83dff82..85c20a6c3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java @@ -25,10 +25,9 @@ package org.broadinstitute.sting.gatk.datasources.providers; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; public interface ReferenceOrderedView extends View { - RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext refContext ); + RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java index 3fb4c7352..1b6c14628 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java @@ -98,7 +98,8 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { rodQueue = new RODMergingIterator(iterators); } - public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) { + @Override + public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { // special case the interval again -- add it into the ROD if ( interval != null ) { allTracksHere.add(interval); } return new RefMetaDataTracker(allTracksHere); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 415049228..dc46849df 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -37,7 +37,6 @@ import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.traversals.TraversalEngine; -import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; @@ -114,12 +113,6 @@ public class LinearMicroScheduler extends MicroScheduler { done = walker.isDone(); } - // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine - if( traversalEngine instanceof TraverseActiveRegions) { - final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); - accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator - } - Object result = accumulator.finishTraversal(); outputTracker.close(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index cac93cb07..b85365366 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -29,14 +29,12 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionTraversalParameters; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; @@ -92,12 +90,26 @@ public final class TraverseActiveRegions extends TraversalEngine walker; - final NanoScheduler nanoScheduler; + final NanoScheduler nanoScheduler; + + /** + * Data to use in the ActiveRegionWalker.map function produced by the NanoScheduler input iterator + */ + private static class MapData { + public ActiveRegion activeRegion; + public RefMetaDataTracker tracker; + + private MapData(ActiveRegion activeRegion, RefMetaDataTracker tracker) { + this.activeRegion = activeRegion; + this.tracker = tracker; + } + } /** * Create a single threaded active region traverser @@ -112,12 +124,12 @@ public final class TraverseActiveRegions extends TraversalEngine(nThreads); - nanoScheduler.setProgressFunction(new NSProgressFunction() { + nanoScheduler.setProgressFunction(new NSProgressFunction() { @Override - public void progress(ActiveRegion lastActiveRegion) { + public void progress(MapData lastActiveRegion) { if ( lastActiveRegion != null ) // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon - printProgress(lastActiveRegion.getLocation().getStopLocation()); + printProgress(lastActiveRegion.activeRegion.getLocation().getStopLocation()); } }); } @@ -149,13 +161,6 @@ public final class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine walker, - final LocusShardDataProvider dataProvider, - final LocusView locusView) { - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - return new ManagingReferenceOrderedView( dataProvider ); - else - return (RodLocusView)locusView; - } - - // ------------------------------------------------------------------------------------- // // Actual traverse function @@ -254,7 +267,7 @@ public final class TraverseActiveRegions extends TraversalEngine activeRegionIterator = new ActiveRegionIterator(dataProvider); + final Iterator activeRegionIterator = new ActiveRegionIterator(dataProvider); final TraverseActiveRegionMap myMap = new TraverseActiveRegionMap(); final TraverseActiveRegionReduce myReduce = new TraverseActiveRegionReduce(); final T result = nanoScheduler.execute(activeRegionIterator, myMap, sum, myReduce); @@ -262,29 +275,53 @@ public final class TraverseActiveRegions extends TraversalEngine { + private class ActiveRegionIterator implements Iterator { private final LocusShardDataProvider dataProvider; - private LinkedList readyActiveRegions = new LinkedList(); + private LinkedList readyActiveRegions = new LinkedList<>(); private boolean done = false; private final LocusView locusView; private final LocusReferenceView referenceView; - private final ReferenceOrderedView referenceOrderedDataView; private final GenomeLoc locOfLastReadAtTraversalStart; + private final IntervalReferenceOrderedView referenceOrderedDataView; + private final GenomeLoc currentWindow; + private final boolean processRemainingActiveRegions; public ActiveRegionIterator( final LocusShardDataProvider dataProvider ) { this.dataProvider = dataProvider; locusView = new AllLocusView(dataProvider); referenceView = new LocusReferenceView( walker, dataProvider ); - referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + + // The data shard may carry a number of locations to process (due to being indexed together). + // This value is just the interval we are processing within the entire provider + currentWindow = dataProvider.getLocus(); + final int currentWindowPos = dataProvider.getShard().getGenomeLocs().indexOf(currentWindow); + if ( currentWindowPos == -1 ) throw new IllegalStateException("Data provider " + dataProvider + " didn't have our current window in it " + currentWindow); + processRemainingActiveRegions = currentWindowPos == dataProvider.getShard().getGenomeLocs().size() - 1; + + // the rodSpan covers all of the bases in the activity profile, including all of the bases + // through the current window interval. This is because we may issue a query to get data for an + // active region spanning before the current interval as far back as the start of the current profile, + // if we have pending work to do that finalizes in this interval. + final GenomeLoc rodSpan = activityProfile.getSpan() == null ? currentWindow : activityProfile.getSpan().endpointSpan(currentWindow); + if ( ! dataProvider.getShard().getLocation().containsP(rodSpan) ) throw new IllegalStateException("Rod span " + rodSpan + " isn't contained within the data shard " + dataProvider.getShard().getLocation() + ", meaning we wouldn't get all of the data we need"); + referenceOrderedDataView = new IntervalReferenceOrderedView( dataProvider, rodSpan ); // We keep processing while the next reference location is within the interval locOfLastReadAtTraversalStart = spanOfLastSeenRead(); + + // load in the workQueue the present regions that span the current contig, if it's different from the last one + if ( walkerHasPresetRegions && ( lastRegionProcessed == null || ! currentWindow.onSameContig(lastRegionProcessed)) ) { + loadPresetRegionsForContigToWorkQueue(currentWindow.getContig()); + } + + // remember the last region we processed for sanity checking later + lastRegionProcessed = currentWindow; } @Override public void remove() { throw new UnsupportedOperationException("Cannot remove from ActiveRegionIterator"); } @Override - public ActiveRegion next() { + public MapData next() { return readyActiveRegions.pop(); } @Override @@ -326,7 +363,7 @@ public final class TraverseActiveRegions extends TraversalEngine newActiveRegions = prepActiveRegionsForProcessing(walker, flushProfile, false); + final List newActiveRegions = prepActiveRegionsForProcessing(walker, flushProfile, false, referenceOrderedDataView); dataProvider.getShard().getReadMetrics().incrementNumIterations(); @@ -335,7 +372,7 @@ public final class TraverseActiveRegions extends TraversalEngine extends TraversalEngine walker, T sum) { - for ( final ActiveRegion region : prepActiveRegionsForProcessing((ActiveRegionWalker)walker, true, true) ) { - final M x = ((ActiveRegionWalker) walker).map(region, null); - sum = walker.reduce( x, sum ); - } - return sum; - } - // ------------------------------------------------------------------------------------- // // Functions to manage and interact with the live / dead zone @@ -594,7 +627,10 @@ public final class TraverseActiveRegions extends TraversalEngine prepActiveRegionsForProcessing(final ActiveRegionWalker walker, final boolean flushActivityProfile, final boolean forceAllRegionsToBeActive) { + private List prepActiveRegionsForProcessing(final ActiveRegionWalker walker, + final boolean flushActivityProfile, + final boolean forceAllRegionsToBeActive, + final IntervalReferenceOrderedView referenceOrderedDataView) { if ( ! walkerHasPresetRegions ) { // We don't have preset regions, so we get our regions from the activity profile final Collection activeRegions = activityProfile.popReadyActiveRegions(getActiveRegionExtension(), getMinRegionSize(), getMaxRegionSize(), flushActivityProfile); @@ -603,13 +639,13 @@ public final class TraverseActiveRegions extends TraversalEngine readyRegions = new LinkedList(); + final LinkedList readyRegions = new LinkedList<>(); while( workQueue.peek() != null ) { final ActiveRegion activeRegion = workQueue.peek(); if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) { writeActivityProfile(activeRegion.getSupportingStates()); writeActiveRegion(activeRegion); - readyRegions.add(prepActiveRegionForProcessing(workQueue.remove(), walker)); + readyRegions.add(prepActiveRegionForProcessing(workQueue.remove(), walker, referenceOrderedDataView)); } else { break; } @@ -619,8 +655,10 @@ public final class TraverseActiveRegions extends TraversalEngine walker) { - final List stillLive = new LinkedList(); + private MapData prepActiveRegionForProcessing(final ActiveRegion activeRegion, + final ActiveRegionWalker walker, + final IntervalReferenceOrderedView referenceOrderedDataView) { + final List stillLive = new LinkedList<>(); for ( final GATKSAMRecord read : myReads.popCurrentReads() ) { boolean killed = false; final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); @@ -653,14 +691,21 @@ public final class TraverseActiveRegions extends TraversalEngine { + private class TraverseActiveRegionMap implements NSMapFunction { @Override - public M apply(final ActiveRegion activeRegion) { - if ( DEBUG ) logger.info("Executing walker.map for " + activeRegion + " in thread " + Thread.currentThread().getName()); - return walker.map(activeRegion, null); + public M apply(final MapData mapData) { + if ( DEBUG ) logger.info("Executing walker.map for " + mapData.activeRegion + " in thread " + Thread.currentThread().getName()); + return walker.map(mapData.activeRegion, mapData.tracker); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java index 8e67963c1..627f98d69 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -179,7 +179,7 @@ public class TraverseLociNano extends TraversalEngine, final ReferenceContext refContext = referenceView.getReferenceContext(location); // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location, refContext); + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location); numIterations++; return new MapData(locus, refContext, tracker); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index f2bd6c14c..10ba4ca17 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -180,9 +180,6 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality threshold in order to annotate mendelian violation ratio") public double minGenotypeQualityP = 0.0; - @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp tracks that exactly match both reference and alternate alleles will be counted as concordant", required=false) - protected boolean requireStrictAlleleMatch = false; - private VariantAnnotatorEngine engine; /** @@ -204,7 +201,6 @@ public class VariantAnnotator extends RodWalker implements Ann else engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit()); engine.initializeExpressions(expressionsToUse); - engine.setRequireStrictAlleleMatch(requireStrictAlleleMatch); // setup the header fields // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 695868bb1..078a36dd9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -34,26 +34,23 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.*; import java.util.*; public class VariantAnnotatorEngine { - private List requestedInfoAnnotations = Collections.emptyList(); private List requestedGenotypeAnnotations = Collections.emptyList(); - private List requestedExpressions = new ArrayList(); + private List requestedExpressions = new ArrayList<>(); - private final HashMap, String> dbAnnotations = new HashMap, String>(); private final AnnotatorCompatible walker; private final GenomeAnalysisEngine toolkit; - private boolean requireStrictAlleleMatch = false; + VariantOverlapAnnotator variantOverlapAnnotator = null; protected static class VAExpression { @@ -85,7 +82,7 @@ public class VariantAnnotatorEngine { requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations(); requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations(); excludeAnnotations(annotationsToExclude); - initializeDBs(); + initializeDBs(toolkit); } // use this constructor if you want to select specific annotations (and/or interfaces) @@ -93,14 +90,7 @@ public class VariantAnnotatorEngine { this.walker = walker; this.toolkit = toolkit; initializeAnnotations(annotationGroupsToUse, annotationsToUse, annotationsToExclude); - initializeDBs(); - } - - // experimental constructor for active region traversal - public VariantAnnotatorEngine(GenomeAnalysisEngine toolkit) { - this.walker = null; - this.toolkit = toolkit; - requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(Arrays.asList("ActiveRegionBasedAnnotation"), Collections.emptyList()); + initializeDBs(toolkit); } // select specific expressions to use @@ -138,16 +128,19 @@ public class VariantAnnotatorEngine { requestedGenotypeAnnotations = tempRequestedGenotypeAnnotations; } - private void initializeDBs() { - + private void initializeDBs(final GenomeAnalysisEngine engine) { // check to see whether comp rods were included - final RodBinding dbsnp = walker.getDbsnpRodBinding(); - if ( dbsnp != null && dbsnp.isBound() ) - dbAnnotations.put(dbsnp, VCFConstants.DBSNP_KEY); + RodBinding dbSNPBinding = walker.getDbsnpRodBinding(); + if ( dbSNPBinding != null && ! dbSNPBinding.isBound() ) + dbSNPBinding = null; - final List> comps = walker.getCompRodBindings(); - for ( RodBinding rod : comps ) - dbAnnotations.put(rod, rod.getName()); + final Map, String> overlapBindings = new LinkedHashMap<>(); + for ( final RodBinding b : walker.getCompRodBindings()) + if ( b.isBound() ) overlapBindings.put(b, b.getName()); + if ( dbSNPBinding != null && ! overlapBindings.keySet().contains(VCFConstants.DBSNP_KEY) ) + overlapBindings.put(dbSNPBinding, VCFConstants.DBSNP_KEY); // add overlap detection with DBSNP by default + + variantOverlapAnnotator = new VariantOverlapAnnotator(dbSNPBinding, overlapBindings, engine.getGenomeLocParser()); } public void invokeAnnotationInitializationMethods( Set headerLines ) { @@ -161,14 +154,13 @@ public class VariantAnnotatorEngine { } public Set getVCFAnnotationDescriptions() { - Set descriptions = new HashSet(); for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) descriptions.addAll(annotation.getDescriptions()); for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) descriptions.addAll(annotation.getDescriptions()); - for ( String db : dbAnnotations.values() ) { + for ( String db : variantOverlapAnnotator.getOverlapNames() ) { if ( VCFStandardHeaderLines.getInfoLine(db, false) != null ) descriptions.add(VCFStandardHeaderLines.getInfoLine(db)); else @@ -178,10 +170,6 @@ public class VariantAnnotatorEngine { return descriptions; } - public void setRequireStrictAlleleMatch( final boolean requireStrictAlleleMatch ) { - this.requireStrictAlleleMatch = requireStrictAlleleMatch; - } - public VariantContext annotateContext(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map stratifiedContexts, @@ -192,13 +180,10 @@ public class VariantAnnotatorEngine { public VariantContext annotateContext(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map stratifiedContexts, - VariantContext vc, + final VariantContext vc, final Map perReadAlleleLikelihoodMap) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); - // annotate db occurrences - vc = annotateDBs(tracker, ref.getLocus(), vc, infoAnnotations); - // annotate expressions where available annotateExpressions(tracker, ref.getLocus(), infoAnnotations); @@ -213,11 +198,16 @@ public class VariantAnnotatorEngine { VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); // annotate genotypes, creating another new VC in the process - return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap)).make(); + final VariantContext annotated = builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap)).make(); + + // annotate db occurrences + return annotateDBs(tracker, annotated); } - public VariantContext annotateContext(final Map perReadAlleleLikelihoodMap, VariantContext vc) { - Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); + public VariantContext annotateContextForActiveRegion(final RefMetaDataTracker tracker, + final Map perReadAlleleLikelihoodMap, + final VariantContext vc) { + final Map infoAnnotations = new LinkedHashMap<>(vc.getAttributes()); // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { @@ -231,76 +221,26 @@ public class VariantAnnotatorEngine { } // generate a new annotated VC - VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); + final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); // annotate genotypes, creating another new VC in the process - return builder.genotypes(annotateGenotypes(null, null, null, vc, perReadAlleleLikelihoodMap)).make(); + final VariantContext annotated = builder.genotypes(annotateGenotypes(null, null, null, vc, perReadAlleleLikelihoodMap)).make(); + + // annotate db occurrences + return annotateDBs(tracker, annotated); } /** * Annotate the ID field and other DBs for the given Variant Context * * @param tracker ref meta data tracker (cannot be null) - * @param loc location of the vc * @param vc variant context to annotate * @return non-null annotated version of vc */ - @Requires({"tracker != null && loc != null && vc != null"}) - @Ensures("result != null") - public VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc) { - final Map newInfoAnnotations = new HashMap(0); - vc = annotateDBs(tracker, loc, vc, newInfoAnnotations); - - if ( !newInfoAnnotations.isEmpty() ) { - final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(newInfoAnnotations); - vc = builder.make(); - } - - return vc; - } - - /** - * Annotate the ID field and other DBs for the given Variant Context - * - * @param tracker ref meta data tracker (cannot be null) - * @param loc location of the vc - * @param vc variant context to annotate - * @param infoAnnotations info annotation map to populate - * @return non-null annotated version of vc - */ @Requires({"tracker != null && loc != null && vc != null && infoAnnotations != null"}) @Ensures("result != null") - private VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc, final Map infoAnnotations) { - for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { - if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { - final String rsID = GATKVCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), loc), vc.getType()); - - // add the ID if appropriate - if ( rsID != null ) { - // put the DB key into the INFO field - infoAnnotations.put(VCFConstants.DBSNP_KEY, true); - - if ( vc.emptyID() ) { - vc = new VariantContextBuilder(vc).id(rsID).make(); - } else if ( walker.alwaysAppendDbsnpId() && vc.getID().indexOf(rsID) == -1 ) { - final String newRsID = vc.getID() + VCFConstants.ID_FIELD_SEPARATOR + rsID; - vc = new VariantContextBuilder(vc).id(newRsID).make(); - } - } - } else { - boolean overlapsComp = false; - for ( VariantContext comp : tracker.getValues(dbSet.getKey(), loc) ) { - if ( !comp.isFiltered() && ( !requireStrictAlleleMatch || comp.getAlleles().equals(vc.getAlleles()) ) ) { - overlapsComp = true; - break; - } - } - if ( overlapsComp ) - infoAnnotations.put(dbSet.getValue(), overlapsComp); - } - } - - return vc; + private VariantContext annotateDBs(final RefMetaDataTracker tracker, VariantContext vc) { + return variantOverlapAnnotator.annotateOverlaps(tracker, variantOverlapAnnotator.annotateRsID(tracker, vc)); } private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map infoAnnotations) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotator.java new file mode 100644 index 000000000..07af4bd74 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotator.java @@ -0,0 +1,224 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.variant.vcf.VCFConstants; + +import java.util.*; + +/** + * Annotate the ID field and attribute overlap FLAGs for a VariantContext against a RefMetaDataTracker or a list + * of VariantContexts + */ +public final class VariantOverlapAnnotator { + final RodBinding dbSNPBinding; + final Map, String> overlapBindings; + final GenomeLocParser genomeLocParser; + + /** + * Create a new VariantOverlapAnnotator without overall bindings + * + * @see #VariantOverlapAnnotator(org.broadinstitute.sting.commandline.RodBinding, java.util.Map, org.broadinstitute.sting.utils.GenomeLocParser) + */ + public VariantOverlapAnnotator(RodBinding dbSNPBinding, GenomeLocParser genomeLocParser) { + this(dbSNPBinding, Collections., String>emptyMap(), genomeLocParser); + } + + /** + * Create a new VariantOverlapAnnotator + * + * @param dbSNPBinding the RodBinding to use for updating ID field values, or null if that behavior isn't desired + * @param overlapBindings a map of RodBindings / name to use for overlap annotation. Each binding will be used to + * add name => true for variants that overlap with variants found to a + * RefMetaDataTracker at each location. Can be empty but not null + * @param genomeLocParser the genome loc parser we'll use to create GenomeLocs for VariantContexts + */ + public VariantOverlapAnnotator(RodBinding dbSNPBinding, Map, String> overlapBindings, GenomeLocParser genomeLocParser) { + if ( overlapBindings == null ) throw new IllegalArgumentException("overlapBindings cannot be null"); + if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); + + this.dbSNPBinding = dbSNPBinding; + this.overlapBindings = overlapBindings; + this.genomeLocParser = genomeLocParser; + } + + /** + * Update rsID in vcToAnnotate with rsIDs from dbSNPBinding fetched from tracker + * @see #annotateOverlap(java.util.List, String, org.broadinstitute.variant.variantcontext.VariantContext) + * + * @param tracker non-null tracker, which we will use to update the rsID of vcToAnnotate + * for VariantContexts bound to dbSNPBinding that start at vcToAnnotate + * @param vcToAnnotate a variant context to annotate + * @return a VariantContext (may be == to vcToAnnotate) with updated rsID value + */ + public VariantContext annotateRsID(final RefMetaDataTracker tracker, final VariantContext vcToAnnotate) { + if ( dbSNPBinding != null ) { + final GenomeLoc loc = getLoc(vcToAnnotate); + return annotateRsID(tracker.getValues(dbSNPBinding, loc), vcToAnnotate); + } else { + return vcToAnnotate; + } + } + + /** + * Update rsID of vcToAnnotate with rsID match found in vcsAtLoc, if one exists + * + * @param vcsAtLoc a list of variant contexts starting at this location to use as sources for rsID values + * @param vcToAnnotate a variant context to annotate + * @return a VariantContext (may be == to vcToAnnotate) with updated rsID value + */ + public VariantContext annotateRsID(final List vcsAtLoc, final VariantContext vcToAnnotate ) { + final String rsID = getRsID(vcsAtLoc, vcToAnnotate); + + // add the ID if appropriate + if ( rsID != null ) { + final VariantContextBuilder vcb = new VariantContextBuilder(vcToAnnotate); + + if ( ! vcToAnnotate.hasID() ) { + return vcb.id(rsID).make(); + } else if ( ! vcToAnnotate.getID().contains(rsID) ) { + return vcb.id(vcToAnnotate.getID() + VCFConstants.ID_FIELD_SEPARATOR + rsID).make(); + } // falling through to return VC lower down + } + + // nothing to do, just return vc + return vcToAnnotate; + } + + private GenomeLoc getLoc(final VariantContext vc) { + return genomeLocParser.createGenomeLoc(vc); + } + + /** + * Add overlap attributes to vcToAnnotate against all overlapBindings in tracker + * + * @see #annotateOverlap(java.util.List, String, org.broadinstitute.variant.variantcontext.VariantContext) + * for more information + * + * @param tracker non-null tracker, which we will use to update the rsID of vcToAnnotate + * for VariantContexts bound to dbSNPBinding that start at vcToAnnotate + * @param vcToAnnotate a variant context to annotate + * @return a VariantContext (may be == to vcToAnnotate) with updated overlaps update fields value + */ + public VariantContext annotateOverlaps(final RefMetaDataTracker tracker, final VariantContext vcToAnnotate) { + if ( overlapBindings.isEmpty() ) return vcToAnnotate; + + VariantContext annotated = vcToAnnotate; + final GenomeLoc loc = getLoc(vcToAnnotate); + for ( final Map.Entry, String> overlapBinding : overlapBindings.entrySet() ) { + annotated = annotateOverlap(tracker.getValues(overlapBinding.getKey(), loc), overlapBinding.getValue(), vcToAnnotate); + } + + return annotated; + } + + /** + * Add overlaps flag attributes to vcToAnnotate binding overlapTestVCs.getSource() => true if + * an overlapping variant context can be found in overlapTestVCs with vcToAnnotate + * + * Overlaps here means that the reference alleles are the same and at least one alt + * allele in vcToAnnotate is equals to one of the alt alleles in overlapTestVCs + * + * @param overlapTestVCs a non-null list of potential overlaps that start at vcToAnnotate + * @param attributeKey the key to set to true in the attribute map for vcToAnnotate if it overlaps + * @param vcToAnnotate a non-null VariantContext to annotate + * @return + */ + public VariantContext annotateOverlap(final List overlapTestVCs, final String attributeKey, VariantContext vcToAnnotate) { + if ( overlapBindings.isEmpty() ) return vcToAnnotate; + + final boolean overlaps = overlaps(overlapTestVCs, vcToAnnotate); + if ( overlaps ) { + return new VariantContextBuilder(vcToAnnotate).attribute(attributeKey, true).make(); + } else { + return vcToAnnotate; + } + } + + /** + * Returns the ID field of the first VariantContext in rsIDSourceVCs that has the same reference allele + * as vcToAnnotate and all of the alternative alleles in vcToAnnotate. + * + * Doesn't require vcToAnnotate to be a complete match, so + * + * A/C/G in VC in rsIDSourceVCs + * + * would match the a VC with A/C but not A/T. Also we don't require all alleles to match + * so we would also match A/C/T to A/C/G. + * + * Will only match rsIDSourceVCs that aren't failing filters. + * + * @param rsIDSourceVCs a non-null list of potential overlaps that start at vcToAnnotate + * @param vcToAnnotate a non-null VariantContext to annotate + * @return a String to use for the rsID from rsIDSourceVCs if one matches, or null if none matches + */ + private String getRsID(final List rsIDSourceVCs, final VariantContext vcToAnnotate) { + if ( rsIDSourceVCs == null ) throw new IllegalArgumentException("rsIDSourceVCs cannot be null"); + if ( vcToAnnotate == null ) throw new IllegalArgumentException("vcToAnnotate cannot be null"); + + for ( final VariantContext vcComp : rsIDSourceVCs ) { + if ( vcComp.isFiltered() ) continue; // don't process any failed VCs + + if ( ! vcComp.getChr().equals(vcToAnnotate.getChr()) || vcComp.getStart() != vcToAnnotate.getStart() ) + throw new IllegalArgumentException("source rsID VariantContext " + vcComp + " doesn't start at the same position as vcToAnnotate " + vcToAnnotate); + + if ( vcToAnnotate.getReference().equals(vcComp.getReference()) ) { + for ( final Allele allele : vcToAnnotate.getAlternateAlleles() ) { + if ( vcComp.getAlternateAlleles().contains(allele) ) + return vcComp.getID(); + } + } + } + + return null; + } + + /** + * Does vcToAnnotate overlap with any of the records in potentialOverlaps? + * + * @param potentialOverlaps a non-null list of potential overlaps that start at vcToAnnotate + * @param vcToAnnotate a non-null VariantContext to annotate + * @return true if vcToAnnotate overlaps (position and all alt alleles) with some variant in potentialOverlaps + */ + private boolean overlaps(final List potentialOverlaps, final VariantContext vcToAnnotate) { + return getRsID(potentialOverlaps, vcToAnnotate) != null; + } + + /** + * Get the collection of the RodBinding names for those being used for overlap detection + * @return a non-null collection of Strings + */ + public Collection getOverlapNames() { + return overlapBindings.values(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 60809134a..dbb68961f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantOverlapAnnotator; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; @@ -112,24 +113,21 @@ public class VariantsToVCF extends RodWalker { // for dealing with indels in hapmap CloseableIterator dbsnpIterator = null; + VariantOverlapAnnotator variantOverlapAnnotator = null; public void initialize() { vcfwriter = VariantContextWriterFactory.sortOnTheFly(baseWriter, 40, false); + variantOverlapAnnotator = new VariantOverlapAnnotator(dbsnp.dbsnp, getToolkit().getGenomeLocParser()); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) ) return 0; - String rsID = dbsnp == null ? null : GATKVCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbsnp.dbsnp, context.getLocation()), VariantContext.Type.SNP); - Collection contexts = getVariantContexts(tracker, ref); for ( VariantContext vc : contexts ) { VariantContextBuilder builder = new VariantContextBuilder(vc); - if ( rsID != null && vc.emptyID() ) { - builder.id(rsID).make(); - } // set the appropriate sample name if necessary if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(variants.getName()) ) { @@ -137,7 +135,8 @@ public class VariantsToVCF extends RodWalker { builder.genotypes(g); } - writeRecord(builder.make(), tracker, ref.getLocus()); + final VariantContext withID = variantOverlapAnnotator.annotateRsID(tracker, builder.make()); + writeRecord(withID, tracker, ref.getLocus()); } return 1; diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java index 0fba432e7..aa2e92559 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java @@ -149,21 +149,6 @@ public class GATKVCFUtils { return VCFUtils.withUpdatedContigs(header, engine.getArguments().referenceFile, engine.getMasterSequenceDictionary()); } - public static String rsIDOfFirstRealVariant(List VCs, VariantContext.Type type) { - if ( VCs == null ) - return null; - - String rsID = null; - for ( VariantContext vc : VCs ) { - if ( vc.getType() == type ) { - rsID = vc.getID(); - break; - } - } - - return rsID; - } - /** * Utility class to read all of the VC records from a file * diff --git a/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java index 3225a128c..56725147e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java @@ -256,7 +256,6 @@ public class ReadMetricsUnitTest extends BaseTest { } windowMaker.close(); } - traverseActiveRegions.endTraversal(walker, 0); Assert.assertEquals(engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig); Assert.assertEquals(engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedViewUnitTest.java similarity index 98% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedViewUnitTest.java index bf4d36d92..784bd727e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedViewUnitTest.java @@ -49,7 +49,7 @@ import java.util.*; /** * @author depristo */ -public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { +public class IntervalReferenceOrderedViewUnitTest extends BaseTest { private static int startingChr = 1; private static int endingChr = 2; private static int readCount = 100; @@ -285,7 +285,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { Collections.sort(intervals); final GenomeLoc span = span(intervals); - final ReadBasedReferenceOrderedView view = new ReadBasedReferenceOrderedView(genomeLocParser, span, names, iterators); + final IntervalReferenceOrderedView view = new IntervalReferenceOrderedView(genomeLocParser, span, names, iterators); if ( testStateless ) { // test each tracker is well formed, as each is created diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index fad632cfd..1d39f43c6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -97,7 +97,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.emptyList()); ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); - RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",10), null); + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",10)); Assert.assertEquals(tracker.getValues(Feature.class).size(), 0, "The tracker should not have produced any data"); } @@ -115,7 +115,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.singletonList(dataSource)); ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); - RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null); + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20)); TableFeature datum = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest")); Assert.assertEquals(datum.get("COL1"),"C","datum parameter for COL1 is incorrect"); @@ -141,7 +141,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Arrays.asList(dataSource1,dataSource2)); ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); - RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null); + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20)); TableFeature datum1 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest1")); Assert.assertEquals(datum1.get("COL1"),"C","datum1 parameter for COL1 is incorrect"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 1f5cd6d0e..e4b6c37cc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -405,8 +405,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam)) t.traverse(walker, dataProvider, 0); - t.endTraversal(walker, 0); - return walker.mappedActiveRegions; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index e8840c39f..5b52d4e33 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.*; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.qc.CountReads; @@ -47,6 +48,7 @@ import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.PrintStream; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import static org.testng.Assert.fail; @@ -146,7 +148,7 @@ public class TraverseReadsUnitTest extends BaseTest { fail("Shard == null"); } - ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null); + ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null, Collections.emptyList()); accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator); dataProvider.close(); }