From e4ec899a879503f2e56d836d5b8cbe1b18ffc370 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 29 Jan 2013 15:51:07 -0500 Subject: [PATCH 01/13] First pass at adding unit tests for the RR framework: I have added 3 tests and all 3 uncovered RR bugs! One of the fixes was critical: SlidingWindow was not converting between global and relative positions correctly. Besides not being correct, it was resulting in a massive slowdown of the RR traversal. That fix definitely breaks at least one of the integration tests, but it's not worth changing md5s now because I'll be changing things all over RR for the next few days, so I am going to let that test fail indefinitely until I can confirm general correctness of the tool. --- .../reducereads/HeaderElement.java | 8 +- .../reducereads/SimpleGenomeLoc.java | 31 ++- .../reducereads/SlidingWindow.java | 42 +++- .../reducereads/HeaderElementUnitTest.java | 134 +++++++++++ .../reducereads/SimpleGenomeLocUnitTest.java | 106 ++++++++ .../reducereads/SlidingWindowUnitTest.java | 227 ++++++++++++++++++ 6 files changed, 521 insertions(+), 27 deletions(-) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLocUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 1e7805fce..13d3d1b4c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -244,7 +244,7 @@ public class HeaderElement { 
* * @return whether or not the HeaderElement is variant due to excess insertions */ - private boolean isVariantFromMismatches(double minVariantProportion) { + protected boolean isVariantFromMismatches(double minVariantProportion) { BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels(); double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon); return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion); @@ -256,11 +256,11 @@ public class HeaderElement { * * @return true if we had more soft clipped bases contributing to this site than matches/mismatches. */ - private boolean isVariantFromSoftClips() { - return nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases); + protected boolean isVariantFromSoftClips() { + return nSoftClippedBases > 0 && nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases); } - private boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) { + protected boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) { return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java index a3ebb132f..85a1f8df1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java @@ -78,6 +78,13 @@ public class SimpleGenomeLoc extends GenomeLoc { return finished; } + /** + * Merges 2 *contiguous* locs into 1 + * + * @param a SimpleGenomeLoc #1 + * @param b SimpleGenomeLoc #2 + * @return one merged loc + */ @Requires("a != null && b != null") public 
static SimpleGenomeLoc merge(SimpleGenomeLoc a, SimpleGenomeLoc b) throws ReviewedStingException { if(GenomeLoc.isUnmapped(a) || GenomeLoc.isUnmapped(b)) { @@ -88,7 +95,6 @@ public class SimpleGenomeLoc extends GenomeLoc { throw new ReviewedStingException("The two genome locs need to be contiguous"); } - return new SimpleGenomeLoc(a.getContig(), a.contigIndex, Math.min(a.getStart(), b.getStart()), Math.max(a.getStop(), b.getStop()), @@ -101,19 +107,22 @@ public class SimpleGenomeLoc extends GenomeLoc { * @param sortedLocs a sorted list of contiguous locs * @return one merged loc */ + @Requires("sortedLocs != null") public static SimpleGenomeLoc merge(SortedSet sortedLocs) { - SimpleGenomeLoc previousLoc = null; - for (SimpleGenomeLoc loc : sortedLocs) { - if (loc.isUnmapped()) { + SimpleGenomeLoc result = null; + + for ( SimpleGenomeLoc loc : sortedLocs ) { + if ( loc.isUnmapped() ) throw new ReviewedStingException("Tried to merge unmapped genome locs"); - } - if (previousLoc != null && !previousLoc.contiguousP(loc)) { + + if ( result == null ) + result = loc; + else if ( !result.contiguousP(loc) ) throw new ReviewedStingException("The genome locs need to be contiguous"); - } - previousLoc = loc; + else + result = merge(result, loc); } - SimpleGenomeLoc firstLoc = sortedLocs.first(); - SimpleGenomeLoc lastLoc = sortedLocs.last(); - return merge(firstLoc, lastLoc); + + return result; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index e91120f1c..57a11f640 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import 
com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; @@ -135,6 +136,15 @@ public class SlidingWindow { return header.isEmpty() ? -1 : header.peek().getLocation(); } + // for testing only + protected SlidingWindow(final String contig, final int contigIndex, final int startLocation) { + this.contig = contig; + this.contigIndex = contigIndex; + nContigs = 1; + this.windowHeader = new LinkedList(); + windowHeader.addFirst(new HeaderElement(startLocation)); + this.readsInWindow = new TreeSet(); + } public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) { this.contextSize = contextSize; @@ -193,14 +203,17 @@ public class SlidingWindow { } /** - * returns the next complete or incomplete variant region between 'from' (inclusive) and 'to' (exclusive) + * Returns the next complete (or incomplete if closeLastRegion is true) variant region between 'from' (inclusive) and 'to' (exclusive) + * but converted to global coordinates. 
* - * @param from beginning window header index of the search window (inclusive) - * @param to end window header index of the search window (exclusive) + * @param from beginning window header index of the search window (inclusive); note that this uses local coordinates + * @param to end window header index of the search window (exclusive); note that this uses local coordinates * @param variantSite boolean array with true marking variant regions - * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. + * @param closeLastRegion if the last index is variant (so it's an incomplete region), should we close (and return as an interval) the location or ignore it? + * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. All coordinates returned are global. */ - private SimpleGenomeLoc findNextVariantRegion(int from, int to, boolean[] variantSite, boolean forceClose) { + @Requires({"from >= 0", "from <= to", "to <= variantSite.length"}) + private SimpleGenomeLoc findNextVariantRegion(int from, int to, boolean[] variantSite, boolean closeLastRegion) { boolean foundStart = false; final int windowHeaderStart = getStartLocation(windowHeader); int variantRegionStartIndex = 0; @@ -215,22 +228,27 @@ public class SlidingWindow { } final int refStart = windowHeaderStart + variantRegionStartIndex; final int refStop = windowHeaderStart + to - 1; - return (foundStart && forceClose) ? new SimpleGenomeLoc(contig, contigIndex, refStart, refStop, true) : null; + return (foundStart && closeLastRegion) ? 
new SimpleGenomeLoc(contig, contigIndex, refStart, refStop, true) : null; } /** * Creates a list with all the complete and incomplete variant regions within 'from' (inclusive) and 'to' (exclusive) * - * @param from beginning window header index of the search window (inclusive) - * @param to end window header index of the search window (exclusive) + * @param from beginning window header index of the search window (inclusive); note that this uses local coordinates + * @param to end window header index of the search window (exclusive); note that this uses local coordinates * @param variantSite boolean array with true marking variant regions - * @return a list with start/stops of variant regions following findNextVariantRegion description + * @return a list with start/stops of variant regions following findNextVariantRegion description in global coordinates */ - private CompressionStash findVariantRegions(int from, int to, boolean[] variantSite, boolean forceClose) { + @Requires({"from >= 0", "from <= to", "to <= variantSite.length"}) + @Ensures("result != null") + protected CompressionStash findVariantRegions(int from, int to, boolean[] variantSite, boolean closeLastRegion) { + final int windowHeaderStart = getStartLocation(windowHeader); + CompressionStash regions = new CompressionStash(); int index = from; while(index < to) { - SimpleGenomeLoc result = findNextVariantRegion(index, to, variantSite, forceClose); + // returns results in global coordinates + SimpleGenomeLoc result = findNextVariantRegion(index, to, variantSite, closeLastRegion); if (result == null) break; @@ -238,7 +256,7 @@ public class SlidingWindow { if (!result.isFinished()) break; - index = result.getStop() + 1; + index = result.getStop() - windowHeaderStart + 1; // go back to local coordinates } return regions; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java new file mode 100644 index 000000000..b6af954a0 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java @@ -0,0 +1,134 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +public class HeaderElementUnitTest extends BaseTest { + + private class HETest { + public byte base, baseQual, insQual, delQual; + public int MQ; + public boolean isClip; + + private HETest(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int MQ, final boolean isClip) { + this.base = base; + this.baseQual = baseQual; + this.insQual = insQual; + this.delQual = delQual; + this.MQ = MQ; + this.isClip = isClip; + } + } + + private static final byte byteA = (byte)'A'; + private static final byte byte10 = (byte)10; + private static final byte byte20 = (byte)20; + private static final int minBaseQual = 20; + private static final int minMappingQual = 20; + + @DataProvider(name = "data") + public Object[][] createData() { + List tests = new ArrayList(); + + tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 20, false)}); + tests.add(new Object[]{new HETest(byteA, byte10, byte20, byte20, 20, false)}); + tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 10, false)}); + tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 20, true)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "data", enabled = true) + public void testHE(HETest test) { + + HeaderElement headerElement = new HeaderElement(1000, 0); + + // first test that if we add and then remove it, we have no data + headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip); + headerElement.addInsertionToTheRight(); + headerElement.removeBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip); + 
headerElement.removeInsertionToTheRight(); + testHeaderIsEmpty(headerElement); + + // now, test that the data was added as expected + for ( int i = 0; i < 10; i++ ) + headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip); + testHeaderData(headerElement, test); + + // test the insertion adding functionality + for ( int i = 0; i < 10; i++ ) + headerElement.addInsertionToTheRight(); + Assert.assertEquals(headerElement.numInsertionsToTheRight(), 10); + } + + private void testHeaderIsEmpty(final HeaderElement headerElement) { + Assert.assertFalse(headerElement.hasConsensusData()); + Assert.assertFalse(headerElement.hasFilteredData()); + Assert.assertFalse(headerElement.hasInsertionToTheRight()); + Assert.assertTrue(headerElement.isEmpty()); + Assert.assertEquals(headerElement.getRMS(), 0.0); + } + + private void testHeaderData(final HeaderElement headerElement, final HETest test) { + Assert.assertEquals(headerElement.getRMS(), (double)test.MQ); + Assert.assertEquals(headerElement.isVariantFromSoftClips(), test.isClip); + Assert.assertFalse(headerElement.isEmpty()); + Assert.assertFalse(headerElement.hasInsertionToTheRight()); + Assert.assertEquals(headerElement.hasConsensusData(), headerElement.basePassesFilters(test.baseQual, minBaseQual, test.MQ, minMappingQual)); + Assert.assertEquals(headerElement.hasFilteredData(), !headerElement.basePassesFilters(test.baseQual, minBaseQual, test.MQ, minMappingQual)); + Assert.assertFalse(headerElement.isVariantFromMismatches(0.05)); + Assert.assertEquals(headerElement.isVariant(0.05, 0.05), test.isClip); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLocUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLocUnitTest.java new file mode 100644 index 000000000..cbac7e3c1 --- /dev/null +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLocUnitTest.java @@ -0,0 +1,106 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*;
+
+/**
+ * Unit tests for {@code SimpleGenomeLoc.merge}: folding a list of locs together pairwise must give
+ * the same result as merging them as one sorted set, and the merged loc must run from the first
+ * loc's start to the last loc's stop.
+ */
+public class SimpleGenomeLocUnitTest extends BaseTest {
+
+    // Built with abutting coordinate pairs (10,20), (21,30) and (31,40); loc1 and loc3 on their own leave a gap.
+    // NOTE(review): assumes the 3rd/4th constructor arguments are start/stop - confirm against SimpleGenomeLoc.
+    private static final SimpleGenomeLoc loc1 = new SimpleGenomeLoc("1", 0, 10, 20, false);
+    private static final SimpleGenomeLoc loc2 = new SimpleGenomeLoc("1", 0, 21, 30, false);
+    private static final SimpleGenomeLoc loc3 = new SimpleGenomeLoc("1", 0, 31, 40, false);
+
+    /** One data-provider case: the locs to merge, in order. */
+    private static class SGLTest {
+        public final List<SimpleGenomeLoc> locs;
+
+        private SGLTest(final List<SimpleGenomeLoc> locs) {
+            this.locs = locs;
+        }
+    }
+
+    /**
+     * Supplies the merge cases: loc1 alone, loc1+loc2, and loc1+loc2+loc3.
+     *
+     * @return one {@code Object[]} per case, each wrapping a single {@link SGLTest}
+     */
+    @DataProvider(name = "SGLtest")
+    public Object[][] createFindVariantRegionsData() {
+        List<Object[]> tests = new ArrayList<Object[]>();
+
+        tests.add(new Object[]{new SGLTest(Arrays.asList(loc1))});
+        tests.add(new Object[]{new SGLTest(Arrays.asList(loc1, loc2))});
+        tests.add(new Object[]{new SGLTest(Arrays.asList(loc1, loc2, loc3))});
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    /** Runs the merge consistency check on every case from the "SGLtest" provider. */
+    @Test(dataProvider = "SGLtest", enabled = true)
+    public void testSimpleGenomeLoc(SGLTest test) {
+        testMerge(test.locs);
+    }
+
+    /** Merging loc1 with loc3 (skipping loc2) is expected to throw {@link ReviewedStingException}. */
+    @Test(expectedExceptions = ReviewedStingException.class)
+    public void testNotContiguousLocs() {
+        final List<SimpleGenomeLoc> locs = new ArrayList<SimpleGenomeLoc>(2);
+        locs.add(loc1);
+        locs.add(loc3);
+        testMerge(locs);
+    }
+
+    /**
+     * Merges {@code locs} two ways - pairwise from left to right, and in one call on a sorted set -
+     * and asserts that both results are equal and span first start to last stop.
+     *
+     * @param locs non-empty list of locs, in order
+     */
+    private void testMerge(final List<SimpleGenomeLoc> locs) {
+        SimpleGenomeLoc result1 = locs.get(0);
+        for ( int i = 1; i < locs.size(); i++ )
+            result1 = SimpleGenomeLoc.merge(result1, locs.get(i));
+
+        SimpleGenomeLoc result2 = SimpleGenomeLoc.merge(new TreeSet<SimpleGenomeLoc>(locs));
+        Assert.assertEquals(result1, result2);
+        Assert.assertEquals(result1.getStart(), locs.get(0).getStart());
+        Assert.assertEquals(result1.getStop(), locs.get(locs.size() - 1).getStop());
+    }
+}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java new file mode 100644 index 000000000..cfb8c53b4 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java @@ -0,0 +1,227 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List;
+
+/**
+ * Unit tests for {@link SlidingWindow}; currently covers {@code findVariantRegions()}.
+ */
+public class SlidingWindowUnitTest extends BaseTest {
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    //// This section tests the findVariantRegions() method and related functionality ////
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    // Length of the bitset handed to findVariantRegions(), and the global coordinate that bitset index 0 maps to.
+    private static final int variantRegionLength = 1000;
+    private static final int globalStartPosition = 1000000;
+    // Test locs are named after their offsets from globalStartPosition.
+    private static final SimpleGenomeLoc loc90to95 = new SimpleGenomeLoc("1", 0, 1000090, 1000095, false);
+    private static final SimpleGenomeLoc loc96to99 = new SimpleGenomeLoc("1", 0, 1000096, 1000099, false);
+    private static final SimpleGenomeLoc loc100to110 = new SimpleGenomeLoc("1", 0, 1000100, 1000110, false);
+    private static final SimpleGenomeLoc loc999 = new SimpleGenomeLoc("1", 0, 1000999, 1000999, false);
+
+    /**
+     * One data-provider case: the locs used to mark the bitset, the bitset itself, and the regions
+     * that findVariantRegions() is expected to report for it.
+     */
+    private static class FindVariantRegionsTest {
+        public List<SimpleGenomeLoc> locs, expectedResult;
+        public boolean[] variantRegionBitset;
+
+        /** Case where the expected regions are exactly the input locs. */
+        private FindVariantRegionsTest(final List<SimpleGenomeLoc> locs) {
+            this(locs, locs);
+        }
+
+        /** Case where the expected regions differ from the input locs (e.g. adjacent locs reported as one). */
+        private FindVariantRegionsTest(final List<SimpleGenomeLoc> locs, final List<SimpleGenomeLoc> expectedResult) {
+            this.locs = locs;
+            this.expectedResult = expectedResult;
+            variantRegionBitset = createBitset(locs);
+        }
+    }
+
+    /**
+     * Builds a bitset of {@code variantRegionLength} entries in which every position covered by one
+     * of the given locs is set; global coordinates are turned into indices by subtracting
+     * {@code globalStartPosition}.
+     *
+     * @param locs locs lying inside [globalStartPosition, globalStartPosition + variantRegionLength)
+     * @return the populated bitset
+     */
+    private static boolean[] createBitset(final List<SimpleGenomeLoc> locs) {
+        boolean[] variantRegionBitset = new boolean[variantRegionLength];
+        for ( SimpleGenomeLoc loc : locs ) {
+            final int stop = loc.getStop() - globalStartPosition;
+            for ( int i = loc.getStart() - globalStartPosition; i <= stop; i++ )
+                variantRegionBitset[i] = true;
+        }
+        return variantRegionBitset;
+    }
+
+    /**
+     * Supplies the findVariantRegions() cases: a single region, two separated regions, three
+     * abutting regions expected back as one, and regions that include the last bitset position.
+     */
+    @DataProvider(name = "findVariantRegions")
+    public Object[][] createFindVariantRegionsData() {
+        List<Object[]> tests = new ArrayList<Object[]>();
+
+        tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95))});
+        tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95, loc100to110))});
+        tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95, loc96to99, loc100to110), Arrays.asList(new SimpleGenomeLoc("1", 0, 1000090, 1000110, false)))});
+        tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95, loc999))});
+        tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc999))});
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    /** Checks that findVariantRegions() reports exactly the expected regions, in order. */
+    @Test(dataProvider = "findVariantRegions", enabled = true)
+    public void testFindVariantRegions(FindVariantRegionsTest test) {
+        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition);
+        final CompressionStash locs = slidingWindow.findVariantRegions(0, variantRegionLength, test.variantRegionBitset, true);
+        // Without this check a result with too few regions (even none at all) would pass vacuously.
+        Assert.assertEquals(locs.size(), test.expectedResult.size());
+        int index = 0;
+        for ( final SimpleGenomeLoc loc : locs ) {
+            Assert.assertEquals(loc, test.expectedResult.get(index++));
+        }
+    }
+
+    /**
+     * With the last argument set to false, only loc90to95 is expected back even though the bitset
+     * also marks loc999 at its final position.
+     * NOTE(review): presumably the flag decides whether a region still open at the end of the
+     * window gets closed - confirm against SlidingWindow.findVariantRegions().
+     */
+    @Test(enabled = true)
+    public void testNoClosingRegions() {
+        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition);
+        final CompressionStash locs = slidingWindow.findVariantRegions(0, variantRegionLength, createBitset(Arrays.asList(loc90to95, loc999)), false);
+        Assert.assertEquals(locs.size(), 1);
+        Assert.assertEquals(locs.iterator().next(), loc90to95);
+    }
+
+    // TODO(review): the disabled block below still refers to DownsamplingReadsIterator types; adapt it to SlidingWindow or remove it.
+    /*
+
+    private static class DownsamplingReadsIteratorTest extends TestDataProvider {
+        private DownsamplingReadsIterator downsamplingIter;
+        private int targetCoverage;
+        private ArtificialSingleSampleReadStream stream;
+        private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer;
+
+        public DownsamplingReadsIteratorTest( ArtificialSingleSampleReadStream stream, int targetCoverage ) {
+            super(DownsamplingReadsIteratorTest.class);
+
+            this.stream = stream;
+            this.targetCoverage = targetCoverage;
+
+            setName(String.format("%s: targetCoverage=%d numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d",
+                    getClass().getSimpleName(),
+                    targetCoverage,
+                    stream.getNumContigs(),
+                    stream.getNumStacksPerContig(),
+                    stream.getMinReadsPerStack(),
+                    stream.getMaxReadsPerStack(),
+                    stream.getMinDistanceBetweenStacks(),
+                    stream.getMaxDistanceBetweenStacks(),
+                    stream.getMinReadLength(),
+                    stream.getMaxReadLength(),
+                    stream.getNumUnmappedReads()));
+        }
+
+        public void run() {
+            streamAnalyzer = new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(stream, targetCoverage);
+            downsamplingIter = new DownsamplingReadsIterator(stream.getStingSAMIterator(), new SimplePositionalDownsampler(targetCoverage));
+
+            streamAnalyzer.analyze(downsamplingIter);
+
+            // Check whether the observed properties of the downsampled stream are what they should be
+            streamAnalyzer.validate();
+
+            // Allow memory used by this test to be reclaimed
+            stream = null;
+            streamAnalyzer = null;
+            downsamplingIter = null;
+        }
+    }
+
+    @DataProvider(name = "DownsamplingReadsIteratorTestDataProvider")
+    public Object[][] createDownsamplingReadsIteratorTests() {
+        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(5, 1, 10000);
+        String readGroupID = "testReadGroup";
+        SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID);
+        readGroup.setSample("testSample");
+        header.addReadGroup(readGroup);
+
+        // Values that don't vary across tests
+        int targetCoverage = 10;
+        int minReadLength = 50;
+        int maxReadLength = 100;
+        int minDistanceBetweenStacks = 1;
+        int maxDistanceBetweenStacks = maxReadLength + 1;
+
+        GenomeAnalysisEngine.resetRandomGenerator();
+
+        // brute force testing!
+        for ( int numContigs : Arrays.asList(1, 2, 5) ) {
+            for ( int stacksPerContig : Arrays.asList(1, 2, 10) ) {
+                for ( int minReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) {
+                    for ( int maxReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) {
+                        for ( int numUnmappedReads : Arrays.asList(0, 1, targetCoverage, targetCoverage * 2) ) {
+                            // Only interested in sane read stream configurations here
+                            if ( minReadsPerStack <= maxReadsPerStack ) {
+                                new DownsamplingReadsIteratorTest(new ArtificialSingleSampleReadStream(header,
+                                        readGroupID,
+                                        numContigs,
+                                        stacksPerContig,
+                                        minReadsPerStack,
+                                        maxReadsPerStack,
+                                        minDistanceBetweenStacks,
+                                        maxDistanceBetweenStacks,
+                                        minReadLength,
+                                        maxReadLength,
+                                        numUnmappedReads),
+                                        targetCoverage);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return DownsamplingReadsIteratorTest.getTests(DownsamplingReadsIteratorTest.class);
+    }
+
+    @Test(dataProvider = "DownsamplingReadsIteratorTestDataProvider")
+    public void runDownsamplingReadsIteratorTest( DownsamplingReadsIteratorTest test ) {
+        logger.warn("Running test: " + test);
+
+        GenomeAnalysisEngine.resetRandomGenerator();
+        test.run();
+    }
+
+    */
+}
From 4aaef495c6f6b1ee9a52877aa78ca94f14d80c49 Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Tue, 29 Jan 2013 16:33:12 -0500 Subject: [PATCH 02/13] correct the help message --- public/java/src/org/broadinstitute/sting/tools/CatVariants.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java index 07a90bb11..93246fd6f 100644 --- a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java +++ b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java @@ -51,7 +51,7 @@ import java.util.*; /** * - * Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.AppendVariants [sorted (optional)]"); + * Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants [sorted (optional)]"); * The input files can be of type: VCF (ends in .vcf or .VCF)"); * BCF2 (ends in .bcf or .BCF)"); * Output file must be vcf or bcf file (.vcf or .bcf)"); From a536e1da848fa8407557f0f850c78475255e9e8f Mon Sep 17 00:00:00 2001 From: David Roazen Date: Tue, 29 Jan 2013 13:57:39 -0500 Subject: [PATCH 04/13] Move some VCF/VariantContext methods back to the GATK based on feedback -Moved some of the more specialized / complex VariantContext and VCF utility methods back to the GATK. -Due to this re-shuffling, was able to return things like the Pair class back to the GATK as well. 
--- .../gatk/walkers/annotator/RankSumTest.java | 2 +- .../annotator/TandemRepeatAnnotator.java | 6 +- .../gatk/walkers/bqsr/BaseRecalibrator.java | 2 +- .../reducereads/MultiSampleCompressor.java | 2 +- .../reducereads/SingleSampleCompressor.java | 2 +- .../reducereads/SlidingWindow.java | 2 +- .../genotyper/ConsensusAlleleCounter.java | 5 +- .../GeneralPloidyGenotypeLikelihoods.java | 2 +- ...dyGenotypeLikelihoodsCalculationModel.java | 2 +- .../genotyper/UnifiedArgumentCollection.java | 4 +- .../walkers/genotyper/UnifiedGenotyper.java | 4 +- .../genotyper/UnifiedGenotyperEngine.java | 7 +- .../genotyper/afcalc/DiploidExactAFCalc.java | 5 +- .../walkers/genotyper/afcalc/ExactAFCalc.java | 4 +- .../afcalc/GeneralPloidyExactAFCalc.java | 5 +- .../afcalc/OriginalDiploidExactAFCalc.java | 2 +- .../haplotypecaller/GenotypingEngine.java | 5 +- .../haplotypecaller/HaplotypeCaller.java | 5 +- .../gatk/walkers/indels/IndelRealigner.java | 2 +- .../walkers/phasing/PhaseByTransmission.java | 3 +- .../validation/GenotypeAndValidate.java | 4 +- .../ValidationSiteSelector.java | 4 +- .../variantutils/RegenotypeVariants.java | 3 +- .../utils/recalibration/RecalDatumNode.java | 2 +- .../sting/utils/recalibration/RecalUtils.java | 2 +- .../recalibration/RecalibrationReport.java | 2 +- .../covariates/RepeatCovariate.java | 10 +- .../RepeatUnitAndLengthCovariate.java | 12 - .../covariates/RepeatUnitCovariate.java | 14 - ...eralPloidyGenotypeLikelihoodsUnitTest.java | 2 +- .../afcalc/AFCalcPerformanceUnitTest.java | 2 +- .../StratificationManagerUnitTest.java | 2 +- .../ConcordanceMetricsUnitTest.java | 2 +- .../RepeatCovariatesUnitTest.java | 44 +- .../sting/commandline/ParsingEngine.java | 2 +- .../providers/RODMetaDataContainer.java | 2 +- .../sting/gatk/executive/Accumulator.java | 2 +- .../gatk/refdata/VariantContextAdaptors.java | 3 +- .../gatk/refdata/tracks/RMDTrackBuilder.java | 2 +- .../sting/gatk/walkers/Walker.java | 2 +- .../annotator/VariantAnnotatorEngine.java | 3 +- 
.../walkers/coverage/DepthOfCoverage.java | 2 +- .../walkers/coverage/GCContentByInterval.java | 2 +- .../fasta/FastaAlternateReferenceMaker.java | 2 +- .../walkers/fasta/FastaReferenceMaker.java | 2 +- .../sting/gatk/walkers/qc/CountIntervals.java | 2 +- .../sting/gatk/walkers/qc/CountRODs.java | 2 +- .../sting/gatk/walkers/qc/CountRODsByRef.java | 2 +- .../gatk/walkers/qc/CountTerminusEvent.java | 2 +- .../gatk/walkers/readutils/ClipReads.java | 2 +- .../gatk/walkers/varianteval/VariantEval.java | 5 +- .../varianteval/VariantEvalReportWriter.java | 2 +- .../stratifications/TandemRepeat.java | 4 +- .../manager/StratificationManager.java | 2 +- .../walkers/variantutils/CombineVariants.java | 23 +- .../variantutils/GenotypeConcordance.java | 2 +- .../walkers/variantutils/SelectHeaders.java | 4 +- .../walkers/variantutils/SelectVariants.java | 5 +- .../VariantValidationAssessor.java | 4 +- .../walkers/variantutils/VariantsToTable.java | 3 +- .../walkers/variantutils/VariantsToVCF.java | 2 +- .../sting/tools/CatVariants.java | 2 +- .../sting/utils/MannWhitneyU.java | 2 +- .../sting/utils/SWPairwiseAlignment.java | 2 +- .../sting/utils/SampleUtils.java | 10 +- .../broadinstitute/sting/utils/baq/BAQ.java | 2 +- .../utils/collections}/Pair.java | 2 +- .../sting/utils/duplicates/DupUtils.java | 2 +- .../sting/utils/fragments/FragmentUtils.java | 2 +- .../help/GenericDocumentationHandler.java | 2 +- .../sting/utils/interval/IntervalUtils.java | 2 +- .../sting/utils/sam/ReadUtils.java | 2 +- .../sting/utils/variant/GATKVCFUtils.java | 47 +- .../variant/GATKVariantContextUtils.java | 948 ++++++++++++++++++ .../variant/utils/BaseUtils.java | 6 +- .../variant/utils/GeneralUtils.java | 7 - .../variant/variantcontext/CommonInfo.java | 2 +- .../variantcontext/VariantContextUtils.java | 940 +---------------- .../broadinstitute/variant/vcf/VCFUtils.java | 50 - .../org/broadinstitute/sting/WalkerTest.java | 2 +- .../sting/utils/MWUnitTest.java | 2 +- 
.../BandPassActivityProfileUnitTest.java | 6 +- .../GATKVariantContextUtilsUnitTest.java} | 332 +++--- .../variant/VariantContextBenchmark.java | 6 +- .../variant/VariantBaseTest.java | 25 + .../VariantContextTestProvider.java | 51 +- .../VariantContextUnitTest.java | 74 -- 87 files changed, 1408 insertions(+), 1386 deletions(-) rename public/java/src/org/broadinstitute/{variant/utils => sting/utils/collections}/Pair.java (98%) rename public/java/test/org/broadinstitute/{variant/variantcontext/VariantContextUtilsUnitTest.java => sting/utils/variant/GATKVariantContextUtilsUnitTest.java} (79%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 6f965227c..ec107512a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -57,7 +57,7 @@ import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MannWhitneyU; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.variant.variantcontext.Allele; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java index a6b713551..2e0e759c2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java @@ -53,8 +53,8 @@ import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.utils.Pair; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.vcf.VCFHeaderLineCount; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; @@ -79,7 +79,7 @@ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements Standa if ( !vc.isIndel()) return null; - Pair,byte[]> result = VariantContextUtils.getNumTandemRepeatUnits(vc, ref.getForwardBases()); + Pair,byte[]> result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, ref.getForwardBases()); if (result == null) return null; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 002bdc39f..2df5fefa8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -63,7 +63,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import 
org.broadinstitute.sting.utils.help.DocumentedGATKFeature; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 50d741f13..6818669df 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index 5b08e99a0..036d2782a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -46,7 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 57a11f640..7ce606f20 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -53,7 +53,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java index 90904ab29..ddf47805f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -53,7 +53,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import 
org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -288,7 +289,7 @@ public class ConsensusAlleleCounter { if (vcs.isEmpty()) return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion - final VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); return mergedVC.getAlleles(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java index 6a5fbce39..cf144a735 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java @@ -51,7 +51,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACcounts; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java index 23804bb23..3e0437edb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java @@ -56,7 +56,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.variant.variantcontext.*; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index e97c92309..a7f90ebec 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -49,8 +49,8 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; public class UnifiedArgumentCollection extends StandardCallerArgumentCollection { @@ -172,7 +172,7 @@ public class 
UnifiedArgumentCollection extends StandardCallerArgumentCollection Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy */ @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Plody (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) - public int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY; + public int samplePloidy = GATKVariantContextUtils.DEFAULT_PLOIDY; @Hidden @Argument(shortName="minqs", fullName="min_quality_score", doc="Min quality score to consider. Smaller numbers process faster. Default: Q1.", required=false) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 46ac10d90..d16ece4fd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -61,7 +61,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -304,7 +304,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotyperEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at 
this site")); // add the pool values for each genotype - if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) { + if (UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY) { headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample")); headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction, in the same order as listed, for each individual sample")); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index b1aaf8190..8f6097661 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -61,6 +61,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -134,7 +135,7 @@ public class UnifiedGenotyperEngine { // --------------------------------------------------------------------------------------------------------- @Requires({"toolkit != null", "UAC != null"}) public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, 
Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), VariantContextUtils.DEFAULT_PLOIDY); + this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), GATKVariantContextUtils.DEFAULT_PLOIDY); } @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"}) @@ -525,7 +526,7 @@ public class UnifiedGenotyperEngine { // if we are subsetting alleles (either because there were too many or because some were not polymorphic) // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync - vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); + vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall); if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations @@ -662,7 +663,7 @@ public class UnifiedGenotyperEngine { private void determineGLModelsToUse() { String modelPrefix = ""; - if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY ) + if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY ) modelPrefix = GPSTRING; if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 68f57a300..170b6e250 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import java.util.*; @@ -105,7 +106,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { alleles.add(vc.getReference()); alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles())); builder.alleles(alleles); - builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); return builder.make(); } else { return vc; @@ -351,6 +352,6 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { final List allelesToUse, final boolean assignGenotypes, final int ploidy) { - return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes); + return GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java index cf6b67afd..3d28db159 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java @@ -47,10 +47,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.Genotype; import 
org.broadinstitute.variant.variantcontext.GenotypesContext; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; import java.util.ArrayList; @@ -92,7 +92,7 @@ abstract class ExactAFCalc extends AFCalc { if ( sample.hasLikelihoods() ) { double[] gls = sample.getLikelihoods().getAsVector(); - if ( MathUtils.sum(gls) < VariantContextUtils.SUM_GL_THRESH_NOCALL ) + if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) genotypeLikelihoods.add(gls); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 1e1652c68..f8c364e82 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.variant.variantcontext.*; @@ -553,7 +554,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { } // if there is no mass on the (new) likelihoods, then just no-call the sample - if ( MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL ) { + if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) { newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); } else { @@ -565,7 +566,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { gb.PL(newLikelihoods); // if we weren't asked to assign a genotype, then 
just no-call the sample - if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL ) + if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) gb.alleles(NO_CALL_ALLELES); else assignGenotype(gb, newLikelihoods, allelesToUse, ploidy); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index 92305fe4b..325d3b722 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -47,7 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index fa15eccdf..9aeffe966 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -57,6 +57,7 @@ import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.utils.BaseUtils; 
import org.broadinstitute.variant.variantcontext.*; @@ -173,7 +174,7 @@ public class GenotypingEngine { validatePriorityList( priorityList, eventsAtThisLoc ); // Merge the event to find a common reference representation - final VariantContext mergedVC = VariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { @@ -203,7 +204,7 @@ public class GenotypingEngine { VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call); if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! 
- annotatedCall = VariantContextUtils.reverseTrimAlleles(annotatedCall); + annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); } returnCalls.add( annotatedCall ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 455809a17..8b3eb9f1b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -72,6 +72,7 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; @@ -297,7 +298,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem samplesList.addAll( samples ); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); @@ -307,7 
+308,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.exactCallsLog = null; - UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); // initialize the output VCF header annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 1865cadea..851703648 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -63,7 +63,7 @@ import org.broadinstitute.sting.gatk.walkers.BAQMode; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 4510dfe55..80c49ff19 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -59,6 +59,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; @@ -444,7 +445,7 @@ public class PhaseByTransmission extends RodWalker, HashMa ArrayList rodNames = new ArrayList(); rodNames.add(variantCollection.variants.getName()); Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); - Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + Set vcfSamples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); //Get the trios from the families passed as ped setTrios(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java index 927e1e607..f0efb3cd9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java @@ -58,12 +58,12 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; 
import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.variant.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.variant.vcf.VCFUtils; @@ -327,7 +327,7 @@ public class GenotypeAndValidate extends RodWalker header = GATKVCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName()); - samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + samples = SampleUtils.getSampleList(header, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); Set headerLines = VCFUtils.smartMergeHeaders(header.values(), true); headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java index dcd7cd67b..ce44f546d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java @@ -54,12 +54,12 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFHeader; import 
org.broadinstitute.variant.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import java.io.File; @@ -227,7 +227,7 @@ public class ValidationSiteSelector extends RodWalker { public void initialize() { // Get list of samples to include in the output Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); - TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java index 74ab8f073..c8fc27e6a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java @@ -61,6 +61,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import 
org.broadinstitute.variant.vcf.*; @@ -115,7 +116,7 @@ public class RegenotypeVariants extends RodWalker implements T String trackName = variantCollection.variants.getName(); Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); final Set hInfo = new HashSet(); hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(trackName))); diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java index 9122c9ab6..637d9fb2d 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java @@ -51,7 +51,7 @@ import com.google.java.contract.Requires; import org.apache.commons.math.MathException; import org.apache.commons.math.stat.inference.ChiSquareTestImpl; import org.apache.log4j.Logger; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Collection; diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index e44f2e06e..699f26c5e 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -58,7 +58,7 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import 
org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.collections.NestedHashMap; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index e3ab16639..2f9a38972 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -51,7 +51,7 @@ import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java index a836fbb5e..546bd6ac8 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java @@ -51,9 +51,9 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import 
org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.variant.utils.Pair; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.collections.Pair; import java.util.Arrays; import java.util.HashMap; @@ -112,7 +112,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { // get backward repeat unit and # repeats byte[] backwardRepeatUnit = Arrays.copyOfRange(readBases, offset - str + 1, offset + 1); - maxBW = VariantContextUtils.findNumberofRepetitions(backwardRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); + maxBW = GATKVariantContextUtils.findNumberofRepetitions(backwardRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); if (maxBW > 1) { bestBWRepeatUnit = backwardRepeatUnit.clone(); break; @@ -132,7 +132,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { // get forward repeat unit and # repeats byte[] forwardRepeatUnit = Arrays.copyOfRange(readBases, offset +1, offset+str+1); - maxFW = VariantContextUtils.findNumberofRepetitions(forwardRepeatUnit,Arrays.copyOfRange(readBases, offset+1, readBases.length), true); + maxFW = GATKVariantContextUtils.findNumberofRepetitions(forwardRepeatUnit, Arrays.copyOfRange(readBases, offset + 1, readBases.length), true); if (maxFW > 1) { bestFWRepeatUnit = forwardRepeatUnit.clone(); break; @@ -150,7 +150,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { // but correct representation at that place might be (C)4. 
// Hence, if the FW and BW units don't match, check if BW unit can still be a part of FW unit and add // representations to total - maxBW = VariantContextUtils.findNumberofRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); + maxBW = GATKVariantContextUtils.findNumberofRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); maxRL = maxFW + maxBW; bestRepeatUnit = bestFWRepeatUnit; diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java index 5822b9e05..c4fdaad8b 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java @@ -48,18 +48,6 @@ package org.broadinstitute.sting.utils.recalibration.covariates; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.variant.utils.Pair; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; public class RepeatUnitAndLengthCovariate extends RepeatCovariate { diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java index ed843310d..b32feb9a3 100644 --- 
a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java @@ -46,20 +46,6 @@ package org.broadinstitute.sting.utils.recalibration.covariates; -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.variant.utils.Pair; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - /** * Created with IntelliJ IDEA. * User: rpoplin diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java index 08d333f8b..14dedebc4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java @@ -52,7 +52,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.variant.variantcontext.*; import org.testng.Assert; diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java index 291489984..8deddc357 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java @@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import org.testng.Assert; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java index f144f9b59..aabcd374d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java @@ -52,7 +52,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manage import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java index 300dd633d..9c0567464 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.variant.variantcontext.Allele; diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java index e4311a534..7ded176bb 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java @@ -46,21 +46,17 @@ package org.broadinstitute.sting.utils.recalibration; import com.google.java.contract.Requires; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.recalibration.covariates.*; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import 
org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.variant.utils.Pair; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.Random; @@ -89,38 +85,38 @@ public class RepeatCovariatesUnitTest { @Test(enabled = true) public void testFindNumberOfRepetitions() { // First, test logic to compute number of repetitions of a substring on a given string. - int result = VariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), true); + int result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), true); Assert.assertEquals(2,result); - result = VariantContextUtils.findNumberofRepetitions("AC".getBytes(),"ACACACAC".getBytes(), true); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true); Assert.assertEquals(4,result); - result = VariantContextUtils.findNumberofRepetitions("AC".getBytes(),"ACACACACGT".getBytes(), true); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true); Assert.assertEquals(4,result); - result = VariantContextUtils.findNumberofRepetitions("AC".getBytes(),"GTACACACAC".getBytes(), true); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true); Assert.assertEquals(0,result); - result = VariantContextUtils.findNumberofRepetitions("GCA".getBytes(),"GTAGGGT".getBytes(), true); + result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true); Assert.assertEquals(0,result); - result = 
VariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(),"GCAGCAGTAGGGTGTACACACAC".getBytes(), true); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true); Assert.assertEquals(1,result); - result = VariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(),"GTAGGGTGTACACACACGCAGCAT".getBytes(), true); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true); Assert.assertEquals(0,result); - result = VariantContextUtils.findNumberofRepetitions("GCA".getBytes(),"GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); + result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); Assert.assertEquals(0,result); // Same tests but looking backward on string - result = VariantContextUtils.findNumberofRepetitions("AC".getBytes(),"ACAC".getBytes(), false); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), false); Assert.assertEquals(2,result); - result = VariantContextUtils.findNumberofRepetitions("AC".getBytes(),"ACACACAC".getBytes(), false); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false); Assert.assertEquals(4,result); - result = VariantContextUtils.findNumberofRepetitions("AC".getBytes(),"ACACACACGT".getBytes(), false); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false); Assert.assertEquals(0,result); - result = VariantContextUtils.findNumberofRepetitions("AC".getBytes(),"GTACACACAC".getBytes(), false); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), false); Assert.assertEquals(4,result); - result = VariantContextUtils.findNumberofRepetitions("GCA".getBytes(),"GTAGGGT".getBytes(), false); + result = 
GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false); Assert.assertEquals(0,result); - result = VariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(),"GCAGCAGTAGGGTGTACACACAC".getBytes(), false); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false); Assert.assertEquals(0,result); - result = VariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(),"GTAGGGTGTACACACACGCAGCAT".getBytes(), false); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false); Assert.assertEquals(0,result); - result = VariantContextUtils.findNumberofRepetitions("GCA".getBytes(),"GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); + result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); Assert.assertEquals(3,result); // test logic to get repeat unit and number of repeats from covariate value @@ -208,8 +204,8 @@ public class RepeatCovariatesUnitTest { Assert.assertEquals(rurlValM,rurlValI); - int fw = VariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(offset+1,readLength).getBytes(),true); - int bw = VariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(0,offset+1).getBytes(),false); + int fw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(offset+1,readLength).getBytes(),true); + int bw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(0,offset+1).getBytes(),false); Assert.assertEquals(Math.min(fw+bw,RepeatCovariate.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); } diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java index 071bd2cad..5e863f4f7 100644 --- 
a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java @@ -31,7 +31,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.ApplicationDetails; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java index b7c824360..e078e678b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.providers; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import java.util.*; diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java b/public/java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java index ea83aab53..d0ba0fa21 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import 
org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index a77341a5d..09f053187 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import java.util.*; @@ -200,7 +201,7 @@ public class VariantContextAdaptors { if ( isSNP(dbsnp) || isMNP(dbsnp) ) addPaddingBase = false; else if ( isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) - addPaddingBase = refBaseIsDash || VariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); + addPaddingBase = refBaseIsDash || GATKVariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); else return null; // can't handle anything else diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index ec51b2f53..c5f87d625 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -40,7 +40,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import 
org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.file.FSLockWithShared; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index 979cc2fbf..522414c00 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -35,7 +35,7 @@ import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.recalibration.BQSRMode; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 29f6ed388..c5a6fd624 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import 
org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.variant.variantcontext.*; @@ -249,7 +250,7 @@ public class VariantAnnotatorEngine { private VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc, final Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { - final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), loc), vc.getType()); + final String rsID = GATKVCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), loc), vc.getType()); // add the ID if appropriate if ( rsID != null ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 383b22295..dbb8ed5a6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -46,7 +46,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java index 77a4af1cd..668d3fd5f 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java @@ -33,7 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import java.io.PrintStream; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java index ac6d82375..8a5b3530e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -33,7 +33,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.variant.variantcontext.VariantContext; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java index d12ad3183..ed3ebe173 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java @@ -33,7 +33,7 @@ import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import java.io.PrintStream; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java index 82247d160..0423c6f0a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java @@ -36,7 +36,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import java.io.PrintStream; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java index be1e264c6..a0f943f7e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java @@ -43,7 +43,7 @@ import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import 
org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import java.io.PrintStream; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java index 7135bffce..77490be93 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java @@ -35,7 +35,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import java.util.Collections; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java index 4e05033ec..cabc2f467 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java @@ -33,7 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java index 6b2c0f75c..fe2b75464 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java @@ -45,7 +45,7 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.clipping.ClippingOp; import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java index 9f758706e..e24c725a6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java @@ -49,6 +49,7 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.VariantEvalUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -197,7 +198,7 @@ public class VariantEval extends RodWalker implements TreeRedu protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; @Argument(shortName="ploidy", fullName="samplePloidy", doc="Per-sample ploidy (number of chromosomes per sample)", required=false) - protected int ploidy = 
VariantContextUtils.DEFAULT_PLOIDY; + protected int ploidy = GATKVariantContextUtils.DEFAULT_PLOIDY; @Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false) private File ancestralAlignmentsFile = null; @@ -285,7 +286,7 @@ public class VariantEval extends RodWalker implements TreeRedu // Now that we have all the rods categorized, determine the sample list from the eval rods. Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), evals); - Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + Set vcfSamples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); // Load the sample list, using an intermediate tree set to sort the samples final Set allSampleNames = SampleUtils.getSamplesFromCommandLineInput(vcfSamples); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java index 5c16c7385..a63f32485 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.AnalysisModuleScanner; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java index 5ef414b00..de82b18cc 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java @@ -27,8 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; import java.util.Arrays; import java.util.List; @@ -51,7 +51,7 @@ public class TandemRepeat extends VariantStratifier { public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if ( eval == null || ! 
eval.isIndel() ) return ALL; - else if ( VariantContextUtils.isTandemRepeat(eval, ref.getForwardBases()) ) { + else if ( GATKVariantContextUtils.isTandemRepeat(eval, ref.getForwardBases()) ) { print("REPEAT", eval, ref); return REPEAT; } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java index d792e4c67..681d32f2d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java @@ -28,7 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manage import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index c54a57385..d0d6a68a8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -38,6 +38,7 @@ import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.*; import 
org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -135,14 +136,14 @@ public class CombineVariants extends RodWalker implements Tree protected VariantContextWriter vcfWriter = null; @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false) - public VariantContextUtils.GenotypeMergeType genotypeMergeOption = null; + public GATKVariantContextUtils.GenotypeMergeType genotypeMergeOption = null; @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) - public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; + public GATKVariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; @Hidden @Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false) - public VariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = VariantContextUtils.MultipleAllelesMergeType.BY_TYPE; + public GATKVariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE; /** * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided. 
@@ -203,12 +204,12 @@ public class CombineVariants extends RodWalker implements Tree validateAnnotateUnionArguments(); if ( PRIORITY_STRING == null && genotypeMergeOption == null) { - genotypeMergeOption = VariantContextUtils.GenotypeMergeType.UNSORTED; + genotypeMergeOption = GATKVariantContextUtils.GenotypeMergeType.UNSORTED; //PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); Deleted by Ami (7/10/12) logger.info("Priority string is not provided, using arbitrary genotyping order: "+priority); } - if (genotypeMergeOption == VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE && + if (genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE && !SampleUtils.verifyUniqueSamplesNames(vcfRods)) throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered."); @@ -232,7 +233,7 @@ public class CombineVariants extends RodWalker implements Tree private void validateAnnotateUnionArguments() { Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); - if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) + if ( genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); if ( PRIORITY_STRING != null){ @@ -278,7 +279,7 @@ public class CombineVariants extends RodWalker implements Tree List mergedVCs = new ArrayList(); - if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { + if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); // TODO -- clean this up in a refactoring @@ -296,13 +297,13 @@ public class CombineVariants extends RodWalker implements Tree // iterate over the types so that it's deterministic for 
(VariantContext.Type type : VariantContext.Type.values()) { if (VCsByType.containsKey(type)) - mergedVCs.add(VariantContextUtils.simpleMerge(VCsByType.get(type), - priority, rodNames.size() , filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), + priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } } - else if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { - mergedVCs.add(VariantContextUtils.simpleMerge(vcs, + else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { + mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java index 8d5e7c2b8..048c7ef77 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java @@ -32,7 +32,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.vcf.VCFHeader; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java index e94a771d3..e4d182d13 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java @@ -39,12 +39,12 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.text.ListFileUtils; import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; import java.io.File; import java.util.*; @@ -204,7 +204,7 @@ public class SelectHeaders extends RodWalker implements TreeRe } } - TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); VCFHeader vcfHeader = new VCFHeader(headerLines, vcfSamples); vcfHeader.setWriteEngineHeaders(includeEngineHeaders); vcfWriter.writeHeader(vcfHeader); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index dfe604a7d..4d30408d8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -337,7 +338,7 @@ public class SelectVariants extends RodWalker implements TreeR List rodNames = Arrays.asList(variantCollection.variants.getName()); vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); - TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); @@ -661,7 +662,7 @@ public class SelectVariants extends RodWalker implements TreeR // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate) if ( vc.getAlleles().size() != sub.getAlleles().size() ) - newGC = VariantContextUtils.stripPLsAndAD(sub.getGenotypes()); + newGC = GATKVariantContextUtils.stripPLsAndAD(sub.getGenotypes()); // if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags if ( vc.getNSamples() != sub.getNSamples() ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index 17d5ab1b1..5bf5b96e3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -35,13 +35,13 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; import java.util.*; @@ -256,7 +256,7 @@ public class VariantValidationAssessor extends RodWalker //if ( popFile != null ) { // throw new StingException("We still need to implement this!"); //} else { - return VariantContextUtils.computeHardyWeinbergPvalue(vc); + return GATKVariantContextUtils.computeHardyWeinbergPvalue(vc); //} } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 91057c812..1ea85df47 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -29,6 +29,7 @@ import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import 
org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -180,7 +181,7 @@ public class VariantsToTable extends RodWalker { if ( !genotypeFieldsToTake.isEmpty() ) { Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), variants); - TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); samples.addAll(vcfSamples); // optimization: if there are no samples, we don't have to worry about any genotype fields diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 61746cbf1..5afeccffe 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -119,7 +119,7 @@ public class VariantsToVCF extends RodWalker { if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) ) return 0; - String rsID = dbsnp == null ? null : VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbsnp.dbsnp, context.getLocation()), VariantContext.Type.SNP); + String rsID = dbsnp == null ? 
null : GATKVCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbsnp.dbsnp, context.getLocation()), VariantContext.Type.SNP); Collection contexts = getVariantContexts(tracker, ref); diff --git a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java index 93246fd6f..10fb606f9 100644 --- a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java +++ b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java @@ -36,7 +36,7 @@ import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.variant.bcf2.BCF2Codec; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.vcf.VCFCodec; import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.sting.utils.exceptions.UserException; diff --git a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java index 05468c6c2..74009682a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java +++ b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java @@ -33,7 +33,7 @@ import org.apache.commons.math.MathException; import org.apache.commons.math.distribution.NormalDistribution; import org.apache.commons.math.distribution.NormalDistributionImpl; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; import java.io.Serializable; diff --git a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java index b316d1117..e2edf7421 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java +++ b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java @@ -28,7 +28,7 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; import java.util.*; diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index f158308b4..b1de89dd8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -29,11 +29,11 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.sting.utils.text.ListFileUtils; import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; import java.io.File; import java.io.FileNotFoundException; @@ -117,15 +117,15 @@ public class SampleUtils { } public static Set getSampleList(Map headers) { - return getSampleList(headers, VariantContextUtils.GenotypeMergeType.PRIORITIZE); + return getSampleList(headers, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE); } - public static Set getSampleList(Map headers, VariantContextUtils.GenotypeMergeType mergeOption) { + public static Set getSampleList(Map headers, GATKVariantContextUtils.GenotypeMergeType mergeOption) { Set samples = 
new TreeSet(); for ( Map.Entry val : headers.entrySet() ) { VCFHeader header = val.getValue(); for ( String sample : header.getGenotypeSamples() ) { - samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == VariantContextUtils.GenotypeMergeType.UNIQUIFY)); + samples.add(GATKVariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY)); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index 794caa315..8c7bce6ac 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -32,7 +32,7 @@ import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMUtils; import org.apache.log4j.Logger; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ReadUtils; diff --git a/public/java/src/org/broadinstitute/variant/utils/Pair.java b/public/java/src/org/broadinstitute/sting/utils/collections/Pair.java similarity index 98% rename from public/java/src/org/broadinstitute/variant/utils/Pair.java rename to public/java/src/org/broadinstitute/sting/utils/collections/Pair.java index 858d5fbd7..4c00331a9 100644 --- a/public/java/src/org/broadinstitute/variant/utils/Pair.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/Pair.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.variant.utils; +package org.broadinstitute.sting.utils.collections; public class Pair { diff --git a/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java b/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java index 1072fb1b7..39f5b06c6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java @@ -29,7 +29,7 @@ import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 00dd037db..76ccede62 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -30,7 +30,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java 
b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java index 099554a2c..bb0dc670b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java @@ -37,7 +37,7 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.JVMUtils; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 403e166c5..7374dda14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -39,7 +39,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 46eceefd5..29f8c8dcd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ 
-32,7 +32,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.NGSPlatform; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.variant.utils.BaseUtils; diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java index b2069c7ee..cbc7c01ed 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java @@ -31,7 +31,7 @@ import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.vcf.*; @@ -147,4 +147,49 @@ public class GATKVCFUtils { return VCFUtils.withUpdatedContigs(header, engine.getArguments().referenceFile, engine.getMasterSequenceDictionary()); } + public static String rsIDOfFirstRealVariant(List VCs, VariantContext.Type type) { + if ( VCs == null ) + return null; + + String rsID = null; + for ( VariantContext vc : VCs ) { + if ( vc.getType() == type ) { + rsID = vc.getID(); + break; + } + } + + return rsID; + } + + /** + * Read all of the VCF records from source into memory, returning the header and the VariantContexts + * + * SHOULD ONLY BE USED FOR UNIT/INTEGRATION TESTING PURPOSES! 
+ * + * @param source the file to read, must be in VCF4 format + * @return + * @throws java.io.IOException + */ + public static Pair> readVCF(final File source) throws IOException { + // read in the features + final List vcs = new ArrayList(); + final VCFCodec codec = new VCFCodec(); + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); + FeatureCodecHeader header = codec.readHeader(pbs); + pbs.close(); + + pbs = new PositionalBufferedStream(new FileInputStream(source)); + pbs.skip(header.getHeaderEnd()); + + final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); + + while ( ! pbs.isDone() ) { + final VariantContext vc = codec.decode(pbs); + if ( vc != null ) + vcs.add(vc); + } + + return new Pair>(vcfHeader, vcs); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 82241ad55..2ae289214 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -25,12 +25,79 @@ package org.broadinstitute.sting.utils.variant; +import com.google.java.contract.Requires; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broad.tribble.TribbleException; +import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFConstants; + +import java.io.Serializable; +import java.util.*; public class GATKVariantContextUtils { + private static Logger logger = 
Logger.getLogger(GATKVariantContextUtils.class); + + public static final int DEFAULT_PLOIDY = 2; + public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. + private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + public final static String MERGE_FILTER_PREFIX = "filterIn"; + public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; + public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; + public final static String MERGE_INTERSECTION = "Intersection"; + + public enum GenotypeMergeType { + /** + * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. + */ + UNIQUIFY, + /** + * Take genotypes in priority order (see the priority argument). + */ + PRIORITIZE, + /** + * Take the genotypes in any order. + */ + UNSORTED, + /** + * Require that all samples/genotypes be unique between all inputs. + */ + REQUIRE_UNIQUE + } + + public enum FilteredRecordMergeType { + /** + * Union - leaves the record if any record is unfiltered. + */ + KEEP_IF_ANY_UNFILTERED, + /** + * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. + */ + KEEP_IF_ALL_UNFILTERED, + /** + * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. + */ + KEEP_UNCONDITIONAL + } + + public enum MultipleAllelesMergeType { + /** + * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. + */ + BY_TYPE, + /** + * Merge all allele types at the same start position into the same VCF record. 
+ */ + MIX_TYPES + } + /** * create a genome location, given a variant context * @param genomeLocParser parser @@ -41,4 +108,885 @@ public class GATKVariantContextUtils { return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); } + /** + * Returns true iff VC is an non-complex indel where every allele represents an expansion or + * contraction of a series of identical bases in the reference. + * + * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT + * + * If VC = -/CT, then this function returns true because the CT insertion matches exactly the + * upcoming reference. + * If VC = -/CTA then this function returns false because the CTA isn't a perfect match + * + * Now consider deletions: + * + * If VC = CT/- then again the same logic applies and this returns true + * The case of CTA/- makes no sense because it doesn't actually match the reference bases. + * + * The logic of this function is pretty simple. Take all of the non-null alleles in VC. For + * each insertion allele of n bases, check if that allele matches the next n reference bases. + * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n, + * as it must necessarily match the first n bases. If this test returns true for all + * alleles you are a tandem repeat, otherwise you are not. + * + * @param vc + * @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference + * @return + */ + @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) + public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { + final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); + if ( ! vc.isIndel() ) // only indels are tandem repeats + return false; + + final Allele ref = vc.getReference(); + + for ( final Allele allele : vc.getAlternateAlleles() ) { + if ( ! 
isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) ) + return false; + } + + // we've passed all of the tests, so we are a repeat + return true; + } + + /** + * + * @param vc + * @param refBasesStartingAtVCWithPad + * @return + */ + @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) + public static Pair,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { + final boolean VERBOSE = false; + final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); + if ( ! vc.isIndel() ) // only indels are tandem repeats + return null; + + final Allele refAllele = vc.getReference(); + final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); + + byte[] repeatUnit = null; + final ArrayList lengths = new ArrayList(); + + for ( final Allele allele : vc.getAlternateAlleles() ) { + Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); + + final int[] repetitionCount = result.first; + // repetition count = 0 means allele is not a tandem expansion of context + if (repetitionCount[0] == 0 || repetitionCount[1] == 0) + return null; + + if (lengths.size() == 0) { + lengths.add(repetitionCount[0]); // add ref allele length only once + } + lengths.add(repetitionCount[1]); // add this alt allele's length + + repeatUnit = result.second; + if (VERBOSE) { + System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad); + System.out.println("Ref:"+refAllele.toString()+" Count:" + String.valueOf(repetitionCount[0])); + System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1])); + System.out.println("RU:"+new String(repeatUnit)); + } + } + + return new Pair, byte[]>(lengths,repeatUnit); + } + + public static Pair getNumTandemRepeatUnits(final byte[] refBases, final byte[] 
altBases, final byte[] remainingRefContext) { + /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units. + Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2. + */ + + byte[] longB; + // find first repeat unit based on either ref or alt, whichever is longer + if (altBases.length > refBases.length) + longB = altBases; + else + longB = refBases; + + // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units + // for example, -*,CACA needs to first be decomposed into (CA)2 + final int repeatUnitLength = findRepeatedSubstring(longB); + final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength); + + final int[] repetitionCount = new int[2]; + // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases) + int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases, true); + repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; + repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; + + return new Pair(repetitionCount, repeatUnit); + + } + + /** + * Find out if a string can be represented as a tandem number of substrings. + * For example ACTACT is a 2-tandem of ACT, + * but ACTACA is not. 
+ * + * @param bases String to be tested + * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't + * be represented as one, it will be just the length of the input string) + */ + public static int findRepeatedSubstring(byte[] bases) { + + int repLength; + for (repLength=1; repLength <=bases.length; repLength++) { + final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength); + boolean allBasesMatch = true; + for (int start = repLength; start < bases.length; start += repLength ) { + // check that remaining of string is exactly equal to repeat unit + final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length); + if (!Arrays.equals(candidateRepeatUnit, basePiece)) { + allBasesMatch = false; + break; + } + } + if (allBasesMatch) + return repLength; + } + + return repLength; + } + + /** + * Helper routine that finds number of repetitions a string consists of. + * For example, for string ATAT and repeat unit AT, number of repetitions = 2 + * @param repeatUnit Substring + * @param testString String to test + * @oaram lookForward Look for repetitions forward (at beginning of string) or backward (at end of string) + * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's + */ + public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { + int numRepeats = 0; + if (lookForward) { + // look forward on the test string + for (int start = 0; start < testString.length; start += repeatUnit.length) { + int end = start + repeatUnit.length; + byte[] unit = Arrays.copyOfRange(testString,start, end); + if(Arrays.equals(unit,repeatUnit)) + numRepeats++; + else + break; + } + return numRepeats; + } + + // look backward. 
For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2 + // look forward on the test string + for (int start = testString.length - repeatUnit.length; start >= 0; start -= repeatUnit.length) { + int end = start + repeatUnit.length; + byte[] unit = Arrays.copyOfRange(testString,start, end); + if(Arrays.equals(unit,repeatUnit)) + numRepeats++; + else + break; + } + return numRepeats; + } + + /** + * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference + * @param ref + * @param alt + * @param refBasesStartingAtVCWithoutPad + * @return + */ + protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) { + if ( ! Allele.oneIsPrefixOfOther(ref, alt) ) + return false; // we require one allele be a prefix of another + + if ( ref.length() > alt.length() ) { // we are a deletion + return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2); + } else { // we are an insertion + return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); + } + } + + protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) { + final String potentialRepeat = l.substring(s.length()); // skip s bases + + for ( int i = 0; i < minNumberOfMatches; i++) { + final int start = i * potentialRepeat.length(); + final int end = (i+1) * potentialRepeat.length(); + if ( ref.length() < end ) + return false; // we ran out of bases to test + final String refSub = ref.substring(start, end); + if ( ! 
refSub.equals(potentialRepeat) ) + return false; // repeat didn't match, fail + } + + return true; // we passed all tests, we matched + } + + /** + * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) + * + * @param vc variant context with genotype likelihoods + * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** + * @param assignGenotypes true if we should update the genotypes based on the (subsetted) PLs + * @return genotypes + */ + public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes) { + + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); + + // samples + final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int numNewAltAlleles = allelesToUse.size() - 1; + + // which PLs should be carried forward? + ArrayList likelihoodIndexesToUse = null; + + // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, + // then we can keep the PLs as is; otherwise, we determine which ones to keep + if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { + likelihoodIndexesToUse = new ArrayList(30); + + final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + if ( allelesToUse.contains(vc.getAlternateAllele(i)) ) + altAlleleIndexToUse[i] = true; + } + + // numLikelihoods takes total # of alleles. 
Use default # of chromosomes (ploidy) = 2 + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(1 + numOriginalAltAlleles, DEFAULT_PLOIDY); + for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + // consider this entry only if both of the alleles are good + if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) ) + likelihoodIndexesToUse.add(PLindex); + } + } + + // create the new genotypes + for ( int k = 0; k < oldGTs.size(); k++ ) { + final Genotype g = oldGTs.get(sampleIndices.get(k)); + if ( !g.hasLikelihoods() ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); + continue; + } + + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + double[] newLikelihoods; + if ( likelihoodIndexesToUse == null ) { + newLikelihoods = originalLikelihoods; + } else { + newLikelihoods = new double[likelihoodIndexesToUse.size()]; + int newIndex = 0; + for ( int oldIndex : likelihoodIndexesToUse ) + newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + // if there is no mass on the (new) likelihoods, then just no-call the sample + if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); + } + else { + final GenotypeBuilder gb = new GenotypeBuilder(g); + + if ( numNewAltAlleles == 0 ) + gb.noPL(); + else + gb.PL(newLikelihoods); + + // if we weren't asked to assign a genotype, then just no-call the sample + if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { + gb.alleles(NO_CALL_ALLELES); + } + 
else { + // find the genotype with maximum likelihoods + int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + + gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); + if ( numNewAltAlleles != 0 ) gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); + } + newGTs.add(gb.make()); + } + } + + return newGTs; + } + + /** + * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * + * @param vc variant context with genotype likelihoods + * @return genotypes context + */ + public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { + return subsetDiploidAlleles(vc, vc.getAlleles(), true); + } + + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For VC has A/B/C alleles, returns A/B and A/C contexts. + * Genotypes are all no-calls now (it's not possible to fix them easily) + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @return a list of bi-allelic (or monomorphic) variant context + */ + public static List splitVariantContextToBiallelics(final VariantContext vc) { + if ( ! 
vc.isVariant() || vc.isBiallelic() ) + // non variant or biallelics already satisfy the contract + return Collections.singletonList(vc); + else { + final List biallelics = new LinkedList(); + + for ( final Allele alt : vc.getAlternateAlleles() ) { + VariantContextBuilder builder = new VariantContextBuilder(vc); + final List alleles = Arrays.asList(vc.getReference(), alt); + builder.alleles(alleles); + builder.genotypes(subsetDiploidAlleles(vc, alleles, false)); + VariantContextUtils.calculateChromosomeCounts(builder, true); + biallelics.add(reverseTrimAlleles(builder.make())); + } + + return biallelics; + } + } + + public static Genotype removePLsAndAD(final Genotype g) { + return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; + } + + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. + * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name + * + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? 
+ * @return new VariantContext representing the merge of unsortedVCs + */ + public static VariantContext simpleMerge(final Collection unsortedVCs, + final List priorityListOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { + int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); + return simpleMerge(unsortedVCs,priorityListOfVCs,originalNumOfVCs,filteredRecordMergeType,genotypeMergeOptions,annotateOrigin,printMessages,setKey,filteredAreUncalled,mergeInfoWithMaxAC); + } + + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. + * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name. + * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use + * SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge. + * + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? 
+ * @return new VariantContext representing the merge of unsortedVCs + */ + public static VariantContext simpleMerge(final Collection unsortedVCs, + final List priorityListOfVCs, + final int originalNumOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { + + if ( unsortedVCs == null || unsortedVCs.size() == 0 ) + return null; + + if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) + throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); + + if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) + throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); + + final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); + // Make sure all variant contexts are padded with reference base in case of indels if necessary + final List VCs = new ArrayList(); + + for (final VariantContext vc : preFilteredVCs) { + if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) + VCs.add(vc); + } + if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled + return null; + + // establish the baseline info from the first VC + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + final Allele refAllele = determineReferenceAllele(VCs); + + final Set alleles = new LinkedHashSet(); + final Set filters = new HashSet(); + final Map attributes = new LinkedHashMap(); + final Set inconsistentAttributes = new HashSet(); + final Set variantSources = new HashSet(); // contains the set of sources we found in our set of VCs that are variant + final Set rsIDs = new LinkedHashSet(1); // most of the time there's one id + + VariantContext longestVC = first; + int depth = 0; + int maxAC = -1; + final Map attributesWithMaxAC = new LinkedHashMap(); + double log10PError = CommonInfo.NO_LOG10_PERROR; + VariantContext vcWithMaxAC = null; + GenotypesContext genotypes = GenotypesContext.create(); + + // counting the number of filtered and variant VCs + int nFiltered = 0; + + boolean remapped = false; + + // cycle through and add info from the other VCs, making sure the loc/reference matches + + for ( final VariantContext vc : VCs ) { + if ( longestVC.getStart() != vc.getStart() ) + throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); + + if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) + longestVC = vc; // get the longest location + + nFiltered += vc.isFiltered() ? 
1 : 0; + if ( vc.isVariant() ) variantSources.add(vc.getSource()); + + AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); + remapped = remapped || alleleMapping.needsRemapping(); + + alleles.addAll(alleleMapping.values()); + + mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); + + // We always take the QUAL of the first VC with a non-MISSING qual for the combined value + if ( log10PError == CommonInfo.NO_LOG10_PERROR ) + log10PError = vc.getLog10PError(); + + filters.addAll(vc.getFilters()); + + // + // add attributes + // + // special case DP (add it up) and ID (just preserve it) + // + if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + if ( vc.hasID() ) rsIDs.add(vc.getID()); + if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { + String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); + // lets see if the string contains a , separator + if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { + List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); + for (String alleleCount : alleleCountArray) { + final int ac = Integer.valueOf(alleleCount.trim()); + if (ac > maxAC) { + maxAC = ac; + vcWithMaxAC = vc; + } + } + } else { + final int ac = Integer.valueOf(rawAlleleCounts); + if (ac > maxAC) { + maxAC = ac; + vcWithMaxAC = vc; + } + } + } + + for (final Map.Entry p : vc.getAttributes().entrySet()) { + String key = p.getKey(); + // if we don't like the key already, don't go anywhere + if ( ! inconsistentAttributes.contains(key) ) { + final boolean alreadyFound = attributes.containsKey(key); + final Object boundValue = attributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + + if ( alreadyFound && ! 
boundValue.equals(p.getValue()) && ! boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + //System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, p.getValue()); + inconsistentAttributes.add(key); + attributes.remove(key); + } else if ( ! alreadyFound || boundIsMissingValue ) { // no value + //if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), p.getValue()); + attributes.put(key, p.getValue()); + } + } + } + } + + // if we have more alternate alleles in the merged VC than in one or more of the + // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD + for ( final VariantContext vc : VCs ) { + if (vc.getAlleles().size() == 1) + continue; + if ( hasPLIncompatibleAlleles(alleles, vc.getAlleles())) { + if ( ! genotypes.isEmpty() ) { + logger.debug(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. 
single=%s", + vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles())); + } + genotypes = stripPLsAndAD(genotypes); + // this will remove stale AC,AF attributed from vc + VariantContextUtils.calculateChromosomeCounts(vc, attributes, true); + break; + } + } + + // take the VC with the maxAC and pull the attributes into a modifiable map + if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { + attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); + } + + // if at least one record was unfiltered and we want a union, clear all of the filters + if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) + filters.clear(); + + + if ( annotateOrigin ) { // we care about where the call came from + String setValue; + if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // nothing was unfiltered + setValue = MERGE_INTERSECTION; + else if ( nFiltered == VCs.size() ) // everything was filtered out + setValue = MERGE_FILTER_IN_ALL; + else if ( variantSources.isEmpty() ) // everyone was reference + setValue = MERGE_REF_IN_ALL; + else { + final LinkedHashSet s = new LinkedHashSet(); + for ( final VariantContext vc : VCs ) + if ( vc.isVariant() ) + s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); + setValue = Utils.join("-", s); + } + + if ( setKey != null ) { + attributes.put(setKey, setValue); + if( mergeInfoWithMaxAC && vcWithMaxAC != null ) { + attributesWithMaxAC.put(setKey, setValue); + } + } + } + + if ( depth > 0 ) + attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); + + final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); + + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); + builder.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()); + builder.alleles(alleles); + builder.genotypes(genotypes); + builder.log10PError(log10PError); + builder.filters(filters.isEmpty() ? filters : new TreeSet(filters)); + builder.attributes(new TreeMap(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); + + // Trim the padded bases of all alleles if necessary + final VariantContext merged = builder.make(); + if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); + return merged; + } + + private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { + final Iterator it1 = alleleSet1.iterator(); + final Iterator it2 = alleleSet2.iterator(); + + while ( it1.hasNext() && it2.hasNext() ) { + final Allele a1 = it1.next(); + final Allele a2 = it2.next(); + if ( ! a1.equals(a2) ) + return true; + } + + // by this point, at least one of the iterators is empty. All of the elements + // we've compared are equal up until this point. But it's possible that the + // sets aren't the same size, which is indicated by the test below. If they + // are of the same size, though, the sets are compatible + return it1.hasNext() || it2.hasNext(); + } + + public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { + GenotypesContext newGs = GenotypesContext.create(genotypes.size()); + + for ( final Genotype g : genotypes ) { + newGs.add(removePLsAndAD(g)); + } + + return newGs; + } + + static private Allele determineReferenceAllele(List VCs) { + Allele ref = null; + + for ( VariantContext vc : VCs ) { + Allele myRef = vc.getReference(); + if ( ref == null || ref.length() < myRef.length() ) + ref = myRef; + else if ( ref.length() == myRef.length() && ! 
ref.equals(myRef) ) + throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); + } + + return ref; + } + + static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set allAlleles) { + if ( refAllele.equals(vc.getReference()) ) + return new AlleleMapper(vc); + else { + // we really need to do some work. The refAllele is the longest reference allele seen at this + // start site. So imagine it is: + // + // refAllele: ACGTGA + // myRef: ACGT + // myAlt: A + // + // We need to remap all of the alleles in vc to include the extra GA so that + // myRef => refAllele and myAlt => AGA + // + + Allele myRef = vc.getReference(); + if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); + byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); + +// System.out.printf("Remapping allele at %s%n", vc); +// System.out.printf("ref %s%n", refAllele); +// System.out.printf("myref %s%n", myRef ); +// System.out.printf("extrabases %s%n", new String(extraBases)); + + Map map = new HashMap(); + for ( Allele a : vc.getAlleles() ) { + if ( a.isReference() ) + map.put(a, refAllele); + else { + Allele extended = Allele.extend(a, extraBases); + for ( Allele b : allAlleles ) + if ( extended.equals(b) ) + extended = b; +// System.out.printf(" Extending %s => %s%n", a, extended); + map.put(a, extended); + } + } + + // debugging +// System.out.printf("mapping %s%n", map); + + return new AlleleMapper(map); + } + } + + public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { + if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) + throw new IllegalArgumentException("Cannot merge calls by priority with a null 
priority list"); + + if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) + return new ArrayList(unsortedVCs); + else { + ArrayList sorted = new ArrayList(unsortedVCs); + Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); + return sorted; + } + } + + private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { + //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE + for ( Genotype g : oneVC.getGenotypes() ) { + String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); + if ( ! mergedGenotypes.containsSample(name) ) { + // only add if the name is new + Genotype newG = g; + + if ( uniqifySamples || alleleMapping.needsRemapping() ) { + final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); + newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); + } + + mergedGenotypes.add(newG); + } + } + } + + public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) { + return uniqify ? sampleName + "." 
+ trackName : sampleName; + } + + public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { + + // see whether we need to trim common reference base from all alleles + final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, false); + if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 ) + return inputVC; + + final List alleles = new ArrayList(); + final GenotypesContext genotypes = GenotypesContext.create(); + final Map originalToTrimmedAlleleMap = new HashMap(); + + for (final Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + final byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent); + final Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // now we can recreate new genotypes with trimmed alleles + for ( final Genotype genotype : inputVC.getGenotypes() ) { + final List originalAlleles = genotype.getAlleles(); + final List trimmedAlleles = new ArrayList(); + for ( final Allele a : originalAlleles ) { + if ( a.isCalled() ) + trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); + else + trimmedAlleles.add(Allele.NO_CALL); + } + genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make()); + } + + return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() - 1).alleles(alleles).genotypes(genotypes).make(); + } + + public static int computeReverseClipping(final List unclippedAlleles, + final byte[] ref, + final int forwardClipping, + final boolean allowFullClip) { + int clipping = 0; + boolean stillClipping = true; + + while ( stillClipping ) { + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + continue; + + // we need 
to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong + // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). + if ( a.length() - clipping == 0 ) + return clipping - (allowFullClip ? 0 : 1); + + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) { + stillClipping = false; + } + else if ( ref.length == clipping ) { + if ( allowFullClip ) + stillClipping = false; + else + return -1; + } + else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { + stillClipping = false; + } + } + if ( stillClipping ) + clipping++; + } + + return clipping; + } + + public static double computeHardyWeinbergPvalue(VariantContext vc) { + if ( vc.getCalledChrCount() == 0 ) + return 0.0; + return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); + } + + public static boolean requiresPaddingBase(final List alleles) { + + // see whether one of the alleles would be null if trimmed through + + for ( final String allele : alleles ) { + if ( allele.isEmpty() ) + return true; + } + + int clipping = 0; + Character currentBase = null; + + while ( true ) { + for ( final String allele : alleles ) { + if ( allele.length() - clipping == 0 ) + return true; + + char myBase = allele.charAt(clipping); + if ( currentBase == null ) + currentBase = myBase; + else if ( currentBase != myBase ) + return false; + } + + clipping++; + currentBase = null; + } + } + + private static class AlleleMapper { + private VariantContext vc = null; + private Map map = null; + public AlleleMapper(VariantContext vc) { this.vc = vc; } + public AlleleMapper(Map map) { this.map = map; } + public boolean needsRemapping() { return this.map != null; } + public Collection values() { return map != null ? map.values() : vc.getAlleles(); } + public Allele remap(Allele a) { return map != null && map.containsKey(a) ? 
map.get(a) : a; } + + public List remap(List as) { + List newAs = new ArrayList(); + for ( Allele a : as ) { + //System.out.printf(" Remapping %s => %s%n", a, remap(a)); + newAs.add(remap(a)); + } + return newAs; + } + } + + private static class CompareByPriority implements Comparator, Serializable { + List priorityListOfVCs; + public CompareByPriority(List priorityListOfVCs) { + this.priorityListOfVCs = priorityListOfVCs; + } + + private int getIndex(VariantContext vc) { + int i = priorityListOfVCs.indexOf(vc.getSource()); + if ( i == -1 ) throw new IllegalArgumentException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); + return i; + } + + public int compare(VariantContext vc1, VariantContext vc2) { + return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); + } + } } diff --git a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java index bbd02e0a6..76b49edb9 100644 --- a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java @@ -26,8 +26,6 @@ package org.broadinstitute.variant.utils; import net.sf.samtools.util.StringUtil; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; import java.util.Random; @@ -176,7 +174,7 @@ public class BaseUtils { if ( baseIndex == Base.N.ordinal() ) { bases[i] = 'N'; } else if ( errorOnBadReferenceBase && baseIndex == -1 ) { - throw new UserException.BadInput("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'"); + throw new IllegalStateException("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'"); } } return bases; @@ -517,7 +515,7 @@ public class BaseUtils { case 'N': return 'N'; default: - throw new ReviewedStingException("base must be 
A, C, G or T. " + (char) base + " is not a valid base."); + throw new IllegalArgumentException("base must be A, C, G or T. " + (char) base + " is not a valid base."); } } } diff --git a/public/java/src/org/broadinstitute/variant/utils/GeneralUtils.java b/public/java/src/org/broadinstitute/variant/utils/GeneralUtils.java index 79014a0eb..2dbc865b5 100644 --- a/public/java/src/org/broadinstitute/variant/utils/GeneralUtils.java +++ b/public/java/src/org/broadinstitute/variant/utils/GeneralUtils.java @@ -141,13 +141,6 @@ public class GeneralUtils { return normalized; } - public static double sum(double[] values) { - double s = 0.0; - for (double v : values) - s += v; - return s; - } - public static double arrayMax(final double[] array) { return array[maxElementIndex(array, array.length)]; } diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/CommonInfo.java b/public/java/src/org/broadinstitute/variant/variantcontext/CommonInfo.java index fd3227dbf..16fa52ee0 100644 --- a/public/java/src/org/broadinstitute/variant/variantcontext/CommonInfo.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/CommonInfo.java @@ -36,7 +36,7 @@ import java.util.*; * * @author depristo */ -final class CommonInfo { +public final class CommonInfo { public static final double NO_LOG10_PERROR = 1.0; private static Set NO_FILTERS = Collections.emptySet(); diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java index a6378951e..fa2b5c9e5 100644 --- a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java @@ -29,27 +29,16 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; -import 
org.apache.commons.lang.ArrayUtils; import org.broad.tribble.TribbleException; -import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.utils.Pair; import org.broadinstitute.variant.vcf.*; -import java.io.Serializable; import java.util.*; public class VariantContextUtils { - public final static String MERGE_INTERSECTION = "Intersection"; - public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; - public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; - public final static String MERGE_FILTER_PREFIX = "filterIn"; - public static final int DEFAULT_PLOIDY = 2; - public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. private static Set MISSING_KEYS_WARNED_ABOUT = new HashSet(); - private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); final public static JexlEngine engine = new JexlEngine(); private final static boolean ASSUME_MISSING_FIELDS_ARE_STRINGS = false; @@ -158,10 +147,6 @@ public class VariantContextUtils { builder.attributes(calculateChromosomeCounts(vc, new HashMap(vc.getAttributes()), removeStaleValues, founderIds)); } - public static Genotype removePLsAndAD(final Genotype g) { - return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; - } - public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) { VCFCompoundHeaderLine metaData = header.getFormatHeaderLine(field); if ( metaData == null ) metaData = header.getInfoHeaderLine(field); @@ -180,443 +165,6 @@ public class VariantContextUtils { return metaData; } - /** - * Returns true iff VC is an non-complex indel where every allele represents an expansion or - * contraction of a series of identical bases in the reference. 
- * - * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT - * - * If VC = -/CT, then this function returns true because the CT insertion matches exactly the - * upcoming reference. - * If VC = -/CTA then this function returns false because the CTA isn't a perfect match - * - * Now consider deletions: - * - * If VC = CT/- then again the same logic applies and this returns true - * The case of CTA/- makes no sense because it doesn't actually match the reference bases. - * - * The logic of this function is pretty simple. Take all of the non-null alleles in VC. For - * each insertion allele of n bases, check if that allele matches the next n reference bases. - * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n, - * as it must necessarily match the first n bases. If this test returns true for all - * alleles you are a tandem repeat, otherwise you are not. - * - * @param vc - * @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference - * @return - */ - @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) - public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { - final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); - if ( ! vc.isIndel() ) // only indels are tandem repeats - return false; - - final Allele ref = vc.getReference(); - - for ( final Allele allele : vc.getAlternateAlleles() ) { - if ( ! 
isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) ) - return false; - } - - // we've passed all of the tests, so we are a repeat - return true; - } - - /** - * - * @param vc - * @param refBasesStartingAtVCWithPad - * @return - */ - @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) - public static Pair,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { - final boolean VERBOSE = false; - final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); - if ( ! vc.isIndel() ) // only indels are tandem repeats - return null; - - final Allele refAllele = vc.getReference(); - final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); - - byte[] repeatUnit = null; - final ArrayList lengths = new ArrayList(); - - for ( final Allele allele : vc.getAlternateAlleles() ) { - Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); - - final int[] repetitionCount = result.first; - // repetition count = 0 means allele is not a tandem expansion of context - if (repetitionCount[0] == 0 || repetitionCount[1] == 0) - return null; - - if (lengths.size() == 0) { - lengths.add(repetitionCount[0]); // add ref allele length only once - } - lengths.add(repetitionCount[1]); // add this alt allele's length - - repeatUnit = result.second; - if (VERBOSE) { - System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad); - System.out.println("Ref:"+refAllele.toString()+" Count:" + String.valueOf(repetitionCount[0])); - System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1])); - System.out.println("RU:"+new String(repeatUnit)); - } - } - - return new Pair, byte[]>(lengths,repeatUnit); - } - - public static Pair getNumTandemRepeatUnits(final byte[] refBases, final byte[] 
altBases, final byte[] remainingRefContext) { - /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units. - Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2. - */ - - byte[] longB; - // find first repeat unit based on either ref or alt, whichever is longer - if (altBases.length > refBases.length) - longB = altBases; - else - longB = refBases; - - // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units - // for example, -*,CACA needs to first be decomposed into (CA)2 - final int repeatUnitLength = findRepeatedSubstring(longB); - final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength); - - final int[] repetitionCount = new int[2]; - // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases) - int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases, true); - repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; - repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; - - return new Pair(repetitionCount, repeatUnit); - - } - - /** - * Find out if a string can be represented as a tandem number of substrings. - * For example ACTACT is a 2-tandem of ACT, - * but ACTACA is not. 
- * - * @param bases String to be tested - * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't - * be represented as one, it will be just the length of the input string) - */ - public static int findRepeatedSubstring(byte[] bases) { - - int repLength; - for (repLength=1; repLength <=bases.length; repLength++) { - final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength); - boolean allBasesMatch = true; - for (int start = repLength; start < bases.length; start += repLength ) { - // check that remaining of string is exactly equal to repeat unit - final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length); - if (!Arrays.equals(candidateRepeatUnit, basePiece)) { - allBasesMatch = false; - break; - } - } - if (allBasesMatch) - return repLength; - } - - return repLength; - } - - /** - * Helper routine that finds number of repetitions a string consists of. - * For example, for string ATAT and repeat unit AT, number of repetitions = 2 - * @param repeatUnit Substring - * @param testString String to test - * @oaram lookForward Look for repetitions forward (at beginning of string) or backward (at end of string) - * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's - */ - public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { - int numRepeats = 0; - if (lookForward) { - // look forward on the test string - for (int start = 0; start < testString.length; start += repeatUnit.length) { - int end = start + repeatUnit.length; - byte[] unit = Arrays.copyOfRange(testString,start, end); - if(Arrays.equals(unit,repeatUnit)) - numRepeats++; - else - break; - } - return numRepeats; - } - - // look backward. 
For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2 - // look forward on the test string - for (int start = testString.length - repeatUnit.length; start >= 0; start -= repeatUnit.length) { - int end = start + repeatUnit.length; - byte[] unit = Arrays.copyOfRange(testString,start, end); - if(Arrays.equals(unit,repeatUnit)) - numRepeats++; - else - break; - } - return numRepeats; - } - - /** - * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference - * @param ref - * @param alt - * @param refBasesStartingAtVCWithoutPad - * @return - */ - protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) { - if ( ! Allele.oneIsPrefixOfOther(ref, alt) ) - return false; // we require one allele be a prefix of another - - if ( ref.length() > alt.length() ) { // we are a deletion - return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2); - } else { // we are an insertion - return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); - } - } - - protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) { - final String potentialRepeat = l.substring(s.length()); // skip s bases - - for ( int i = 0; i < minNumberOfMatches; i++) { - final int start = i * potentialRepeat.length(); - final int end = (i+1) * potentialRepeat.length(); - if ( ref.length() < end ) - return false; // we ran out of bases to test - final String refSub = ref.substring(start, end); - if ( ! 
refSub.equals(potentialRepeat) ) - return false; // repeat didn't match, fail - } - - return true; // we passed all tests, we matched - } - - /** - * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs - * - * @param vc variant context with genotype likelihoods - * @return genotypes context - */ - public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { - return subsetDiploidAlleles(vc, vc.getAlleles(), true); - } - - /** - * Split variant context into its biallelic components if there are more than 2 alleles - * - * For VC has A/B/C alleles, returns A/B and A/C contexts. - * Genotypes are all no-calls now (it's not possible to fix them easily) - * Alleles are right trimmed to satisfy VCF conventions - * - * If vc is biallelic or non-variant it is just returned - * - * Chromosome counts are updated (but they are by definition 0) - * - * @param vc a potentially multi-allelic variant context - * @return a list of bi-allelic (or monomorphic) variant context - */ - public static List splitVariantContextToBiallelics(final VariantContext vc) { - if ( ! 
vc.isVariant() || vc.isBiallelic() ) - // non variant or biallelics already satisfy the contract - return Collections.singletonList(vc); - else { - final List biallelics = new LinkedList(); - - for ( final Allele alt : vc.getAlternateAlleles() ) { - VariantContextBuilder builder = new VariantContextBuilder(vc); - final List alleles = Arrays.asList(vc.getReference(), alt); - builder.alleles(alleles); - builder.genotypes(subsetDiploidAlleles(vc, alleles, false)); - calculateChromosomeCounts(builder, true); - biallelics.add(reverseTrimAlleles(builder.make())); - } - - return biallelics; - } - } - - /** - * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) - * - * @param vc variant context with genotype likelihoods - * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** - * @param assignGenotypes true if we should update the genotypes based on the (subsetted) PLs - * @return genotypes - */ - public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes) { - - // the genotypes with PLs - final GenotypesContext oldGTs = vc.getGenotypes(); - - // samples - final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(); - - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final int numNewAltAlleles = allelesToUse.size() - 1; - - // which PLs should be carried forward? 
- ArrayList likelihoodIndexesToUse = null; - - // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, - // then we can keep the PLs as is; otherwise, we determine which ones to keep - if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { - likelihoodIndexesToUse = new ArrayList(30); - - final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) { - if ( allelesToUse.contains(vc.getAlternateAllele(i)) ) - altAlleleIndexToUse[i] = true; - } - - // numLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2 - final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(1 + numOriginalAltAlleles, DEFAULT_PLOIDY); - for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - // consider this entry only if both of the alleles are good - if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) ) - likelihoodIndexesToUse.add(PLindex); - } - } - - // create the new genotypes - for ( int k = 0; k < oldGTs.size(); k++ ) { - final Genotype g = oldGTs.get(sampleIndices.get(k)); - if ( !g.hasLikelihoods() ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); - continue; - } - - // create the new likelihoods array from the alleles we are allowed to use - final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - double[] newLikelihoods; - if ( likelihoodIndexesToUse == null ) { - newLikelihoods = originalLikelihoods; - } else { - newLikelihoods = new double[likelihoodIndexesToUse.size()]; - int newIndex = 0; - for ( int oldIndex : likelihoodIndexesToUse ) - newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; - - // might need to re-normalize - newLikelihoods = 
GeneralUtils.normalizeFromLog10(newLikelihoods, false, true); - } - - // if there is no mass on the (new) likelihoods, then just no-call the sample - if ( GeneralUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); - } - else { - final GenotypeBuilder gb = new GenotypeBuilder(g); - - if ( numNewAltAlleles == 0 ) - gb.noPL(); - else - gb.PL(newLikelihoods); - - // if we weren't asked to assign a genotype, then just no-call the sample - if ( !assignGenotypes || GeneralUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { - gb.alleles(NO_CALL_ALLELES); - } - else { - // find the genotype with maximum likelihoods - int PLindex = numNewAltAlleles == 0 ? 0 : GeneralUtils.maxElementIndex(newLikelihoods); - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - - gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); - if ( numNewAltAlleles != 0 ) gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); - } - newGTs.add(gb.make()); - } - } - - return newGTs; - } - - public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { - - // see whether we need to trim common reference base from all alleles - final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, false); - if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 ) - return inputVC; - - final List alleles = new ArrayList(); - final GenotypesContext genotypes = GenotypesContext.create(); - final Map originalToTrimmedAlleleMap = new HashMap(); - - for (final Allele a : inputVC.getAlleles()) { - if (a.isSymbolic()) { - alleles.add(a); - originalToTrimmedAlleleMap.put(a, a); - } else { - // get bases for current allele and create a new one with trimmed bases - final byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, 
a.length()-trimExtent); - final Allele trimmedAllele = Allele.create(newBases, a.isReference()); - alleles.add(trimmedAllele); - originalToTrimmedAlleleMap.put(a, trimmedAllele); - } - } - - // now we can recreate new genotypes with trimmed alleles - for ( final Genotype genotype : inputVC.getGenotypes() ) { - final List originalAlleles = genotype.getAlleles(); - final List trimmedAlleles = new ArrayList(); - for ( final Allele a : originalAlleles ) { - if ( a.isCalled() ) - trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); - else - trimmedAlleles.add(Allele.NO_CALL); - } - genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make()); - } - - return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() - 1).alleles(alleles).genotypes(genotypes).make(); - } - - public static int computeReverseClipping(final List unclippedAlleles, - final byte[] ref, - final int forwardClipping, - final boolean allowFullClip) { - int clipping = 0; - boolean stillClipping = true; - - while ( stillClipping ) { - for ( final Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) - continue; - - // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong - // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). - if ( a.length() - clipping == 0 ) - return clipping - (allowFullClip ? 
0 : 1); - - if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) { - stillClipping = false; - } - else if ( ref.length == clipping ) { - if ( allowFullClip ) - stillClipping = false; - else - return -1; - } - else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { - stillClipping = false; - } - } - if ( stillClipping ) - clipping++; - } - - return clipping; - } - /** * A simple but common wrapper for matching VariantContext objects using JEXL expressions */ @@ -744,12 +292,6 @@ public class VariantContextUtils { return new JEXLMap(exps,vc,g); } - public static double computeHardyWeinbergPvalue(VariantContext vc) { - if ( vc.getCalledChrCount() == 0 ) - return 0.0; - return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); - } - /** * Returns a newly allocated VC that is the same as VC, but without genotypes * @param vc variant context @@ -814,317 +356,6 @@ public class VariantContextUtils { return builder.genotypes(genotypes).attributes(attributes); } - public enum GenotypeMergeType { - /** - * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. - */ - UNIQUIFY, - /** - * Take genotypes in priority order (see the priority argument). - */ - PRIORITIZE, - /** - * Take the genotypes in any order. - */ - UNSORTED, - /** - * Require that all samples/genotypes be unique between all inputs. - */ - REQUIRE_UNIQUE - } - - public enum FilteredRecordMergeType { - /** - * Union - leaves the record if any record is unfiltered. - */ - KEEP_IF_ANY_UNFILTERED, - /** - * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. - */ - KEEP_IF_ALL_UNFILTERED, - /** - * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. 
- */ - KEEP_UNCONDITIONAL - } - - public enum MultipleAllelesMergeType { - /** - * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. - */ - BY_TYPE, - /** - * Merge all allele types at the same start position into the same VCF record. - */ - MIX_TYPES - } - - /** - * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. - * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with - * the sample name - * - * @param unsortedVCs collection of unsorted VCs - * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs - * @param filteredRecordMergeType merge type for filtered records - * @param genotypeMergeOptions merge option for genotypes - * @param annotateOrigin should we annotate the set it came from? - * @param printMessages should we print messages? - * @param setKey the key name of the set - * @param filteredAreUncalled are filtered records uncalled? - * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @return new VariantContext representing the merge of unsortedVCs - */ - public static VariantContext simpleMerge(final Collection unsortedVCs, - final List priorityListOfVCs, - final FilteredRecordMergeType filteredRecordMergeType, - final GenotypeMergeType genotypeMergeOptions, - final boolean annotateOrigin, - final boolean printMessages, - final String setKey, - final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { - int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); - return simpleMerge(unsortedVCs,priorityListOfVCs,originalNumOfVCs,filteredRecordMergeType,genotypeMergeOptions,annotateOrigin,printMessages,setKey,filteredAreUncalled,mergeInfoWithMaxAC); - } - - /** - * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. 
- * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with - * the sample name. - * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use - * SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge. - * - * @param unsortedVCs collection of unsorted VCs - * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs - * @param filteredRecordMergeType merge type for filtered records - * @param genotypeMergeOptions merge option for genotypes - * @param annotateOrigin should we annotate the set it came from? - * @param printMessages should we print messages? - * @param setKey the key name of the set - * @param filteredAreUncalled are filtered records uncalled? - * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @return new VariantContext representing the merge of unsortedVCs - */ - public static VariantContext simpleMerge(final Collection unsortedVCs, - final List priorityListOfVCs, - final int originalNumOfVCs, - final FilteredRecordMergeType filteredRecordMergeType, - final GenotypeMergeType genotypeMergeOptions, - final boolean annotateOrigin, - final boolean printMessages, - final String setKey, - final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { - - if ( unsortedVCs == null || unsortedVCs.size() == 0 ) - return null; - - if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) - throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); - - if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) - throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); 
- - final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); - // Make sure all variant contexts are padded with reference base in case of indels if necessary - final List VCs = new ArrayList(); - - for (final VariantContext vc : preFilteredVCs) { - if ( ! filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(vc); - } - if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled - return null; - - // establish the baseline info from the first VC - final VariantContext first = VCs.get(0); - final String name = first.getSource(); - final Allele refAllele = determineReferenceAllele(VCs); - - final Set alleles = new LinkedHashSet(); - final Set filters = new HashSet(); - final Map attributes = new LinkedHashMap(); - final Set inconsistentAttributes = new HashSet(); - final Set variantSources = new HashSet(); // contains the set of sources we found in our set of VCs that are variant - final Set rsIDs = new LinkedHashSet(1); // most of the time there's one id - - VariantContext longestVC = first; - int depth = 0; - int maxAC = -1; - final Map attributesWithMaxAC = new LinkedHashMap(); - double log10PError = CommonInfo.NO_LOG10_PERROR; - VariantContext vcWithMaxAC = null; - GenotypesContext genotypes = GenotypesContext.create(); - - // counting the number of filtered and variant VCs - int nFiltered = 0; - - boolean remapped = false; - - // cycle through and add info from the other VCs, making sure the loc/reference matches - - for ( final VariantContext vc : VCs ) { - if ( longestVC.getStart() != vc.getStart() ) - throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); - - if ( getSize(vc) > getSize(longestVC) ) - longestVC = vc; // get the longest location - - nFiltered += vc.isFiltered() ? 
1 : 0; - if ( vc.isVariant() ) variantSources.add(vc.getSource()); - - AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); - remapped = remapped || alleleMapping.needsRemapping(); - - alleles.addAll(alleleMapping.values()); - - mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); - - // We always take the QUAL of the first VC with a non-MISSING qual for the combined value - if ( log10PError == CommonInfo.NO_LOG10_PERROR ) - log10PError = vc.getLog10PError(); - - filters.addAll(vc.getFilters()); - - // - // add attributes - // - // special case DP (add it up) and ID (just preserve it) - // - if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) - depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); - if ( vc.hasID() ) rsIDs.add(vc.getID()); - if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { - String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); - // lets see if the string contains a , separator - if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { - List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); - for (String alleleCount : alleleCountArray) { - final int ac = Integer.valueOf(alleleCount.trim()); - if (ac > maxAC) { - maxAC = ac; - vcWithMaxAC = vc; - } - } - } else { - final int ac = Integer.valueOf(rawAlleleCounts); - if (ac > maxAC) { - maxAC = ac; - vcWithMaxAC = vc; - } - } - } - - for (final Map.Entry p : vc.getAttributes().entrySet()) { - String key = p.getKey(); - // if we don't like the key already, don't go anywhere - if ( ! inconsistentAttributes.contains(key) ) { - final boolean alreadyFound = attributes.containsKey(key); - final Object boundValue = attributes.get(key); - final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); - - if ( alreadyFound && ! 
boundValue.equals(p.getValue()) && ! boundIsMissingValue ) { - // we found the value but we're inconsistent, put it in the exclude list - //System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, p.getValue()); - inconsistentAttributes.add(key); - attributes.remove(key); - } else if ( ! alreadyFound || boundIsMissingValue ) { // no value - //if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), p.getValue()); - attributes.put(key, p.getValue()); - } - } - } - } - - // if we have more alternate alleles in the merged VC than in one or more of the - // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD - for ( final VariantContext vc : VCs ) { - if (vc.alleles.size() == 1) - continue; - if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { - if ( GeneralUtils.DEBUG_MODE_ENABLED && ! genotypes.isEmpty() ) { - System.err.println(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. 
single=%s", - vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.alleles)); - } - genotypes = stripPLsAndAD(genotypes); - // this will remove stale AC,AF attributed from vc - calculateChromosomeCounts(vc, attributes, true); - break; - } - } - - // take the VC with the maxAC and pull the attributes into a modifiable map - if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { - attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); - } - - // if at least one record was unfiltered and we want a union, clear all of the filters - if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) - filters.clear(); - - - if ( annotateOrigin ) { // we care about where the call came from - String setValue; - if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // nothing was unfiltered - setValue = MERGE_INTERSECTION; - else if ( nFiltered == VCs.size() ) // everything was filtered out - setValue = MERGE_FILTER_IN_ALL; - else if ( variantSources.isEmpty() ) // everyone was reference - setValue = MERGE_REF_IN_ALL; - else { - final LinkedHashSet s = new LinkedHashSet(); - for ( final VariantContext vc : VCs ) - if ( vc.isVariant() ) - s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); - setValue = GeneralUtils.join("-", s); - } - - if ( setKey != null ) { - attributes.put(setKey, setValue); - if( mergeInfoWithMaxAC && vcWithMaxAC != null ) { - attributesWithMaxAC.put(setKey, setValue); - } - } - } - - if ( depth > 0 ) - attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - - final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : GeneralUtils.join(",", rsIDs); - - final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); - builder.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()); - builder.alleles(alleles); - builder.genotypes(genotypes); - builder.log10PError(log10PError); - builder.filters(filters.isEmpty() ? filters : new TreeSet(filters)); - builder.attributes(new TreeMap(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); - - // Trim the padded bases of all alleles if necessary - final VariantContext merged = builder.make(); - if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); - return merged; - } - - private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { - final Iterator it1 = alleleSet1.iterator(); - final Iterator it2 = alleleSet2.iterator(); - - while ( it1.hasNext() && it2.hasNext() ) { - final Allele a1 = it1.next(); - final Allele a2 = it2.next(); - if ( ! a1.equals(a2) ) - return true; - } - - // by this point, at least one of the iterators is empty. All of the elements - // we've compared are equal up until this point. But it's possible that the - // sets aren't the same size, which is indicated by the test below. 
If they - // are of the same size, though, the sets are compatible - return it1.hasNext() || it2.hasNext(); - } - public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { // if all alleles of vc1 are a contained in alleles of vc2, return true if (!vc1.getReference().equals(vc2.getReference())) @@ -1138,16 +369,6 @@ public class VariantContextUtils { return true; } - public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { - GenotypesContext newGs = GenotypesContext.create(genotypes.size()); - - for ( final Genotype g : genotypes ) { - newGs.add(removePLsAndAD(g)); - } - - return newGs; - } - public static Map> separateVariantContextsByType(Collection VCs) { HashMap> mappedVCs = new HashMap>(); for ( VariantContext vc : VCs ) { @@ -1196,26 +417,7 @@ public class VariantContextUtils { return mappedVCs; } - private static class AlleleMapper { - private VariantContext vc = null; - private Map map = null; - public AlleleMapper(VariantContext vc) { this.vc = vc; } - public AlleleMapper(Map map) { this.map = map; } - public boolean needsRemapping() { return this.map != null; } - public Collection values() { return map != null ? map.values() : vc.getAlleles(); } - public Allele remap(Allele a) { return map != null && map.containsKey(a) ? 
map.get(a) : a; } - - public List remap(List as) { - List newAs = new ArrayList(); - for ( Allele a : as ) { - //System.out.printf(" Remapping %s => %s%n", a, remap(a)); - newAs.add(remap(a)); - } - return newAs; - } - } - -// TODO: remove that after testing + // TODO: remove that after testing // static private void verifyUniqueSampleNames(Collection unsortedVCs) { // Set names = new HashSet(); // for ( VariantContext vc : unsortedVCs ) { @@ -1230,117 +432,6 @@ public class VariantContextUtils { // } - static private Allele determineReferenceAllele(List VCs) { - Allele ref = null; - - for ( VariantContext vc : VCs ) { - Allele myRef = vc.getReference(); - if ( ref == null || ref.length() < myRef.length() ) - ref = myRef; - else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) - throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); - } - - return ref; - } - - static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set allAlleles) { - if ( refAllele.equals(vc.getReference()) ) - return new AlleleMapper(vc); - else { - // we really need to do some work. The refAllele is the longest reference allele seen at this - // start site. 
So imagine it is: - // - // refAllele: ACGTGA - // myRef: ACGT - // myAlt: A - // - // We need to remap all of the alleles in vc to include the extra GA so that - // myRef => refAllele and myAlt => AGA - // - - Allele myRef = vc.getReference(); - if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); - byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); - -// System.out.printf("Remapping allele at %s%n", vc); -// System.out.printf("ref %s%n", refAllele); -// System.out.printf("myref %s%n", myRef ); -// System.out.printf("extrabases %s%n", new String(extraBases)); - - Map map = new HashMap(); - for ( Allele a : vc.getAlleles() ) { - if ( a.isReference() ) - map.put(a, refAllele); - else { - Allele extended = Allele.extend(a, extraBases); - for ( Allele b : allAlleles ) - if ( extended.equals(b) ) - extended = b; -// System.out.printf(" Extending %s => %s%n", a, extended); - map.put(a, extended); - } - } - - // debugging -// System.out.printf("mapping %s%n", map); - - return new AlleleMapper(map); - } - } - - static class CompareByPriority implements Comparator, Serializable { - List priorityListOfVCs; - public CompareByPriority(List priorityListOfVCs) { - this.priorityListOfVCs = priorityListOfVCs; - } - - private int getIndex(VariantContext vc) { - int i = priorityListOfVCs.indexOf(vc.getSource()); - if ( i == -1 ) throw new IllegalArgumentException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); - return i; - } - - public int compare(VariantContext vc1, VariantContext vc2) { - return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); - } - } - - public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { - if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) - throw new 
IllegalArgumentException("Cannot merge calls by priority with a null priority list"); - - if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) - return new ArrayList(unsortedVCs); - else { - ArrayList sorted = new ArrayList(unsortedVCs); - Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); - return sorted; - } - } - - private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { - //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE - for ( Genotype g : oneVC.getGenotypes() ) { - String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); - if ( ! mergedGenotypes.containsSample(name) ) { - // only add if the name is new - Genotype newG = g; - - if ( uniqifySamples || alleleMapping.needsRemapping() ) { - final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); - newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); - } - - mergedGenotypes.add(newG); - } - } - } - - public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) { - return uniqify ? sampleName + "." + trackName : sampleName; - } - /** * Returns a context identical to this with the REF and ALT alleles reverse complemented. 
* @@ -1460,33 +551,4 @@ public class VariantContextUtils { } } - public static boolean requiresPaddingBase(final List alleles) { - - // see whether one of the alleles would be null if trimmed through - - for ( final String allele : alleles ) { - if ( allele.isEmpty() ) - return true; - } - - int clipping = 0; - Character currentBase = null; - - while ( true ) { - for ( final String allele : alleles ) { - if ( allele.length() - clipping == 0 ) - return true; - - char myBase = allele.charAt(clipping); - if ( currentBase == null ) - currentBase = myBase; - else if ( currentBase != myBase ) - return false; - } - - clipping++; - currentBase = null; - } - } - } diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/variant/vcf/VCFUtils.java index db2d47609..f61761652 100644 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/variant/vcf/VCFUtils.java @@ -28,15 +28,9 @@ package org.broadinstitute.variant.vcf; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.commons.io.FilenameUtils; -import org.broad.tribble.FeatureCodecHeader; -import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.utils.Pair; -import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; import java.util.*; public class VCFUtils { @@ -106,21 +100,6 @@ public class VCFUtils { return new HashSet(map.values()); } - public static String rsIDOfFirstRealVariant(List VCs, VariantContext.Type type) { - if ( VCs == null ) - return null; - - String rsID = null; - for ( VariantContext vc : VCs ) { - if ( vc.getType() == type ) { - rsID = vc.getID(); - break; - } - } - - return rsID; - } - /** * Add / replace the contig header lines in the VCFHeader with the in the 
reference file and master reference dictionary * @@ -198,35 +177,6 @@ public class VCFUtils { return assembly; } - /** - * Read all of the VCF records from source into memory, returning the header and the VariantContexts - * - * @param source the file to read, must be in VCF4 format - * @return - * @throws java.io.IOException - */ - public static Pair> readVCF(final File source) throws IOException { - // read in the features - final List vcs = new ArrayList(); - final VCFCodec codec = new VCFCodec(); - PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); - FeatureCodecHeader header = codec.readHeader(pbs); - pbs.close(); - - pbs = new PositionalBufferedStream(new FileInputStream(source)); - pbs.skip(header.getHeaderEnd()); - - final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); - - while ( ! pbs.isDone() ) { - final VariantContext vc = codec.decode(pbs); - if ( vc != null ) - vcs.add(vc); - } - - return new Pair>(vcfHeader, vcs); - } - /** Only displays a warning if warnings are enabled and an identical warning hasn't been already issued */ private static final class HeaderConflictWarner { boolean emitWarnings; diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 6c0a18f1d..eec0f653a 100644 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -35,7 +35,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.variant.bcf2.BCF2Utils; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.vcf.VCFCodec; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; diff 
--git a/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java index a56045165..9d4c562c7 100644 --- a/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.variant.utils.Pair; +import org.broadinstitute.sting.utils.collections.Pair; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java index 35fa47ede..eb2eebd36 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -36,11 +36,9 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.variant.utils.Pair; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextTestProvider; import org.broadinstitute.variant.vcf.VCFCodec; -import org.broadinstitute.variant.vcf.VCFHeader; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -252,13 +250,13 @@ public class BandPassActivityProfileUnitTest extends BaseTest { final File file = new File(path); final VCFCodec codec = new VCFCodec(); - final Pair> reader = VariantContextTestProvider.readAllVCs(file, codec); + final VariantContextTestProvider.VariantContextContainer reader = 
VariantContextTestProvider.readAllVCs(file, codec); final List incRegions = new ArrayList(); final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser); final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser); int pos = start; - for ( final VariantContext vc : reader.getSecond() ) { + for ( final VariantContext vc : reader.getVCs() ) { if ( vc == null ) continue; while ( pos < vc.getStart() ) { final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos); diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java similarity index 79% rename from public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 9b24f64e7..6ff052bdc 100644 --- a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -1,6 +1,6 @@ /* * Copyright (c) 2012 The Broad Institute -* +* * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without @@ -9,10 +9,10 @@ * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: -* +* * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. -* +* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND @@ -23,33 +23,25 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.variant.variantcontext; +package org.broadinstitute.sting.utils.variant; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.variant.VariantBaseTest; -import org.broadinstitute.variant.utils.GeneralUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.variant.variantcontext.*; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; -import java.io.FileNotFoundException; import java.util.*; -public class VariantContextUtilsUnitTest extends VariantBaseTest { +public class GATKVariantContextUtilsUnitTest extends BaseTest { + Allele Aref, T, C, G, Cref, ATC, ATCATC; @BeforeSuite public void setup() { - final File referenceFile = new File(b37KGReference); - try { - IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(referenceFile); - } - catch(FileNotFoundException ex) { - throw new RuntimeException(referenceFile.getAbsolutePath(),ex); - } - // alleles Aref = Allele.create("A", true); Cref = Allele.create("C", true); @@ -186,10 +178,10 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { final List priority = vcs2priority(inputs); - final VariantContext merged = VariantContextUtils.simpleMerge( + final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, priority, - VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); Assert.assertEquals(merged.getAlleles(), cfg.expected); } @@ -244,10 +236,10 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { 
inputs.add(new VariantContextBuilder(snpVC1).id(id).make()); } - final VariantContext merged = VariantContextUtils.simpleMerge( + final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, null, - VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); Assert.assertEquals(merged.getID(), cfg.expected); } @@ -261,14 +253,14 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { List inputs; VariantContext expected; String setExpected; - VariantContextUtils.FilteredRecordMergeType type; + GATKVariantContextUtils.FilteredRecordMergeType type; private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, String setExpected) { - this(name, input1, input2, expected, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, setExpected); + this(name, input1, input2, expected, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, setExpected); } - private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, VariantContextUtils.FilteredRecordMergeType type, String setExpected) { + private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, GATKVariantContextUtils.FilteredRecordMergeType type, String setExpected) { super(MergeFilteredTest.class, name); LinkedList all = new LinkedList(Arrays.asList(input1, input2)); this.expected = expected; @@ -288,66 +280,66 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), makeVC("3", Arrays.asList(Aref, T), 
VariantContext.PASSES_FILTERS), - VariantContextUtils.MERGE_INTERSECTION); + GATKVariantContextUtils.MERGE_INTERSECTION); new MergeFilteredTest("noFilters", makeVC("1", Arrays.asList(Aref, T), "."), makeVC("2", Arrays.asList(Aref, T), "."), makeVC("3", Arrays.asList(Aref, T), "."), - VariantContextUtils.MERGE_INTERSECTION); + GATKVariantContextUtils.MERGE_INTERSECTION); new MergeFilteredTest("oneFiltered", makeVC("1", Arrays.asList(Aref, T), "."), makeVC("2", Arrays.asList(Aref, T), "FAIL"), makeVC("3", Arrays.asList(Aref, T), "."), - String.format("1-%s2", VariantContextUtils.MERGE_FILTER_PREFIX)); + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); new MergeFilteredTest("onePassOneFail", makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), makeVC("2", Arrays.asList(Aref, T), "FAIL"), makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - String.format("1-%s2", VariantContextUtils.MERGE_FILTER_PREFIX)); + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); new MergeFilteredTest("AllFiltered", makeVC("1", Arrays.asList(Aref, T), "FAIL"), makeVC("2", Arrays.asList(Aref, T), "FAIL"), makeVC("3", Arrays.asList(Aref, T), "FAIL"), - VariantContextUtils.MERGE_FILTER_IN_ALL); + GATKVariantContextUtils.MERGE_FILTER_IN_ALL); // test ALL vs. 
ANY new MergeFilteredTest("FailOneUnfiltered", makeVC("1", Arrays.asList(Aref, T), "FAIL"), makeVC("2", Arrays.asList(Aref, T), "."), makeVC("3", Arrays.asList(Aref, T), "."), - VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - String.format("%s1-2", VariantContextUtils.MERGE_FILTER_PREFIX)); + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); new MergeFilteredTest("OneFailAllUnfilteredArg", makeVC("1", Arrays.asList(Aref, T), "FAIL"), makeVC("2", Arrays.asList(Aref, T), "."), makeVC("3", Arrays.asList(Aref, T), "FAIL"), - VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ALL_UNFILTERED, - String.format("%s1-2", VariantContextUtils.MERGE_FILTER_PREFIX)); + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ALL_UNFILTERED, + String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); // test excluding allele in filtered record new MergeFilteredTest("DontIncludeAlleleOfFilteredRecords", makeVC("1", Arrays.asList(Aref, T), "."), makeVC("2", Arrays.asList(Aref, T), "FAIL"), makeVC("3", Arrays.asList(Aref, T), "."), - String.format("1-%s2", VariantContextUtils.MERGE_FILTER_PREFIX)); + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); // promotion of site from unfiltered to PASSES new MergeFilteredTest("UnfilteredPlusPassIsPass", makeVC("1", Arrays.asList(Aref, T), "."), makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - VariantContextUtils.MERGE_INTERSECTION); + GATKVariantContextUtils.MERGE_INTERSECTION); new MergeFilteredTest("RefInAll", makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), makeVC("2", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), makeVC("3", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - VariantContextUtils.MERGE_REF_IN_ALL); + GATKVariantContextUtils.MERGE_REF_IN_ALL); new 
MergeFilteredTest("RefInOne", makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), @@ -361,8 +353,8 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { @Test(dataProvider = "mergeFiltered") public void testMergeFiltered(MergeFilteredTest cfg) { final List priority = vcs2priority(cfg.inputs); - final VariantContext merged = VariantContextUtils.simpleMerge( - cfg.inputs, priority, cfg.type, VariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -487,9 +479,9 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { @Test(dataProvider = "mergeGenotypes") public void testMergeGenotypes(MergeGenotypesTest cfg) { - final VariantContext merged = VariantContextUtils.simpleMerge( - cfg.inputs, cfg.priority, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - VariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -528,9 +520,9 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); - final VariantContext merged = VariantContextUtils.simpleMerge( - Arrays.asList(vc1, vc2), null, 
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - VariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); // test genotypes Assert.assertEquals(merged.getSampleNames(), new HashSet(Arrays.asList("s1.1", "s1.2"))); @@ -561,12 +553,12 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); - final VariantContext merged = VariantContextUtils.simpleMerge( - Arrays.asList(vc1, vc2), priority, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - VariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); if ( annotate ) - Assert.assertEquals(merged.getAttribute(set), VariantContextUtils.MERGE_INTERSECTION); + Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); else Assert.assertFalse(merged.hasAttribute(set)); } @@ -583,78 +575,6 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { return priority; } - - // -------------------------------------------------------------------------------- - // - // Test repeats - // - // -------------------------------------------------------------------------------- - - private class RepeatDetectorTest extends TestDataProvider { - String ref; - boolean isTrueRepeat; - 
VariantContext vc; - - private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) { - super(RepeatDetectorTest.class); - this.isTrueRepeat = isTrueRepeat; - this.ref = ref; - - List alleles = new LinkedList(); - final Allele refAllele = Allele.create(refAlleleString, true); - alleles.add(refAllele); - for ( final String altString: altAlleleStrings) { - final Allele alt = Allele.create(altString, false); - alleles.add(alt); - } - - VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, refAllele.length(), alleles); - this.vc = builder.make(); - } - - public String toString() { - return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc); - } - } - - @DataProvider(name = "RepeatDetectorTest") - public Object[][] makeRepeatDetectorTest() { - new RepeatDetectorTest(true, "NAAC", "N", "NA"); - new RepeatDetectorTest(true, "NAAC", "NA", "N"); - new RepeatDetectorTest(false, "NAAC", "NAA", "N"); - new RepeatDetectorTest(false, "NAAC", "N", "NC"); - new RepeatDetectorTest(false, "AAC", "A", "C"); - - // running out of ref bases => false - new RepeatDetectorTest(false, "NAAC", "N", "NCAGTA"); - - // complex repeats - new RepeatDetectorTest(true, "NATATATC", "N", "NAT"); - new RepeatDetectorTest(true, "NATATATC", "N", "NATA"); - new RepeatDetectorTest(true, "NATATATC", "N", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N"); - new RepeatDetectorTest(false, "NATATATC", "NATA", "N"); - new RepeatDetectorTest(false, "NATATATC", "NATAT", "N"); - - // multi-allelic - new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATA"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATA"); // two As - new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NATC"); // false - new RepeatDetectorTest(false, "NATATATC", "NAT", 
"N", "NCC"); // false - new RepeatDetectorTest(false, "NATATATC", "NAT", "NATAT", "NCC"); // false - - return RepeatDetectorTest.getTests(RepeatDetectorTest.class); - } - - @Test(dataProvider = "RepeatDetectorTest") - public void testRepeatDetectorTest(RepeatDetectorTest cfg) { - - // test alleles are equal - Assert.assertEquals(VariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat); - } - // -------------------------------------------------------------------------------- // // basic allele clipping test @@ -698,10 +618,11 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { @Test(dataProvider = "ReverseClippingPositionTestProvider") public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { - int result = VariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false); + int result = GATKVariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false); Assert.assertEquals(result, cfg.expectedClip); } + // -------------------------------------------------------------------------------- // // test splitting into bi-allelics @@ -776,7 +697,7 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { @Test(dataProvider = "SplitBiallelics") public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List expectedBiallelics) { - final List biallelics = VariantContextUtils.splitVariantContextToBiallelics(vc); + final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); Assert.assertEquals(biallelics.size(), expectedBiallelics.size()); for ( int i = 0; i < biallelics.size(); i++ ) { final VariantContext actual = biallelics.get(i); @@ -790,14 +711,14 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { final List genotypes = new ArrayList(); int sampleI = 0; - for ( final List alleles : GeneralUtils.makePermutations(vc.getAlleles(), 2, true) ) { + for ( final List alleles : 
Utils.makePermutations(vc.getAlleles(), 2, true) ) { genotypes.add(GenotypeBuilder.create("sample" + sampleI++, alleles)); } genotypes.add(GenotypeBuilder.createMissing("missing", 2)); final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make(); - final List biallelics = VariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes); + final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes); for ( int i = 0; i < biallelics.size(); i++ ) { final VariantContext actual = biallelics.get(i); Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples @@ -812,4 +733,159 @@ public class VariantContextUtilsUnitTest extends VariantBaseTest { } } } -} \ No newline at end of file + + + // -------------------------------------------------------------------------------- + // + // Test repeats + // + // -------------------------------------------------------------------------------- + + private class RepeatDetectorTest extends TestDataProvider { + String ref; + boolean isTrueRepeat; + VariantContext vc; + + private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... 
altAlleleStrings) { + super(RepeatDetectorTest.class); + this.isTrueRepeat = isTrueRepeat; + this.ref = ref; + + List alleles = new LinkedList(); + final Allele refAllele = Allele.create(refAlleleString, true); + alleles.add(refAllele); + for ( final String altString: altAlleleStrings) { + final Allele alt = Allele.create(altString, false); + alleles.add(alt); + } + + VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, refAllele.length(), alleles); + this.vc = builder.make(); + } + + public String toString() { + return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc); + } + } + + @DataProvider(name = "RepeatDetectorTest") + public Object[][] makeRepeatDetectorTest() { + new RepeatDetectorTest(true, "NAAC", "N", "NA"); + new RepeatDetectorTest(true, "NAAC", "NA", "N"); + new RepeatDetectorTest(false, "NAAC", "NAA", "N"); + new RepeatDetectorTest(false, "NAAC", "N", "NC"); + new RepeatDetectorTest(false, "AAC", "A", "C"); + + // running out of ref bases => false + new RepeatDetectorTest(false, "NAAC", "N", "NCAGTA"); + + // complex repeats + new RepeatDetectorTest(true, "NATATATC", "N", "NAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATA", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATAT", "N"); + + // multi-allelic + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATA"); // two As + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NATC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NCC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "NATAT", "NCC"); // false + + return 
RepeatDetectorTest.getTests(RepeatDetectorTest.class); + } + + @Test(dataProvider = "RepeatDetectorTest") + public void testRepeatDetectorTest(RepeatDetectorTest cfg) { + + // test alleles are equal + Assert.assertEquals(GATKVariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat); + } + + @Test + public void testRepeatAllele() { + Allele nullR = Allele.create("A", true); + Allele nullA = Allele.create("A", false); + Allele atc = Allele.create("AATC", false); + Allele atcatc = Allele.create("AATCATC", false); + Allele ccccR = Allele.create("ACCCC", true); + Allele cc = Allele.create("ACC", false); + Allele cccccc = Allele.create("ACCCCCC", false); + Allele gagaR = Allele.create("AGAGA", true); + Allele gagagaga = Allele.create("AGAGAGAGA", false); + + // - / ATC [ref] from 20-22 + String delLoc = "chr1"; + int delLocStart = 20; + int delLocStop = 22; + + // - [ref] / ATC from 20-20 + String insLoc = "chr1"; + int insLocStart = 20; + int insLocStop = 20; + + Pair,byte[]> result; + byte[] refBytes = "TATCATCATCGGA".getBytes(); + + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("T".getBytes(), "T".getBytes(), true),1); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2); + + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACAC".getBytes()),7); + 
Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACA".getBytes()),2); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CATGCATG".getBytes()),4); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AATAATA".getBytes()),7); + + + // A*,ATC, context = ATC ATC ATC : (ATC)3 -> (ATC)4 + VariantContext vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(nullR,atc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],3); + Assert.assertEquals(result.getFirst().toArray()[1],4); + Assert.assertEquals(result.getSecond().length,3); + + // ATC*,A,ATCATC + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+3, Arrays.asList(Allele.create("AATC", true),nullA,atcatc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],3); + Assert.assertEquals(result.getFirst().toArray()[1],2); + Assert.assertEquals(result.getFirst().toArray()[2],4); + Assert.assertEquals(result.getSecond().length,3); + + // simple non-tandem deletion: CCCC*, - + refBytes = "TCCCCCCCCATG".getBytes(); + vc = new VariantContextBuilder("foo", delLoc, 10, 14, Arrays.asList(ccccR,nullA)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],8); + Assert.assertEquals(result.getFirst().toArray()[1],4); + Assert.assertEquals(result.getSecond().length,1); + + // CCCC*,CC,-,CCCCCC, context = CCC: (C)7 -> (C)5,(C)3,(C)9 + refBytes = "TCCCCCCCAGAGAGAG".getBytes(); + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(ccccR,cc, nullA,cccccc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],7); + Assert.assertEquals(result.getFirst().toArray()[1],5); + 
Assert.assertEquals(result.getFirst().toArray()[2],3); + Assert.assertEquals(result.getFirst().toArray()[3],9); + Assert.assertEquals(result.getSecond().length,1); + + // GAGA*,-,GAGAGAGA + refBytes = "TGAGAGAGAGATTT".getBytes(); + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(gagaR, nullA,gagagaga)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],5); + Assert.assertEquals(result.getFirst().toArray()[1],3); + Assert.assertEquals(result.getFirst().toArray()[2],7); + Assert.assertEquals(result.getSecond().length,2); + + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java index 8a6d2138e..51a47d86d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java @@ -231,9 +231,9 @@ public class VariantContextBenchmark extends SimpleBenchmark { toMerge.add(new VariantContextBuilder(vc).genotypes(gc).make()); } - VariantContextUtils.simpleMerge(toMerge, null, - VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - VariantContextUtils.GenotypeMergeType.UNSORTED, + GATKVariantContextUtils.simpleMerge(toMerge, null, + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.UNSORTED, true, false, "set", false, true); } }; diff --git a/public/java/test/org/broadinstitute/variant/VariantBaseTest.java b/public/java/test/org/broadinstitute/variant/VariantBaseTest.java index be70b22e6..0d7d5a82e 100644 --- a/public/java/test/org/broadinstitute/variant/VariantBaseTest.java +++ b/public/java/test/org/broadinstitute/variant/VariantBaseTest.java @@ -1,3 +1,28 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission 
is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + package org.broadinstitute.variant; import org.testng.Assert; diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextTestProvider.java index d739e4aa5..4c948e8e2 100644 --- a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextTestProvider.java @@ -32,7 +32,6 @@ import org.broadinstitute.variant.VariantBaseTest; import org.broadinstitute.variant.bcf2.BCF2Codec; import org.broadinstitute.variant.utils.GeneralUtils; import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.variant.utils.Pair; import org.broadinstitute.variant.variantcontext.writer.Options; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import org.testng.Assert; @@ -74,6 +73,24 @@ public class VariantContextTestProvider { } } + public static class VariantContextContainer { + private VCFHeader header; + private Iterable vcs; + + public VariantContextContainer( VCFHeader header, Iterable vcs ) { + this.header = header; + this.vcs = vcs; + } + + public VCFHeader getHeader() { + return header; + } + + public Iterable getVCs() { + return vcs; + } + } + public abstract static class VariantContextIOTest { public String toString() { return "VariantContextIOTest:" + getExtension(); @@ -150,15 +167,15 @@ public class VariantContextTestProvider { if ( ENABLE_SOURCE_VCF_TESTS ) { for ( final File file : testSourceVCFs ) { VCFCodec codec = new VCFCodec(); - Pair> x = readAllVCs( file, codec ); + VariantContextContainer x = readAllVCs( file, codec ); List fullyDecoded = new ArrayList(); - for ( final VariantContext raw : x.getSecond() ) { + for ( final VariantContext raw : x.getVCs() ) { if ( raw != null ) - fullyDecoded.add(raw.fullyDecode(x.getFirst(), false)); + fullyDecoded.add(raw.fullyDecode(x.getHeader(), false)); } - 
TEST_DATAs.add(new VariantContextTestData(x.getFirst(), fullyDecoded)); + TEST_DATAs.add(new VariantContextTestData(x.getHeader(), fullyDecoded)); } } } @@ -616,8 +633,8 @@ public class VariantContextTestProvider { writeVCsToFile(writer, header, data.vcs); // ensure writing of expected == actual - final Pair> p = readAllVCs(tmpFile, tester.makeCodec()); - final Iterable actual = p.getSecond(); + final VariantContextContainer p = readAllVCs(tmpFile, tester.makeCodec()); + final Iterable actual = p.getVCs(); int i = 0; for ( final VariantContext readVC : actual ) { @@ -655,14 +672,14 @@ public class VariantContextTestProvider { writeVCsToFile(writer, header, vcs); // ensure writing of expected == actual - final Pair> p = readAllVCs(tmpFile, tester.makeCodec()); - final Iterable actual = p.getSecond(); + final VariantContextContainer p = readAllVCs(tmpFile, tester.makeCodec()); + final Iterable actual = p.getVCs(); assertEquals(actual, expected); if ( recurse ) { // if we are doing a recursive test, grab a fresh iterator over the written values - final Iterable read = readAllVCs(tmpFile, tester.makeCodec()).getSecond(); - testReaderWriter(tester, p.getFirst(), expected, read, false); + final Iterable read = readAllVCs(tmpFile, tester.makeCodec()).getVCs(); + testReaderWriter(tester, p.getHeader(), expected, read, false); } } @@ -683,7 +700,7 @@ public class VariantContextTestProvider { * @return * @throws IOException */ - public final static Pair> readAllVCs( final File source, final FeatureCodec codec ) throws IOException { + public final static VariantContextContainer readAllVCs( final File source, final FeatureCodec codec ) throws IOException { // read in the features PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); FeatureCodecHeader header = codec.readHeader(pbs); @@ -693,7 +710,7 @@ public class VariantContextTestProvider { pbs.skip(header.getHeaderEnd()); final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); - 
return new Pair>(vcfHeader, new VCIterable(pbs, codec, vcfHeader)); + return new VariantContextContainer(vcfHeader, new VCIterable(pbs, codec, vcfHeader)); } public static class VCIterable implements Iterable, Iterator { @@ -738,10 +755,10 @@ public class VariantContextTestProvider { } public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { - final Pair> vcfData = readAllVCs(vcfFile, new VCFCodec()); - final Pair> bcfData = readAllVCs(bcfFile, new BCF2Codec()); - assertEquals(bcfData.getFirst(), vcfData.getFirst()); - assertEquals(bcfData.getSecond(), vcfData.getSecond()); + final VariantContextContainer vcfData = readAllVCs(vcfFile, new VCFCodec()); + final VariantContextContainer bcfData = readAllVCs(bcfFile, new BCF2Codec()); + assertEquals(bcfData.getHeader(), vcfData.getHeader()); + assertEquals(bcfData.getVCs(), vcfData.getVCs()); } public static void assertEquals(final Iterable actual, final Iterable expected) { diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUnitTest.java index 25e9878ae..103c8ab3b 100644 --- a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUnitTest.java @@ -28,9 +28,7 @@ package org.broadinstitute.variant.variantcontext; // the imports for unit testing. 
- import org.broadinstitute.variant.VariantBaseTest; -import org.broadinstitute.variant.utils.Pair; import org.testng.annotations.BeforeSuite; import org.testng.annotations.BeforeMethod; import org.testng.annotations.DataProvider; @@ -484,78 +482,6 @@ public class VariantContextUnitTest extends VariantBaseTest { Assert.assertNotNull(vc.getFiltersMaybeNull()); } - @Test - public void testRepeatAllele() { - Allele nullR = Allele.create("A", true); - Allele nullA = Allele.create("A", false); - Allele atc = Allele.create("AATC", false); - Allele atcatc = Allele.create("AATCATC", false); - Allele ccccR = Allele.create("ACCCC", true); - Allele cc = Allele.create("ACC", false); - Allele cccccc = Allele.create("ACCCCCC", false); - Allele gagaR = Allele.create("AGAGA", true); - Allele gagagaga = Allele.create("AGAGAGAGA", false); - - Pair,byte[]> result; - byte[] refBytes = "TATCATCATCGGA".getBytes(); - - Assert.assertEquals(VariantContextUtils.findNumberofRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4); - Assert.assertEquals(VariantContextUtils.findNumberofRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0); - Assert.assertEquals(VariantContextUtils.findNumberofRepetitions("T".getBytes(), "T".getBytes(), true),1); - Assert.assertEquals(VariantContextUtils.findNumberofRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1); - Assert.assertEquals(VariantContextUtils.findNumberofRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2); - - Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3); - Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1); - Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CACACAC".getBytes()),7); - Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CACACA".getBytes()),2); - Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CATGCATG".getBytes()),4); - 
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("AATAATA".getBytes()),7); - - - // A*,ATC, context = ATC ATC ATC : (ATC)3 -> (ATC)4 - VariantContext vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(nullR,atc)).make(); - result = VariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],3); - Assert.assertEquals(result.getFirst().toArray()[1],4); - Assert.assertEquals(result.getSecond().length,3); - - // ATC*,A,ATCATC - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+3, Arrays.asList(Allele.create("AATC", true),nullA,atcatc)).make(); - result = VariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],3); - Assert.assertEquals(result.getFirst().toArray()[1],2); - Assert.assertEquals(result.getFirst().toArray()[2],4); - Assert.assertEquals(result.getSecond().length,3); - - // simple non-tandem deletion: CCCC*, - - refBytes = "TCCCCCCCCATG".getBytes(); - vc = new VariantContextBuilder("foo", delLoc, 10, 14, Arrays.asList(ccccR,nullA)).make(); - result = VariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],8); - Assert.assertEquals(result.getFirst().toArray()[1],4); - Assert.assertEquals(result.getSecond().length,1); - - // CCCC*,CC,-,CCCCCC, context = CCC: (C)7 -> (C)5,(C)3,(C)9 - refBytes = "TCCCCCCCAGAGAGAG".getBytes(); - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(ccccR,cc, nullA,cccccc)).make(); - result = VariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],7); - Assert.assertEquals(result.getFirst().toArray()[1],5); - Assert.assertEquals(result.getFirst().toArray()[2],3); - Assert.assertEquals(result.getFirst().toArray()[3],9); - Assert.assertEquals(result.getSecond().length,1); - - // GAGA*,-,GAGAGAGA - refBytes = 
"TGAGAGAGAGATTT".getBytes(); - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(gagaR, nullA,gagagaga)).make(); - result = VariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],5); - Assert.assertEquals(result.getFirst().toArray()[1],3); - Assert.assertEquals(result.getFirst().toArray()[2],7); - Assert.assertEquals(result.getSecond().length,2); - - } @Test public void testGetGenotypeCounts() { List alleles = Arrays.asList(Aref, T); From 29fd536c282ce97b64b43886b103d872ca2abd9d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 29 Jan 2013 17:27:53 -0500 Subject: [PATCH 06/13] Updating licenses manually Please check that your commit hook is properly pointing at ../../private/shell/pre-commit Conflicts: public/java/test/org/broadinstitute/variant/VariantBaseTest.java --- .../covariates/RepeatCovariate.java | 30 ++++----- .../covariates/RepeatUnitCovariate.java | 30 ++++----- .../reducereads/SlidingWindowUnitTest.java | 65 ++++++++++++------- .../RepeatCovariatesUnitTest.java | 31 ++++----- .../sting/utils/SWPairwiseAlignment.java | 50 +++++++------- .../variant/VariantBaseTest.java | 6 +- 6 files changed, 117 insertions(+), 95 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java index 546bd6ac8..b6b954c4a 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java @@ -1,45 +1,45 @@ /* * By downloading the PROGRAM you agree to the following terms of use: -* +* * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* +* * This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* +* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* +* * 1. DEFINITIONS * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* +* * 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. * 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. 
The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. * Copyright 2012 Broad Institute, Inc. * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* +* * 4. INDEMNIFICATION * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* +* * 5. NO REPRESENTATIONS OR WARRANTIES * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* +* * 6. ASSIGNMENT * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* +* * 7. MISCELLANEOUS * 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java index b32feb9a3..ef0d3fc62 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java @@ -1,45 +1,45 @@ /* * By downloading the PROGRAM you agree to the following terms of use: -* +* * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* +* * This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* +* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* +* * 1. 
DEFINITIONS * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* +* * 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. * Copyright 2012 Broad Institute, Inc. * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* +* * 4. 
INDEMNIFICATION * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* +* * 5. NO REPRESENTATIONS OR WARRANTIES * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. * IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* +* * 6. ASSIGNMENT * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* +* * 7. MISCELLANEOUS * 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. * 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java index cfb8c53b4..3d3c4ad24 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java @@ -1,26 +1,47 @@ /* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java index 7ded176bb..c0cb2d7b6 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java @@ -1,48 +1,49 @@ /* * By downloading the PROGRAM you agree to the following terms of use: -* +* * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* +* * This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* +* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* +* * 1. DEFINITIONS * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* +* * 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. * Copyright 2012 Broad Institute, Inc. * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* +* * 4. INDEMNIFICATION * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* +* * 5. NO REPRESENTATIONS OR WARRANTIES * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* +* * 6. ASSIGNMENT * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* +* * 7. MISCELLANEOUS * 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ + package org.broadinstitute.sting.utils.recalibration; import com.google.java.contract.Requires; diff --git a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java index e2edf7421..7bd937af9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java +++ b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java @@ -1,28 +1,28 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + package org.broadinstitute.sting.utils; import net.sf.samtools.Cigar; diff --git a/public/java/test/org/broadinstitute/variant/VariantBaseTest.java b/public/java/test/org/broadinstitute/variant/VariantBaseTest.java index 0d7d5a82e..6cec4d40b 100644 --- a/public/java/test/org/broadinstitute/variant/VariantBaseTest.java +++ b/public/java/test/org/broadinstitute/variant/VariantBaseTest.java @@ -1,6 +1,6 @@ /* * Copyright (c) 2012 The Broad Institute -* +* * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without @@ -9,10 +9,10 @@ * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: -* +* * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. -* +* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND From 6449c320b482132bf1b3c29aa744607f488af781 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Tue, 29 Jan 2013 20:30:36 -0500 Subject: [PATCH 07/13] Fix the CachingIndexedFastaSequenceFileUnitTest BaseUtils.convertIUPACtoN() no longer throws a UserException, since it's in org.broadinstitute.variant --- .../utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index 0c1b5b069..b65811103 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -252,7 +252,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { Assert.assertEquals(changingNs, preservingNs + 4); } - @Test(enabled = true, expectedExceptions = {UserException.class}) + @Test(enabled = true, expectedExceptions = {IllegalStateException.class}) public void testFailOnBadBase() throws FileNotFoundException, InterruptedException { final String testFasta = privateTestDir + "problematicFASTA.fasta"; final CachingIndexedFastaSequenceFile fasta = new CachingIndexedFastaSequenceFile(new File(testFasta)); From 3d9a83c7595a4519022611f128e1adc04e2901ce Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 29 Jan 2013 22:36:31 -0500 Subject: [PATCH 09/13] BaseCoverageDistributions should be 'by reference' otherwise we miss all the 0 coverage spots. --- .../walkers/diagnostics/targets/BaseCoverageDistribution.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java index a12008eb4..281c1c55d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java @@ -51,6 +51,8 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.walkers.By; +import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import java.io.PrintStream; @@ -62,6 +64,7 @@ import java.util.Map; * 
Date: 1/27/13 * Time: 11:16 AM */ +@By(DataSource.REFERENCE) public class BaseCoverageDistribution extends LocusWalker> { @Output(required = true) private PrintStream out; From 92c5635e196d3665e246782477634e4e4a6955bc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 28 Jan 2013 11:53:43 -0500 Subject: [PATCH 10/13] Cleanup, document, and unit test ActiveRegion -- All functions tested. In the testing / review I discovered several bugs in the ActiveRegion routines that manipulate reads. New version should be correct -- Enforce correct ordering of supporting states in constructor -- Enforce read ordering when adding reads to an active region in add -- Fix bug in HaplotypeCaller map with new updating read spans. Now get the full span before clipping down reads in map, so that variants are correctly placed w.r.t. the full reference sequence -- Encapsulate isActive field with an accessor function -- Make sure that all state lists are unmodifiable, and that the docs are clear about this -- ActiveRegion equalsExceptReads is for testing only, so make it package protected -- ActiveRegion.hardClipToRegion must resort reads as they can become out of order -- Previous version of HC clipped reads but, due to clipping, these reads could no longer overlap the active region. The old version of HC kept these reads, while the enforced contracts on the ActiveRegion detected this was a problem and those reads are removed. 
Has a minor impact on PLs and RankSumTest values -- Updating HaplotypeCaller MD5s to reflect changes to ActiveRegions read inclusion policy --- .../targets/FindCoveredIntervals.java | 2 +- .../haplotypecaller/HaplotypeCaller.java | 32 +- .../HaplotypeCallerIntegrationTest.java | 24 +- .../traversals/TraverseActiveRegions.java | 6 +- .../utils/activeregion/ActiveRegion.java | 343 +++++++++++++++--- .../sting/utils/sam/ArtificialSAMUtils.java | 13 + .../activeregion/ActiveRegionUnitTest.java | 223 ++++++++++++ .../activeregion/ActivityProfileUnitTest.java | 2 +- .../BandPassActivityProfileUnitTest.java | 4 +- 9 files changed, 569 insertions(+), 80 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java index 3712a8e51..ac028d860 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java @@ -85,7 +85,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker { @Override public GenomeLoc map(final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker tracker) { - if (activeRegion.isActive) + if (activeRegion.isActive()) return activeRegion.getLocation(); else return null; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 8b3eb9f1b..a27eaac8e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -463,15 +463,19 @@ public class HaplotypeCaller extends ActiveRegionWalker implem allelesToGenotype.removeAll( activeAllelesToGenotype ); } - if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do! + if( !activeRegion.isActive()) { return 0; } // Not active so nothing to do! if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails + + // note this operation must be performed before we clip the reads down, as this must correspond to the full reference region + final GenomeLoc fullSpanBeforeClipping = getPaddedLoc(activeRegion); + final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING); //int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion )); - final ArrayList haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype ); + final ArrayList haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, fullSpanBeforeClipping, MIN_PRUNE_FACTOR, activeAllelesToGenotype ); if( haplotypes.size() == 1 ) { return 1; } // only the reference 
haplotype remains so nothing else to do! activeRegion.hardClipToActiveRegion(); // only evaluate the parts of reads that are overlapping the active region @@ -495,7 +499,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem stratifiedReadMap, perSampleFilteredReadList, fullReferenceWithPadding, - getPaddedLoc(activeRegion), + fullSpanBeforeClipping, activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { @@ -505,9 +509,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if ( bamWriter != null ) { // write the haplotypes to the bam - final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion); for ( Haplotype haplotype : haplotypes ) - writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype)); + writeHaplotype(haplotype, fullSpanBeforeClipping, bestHaplotypes.contains(haplotype)); // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); @@ -519,7 +522,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedRefLoc.getStart()); + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), fullSpanBeforeClipping.getStart()); } } } @@ -559,7 +562,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } final ArrayList 
finalizedReadList = new ArrayList(); - final FragmentCollection fragmentCollection = FragmentUtils.create( ReadUtils.sortReadsByCoordinate(activeRegion.getReads()) ); + final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() ); activeRegion.clearReads(); // Join overlapping paired reads to create a single longer read @@ -571,17 +574,20 @@ public class HaplotypeCaller extends ActiveRegionWalker implem Collections.shuffle(finalizedReadList, GenomeAnalysisEngine.getRandomGenerator()); // Loop through the reads hard clipping the adaptor and low quality tails + final ArrayList readsToUse = new ArrayList(finalizedReadList.size()); for( final GATKSAMRecord myRead : finalizedReadList ) { final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // protect against INTERVALS with abnormally high coverage - // BUGBUG: remove when positional downsampler is hooked up to ART/HC - if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { - activeRegion.add(clippedRead); + // TODO BUGBUG: remove when positional downsampler is hooked up to ART/HC + if( activeRegion.readOverlapsRegion(clippedRead) && + clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { + readsToUse.add(clippedRead); } } } + activeRegion.addAll(ReadUtils.sortReadsByCoordinate(readsToUse)); } private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { @@ -596,9 +602,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } private GenomeLoc getPaddedLoc( final 
org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { - final int padLeft = Math.max(activeRegion.getReferenceLoc().getStart()-REFERENCE_PADDING, 1); - final int padRight = Math.min(activeRegion.getReferenceLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getReferenceLoc().getContig()).getSequenceLength()); - return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReferenceLoc().getContig(), padLeft, padRight); + final int padLeft = Math.max(activeRegion.getReadSpanLoc().getStart()-REFERENCE_PADDING, 1); + final int padRight = Math.min(activeRegion.getReadSpanLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getReadSpanLoc().getContig()).getSequenceLength()); + return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReadSpanLoc().getContig(), padLeft, padRight); } private HashMap> splitReadsBySample( final List reads ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 97b9ce746..d446da830 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "11290b619bc79b629cf29b8f428254ce"); + HCTest(CEUTRIO_BAM, "", "664a14590d0966e63d3aabff2d7bab0a"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "897abb2b4f98e9e460f373f9e0db5033"); + HCTest(NA12878_BAM, "", "111f3dc086a3cea1be9bd1ad6e1d18ed"); } @Test(enabled = false) @@ -84,7 +84,7 @@ public class 
HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "efc2cae94069a1d6ee5fdcc7afeaa0ed"); + "c70f753f7918a1c670ce4ed5c66de09e"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -96,13 +96,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "01f42c311fc3ce4f07ef86f8c01facfb"); + "b1d3070f0c49becf34101e480ab6c589"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "4c117c84d1abeade1dee3f7b52a4a585"); + "20eba2e54266f6aebf35b7b7b7e754e3"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "939847eb7bbafc798916acffdb1b5697"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f9805488d85e59e1ae002d0d32d7011d"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -124,7 +124,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "25806874242973f00fb6f2a320ed4d9c"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "4544a2916f46f58b32db8008776b91a3"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void 
testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "c50b06d56cf3d0ef53e73a4973207949"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "f3984a91e7562494c2a7e41fd05a6734"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,14 +146,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ae2470e294d99ff2b825281b84730c72")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("3e9e025c539be6c5e0d0f2e5d8e86a62")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("6f18ae64bf466476d780a083dcb5fc43")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("34129e6c6310ef4eeeeb59b0e7ac0464")); executeTest("HCTestStructuralIndels: ", spec); } @@ -175,7 +175,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("ecdb8e30ec5dd91efc179ab6732499f9")); + Arrays.asList("5f4c07aaf1d2d34cccce43196a5fbd71")); executeTest("HC calling on a ReducedRead 
BAM", spec); } @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("36a90309dde1a325c274388e302ffaa5")); + Arrays.asList("6ead001b1f8e7cb433fd335f78fde5f0")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 6933b45a7..5d2aa6be3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -460,7 +460,7 @@ public class TraverseActiveRegions extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive() ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReadSpanLoc()); } if ( LOG_READ_CARRYING ) logger.info(String.format("Processing region %20s span=%3d active?=%5b with %4d reads. 
Overall max reads carried is %s", - activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive, activeRegion.size(), maxReadsInMemory)); + activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive(), activeRegion.size(), maxReadsInMemory)); final M x = walker.map(activeRegion, null); return walker.reduce( x, sum ); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 194f61933..13add5e7d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -25,37 +25,105 @@ package org.broadinstitute.sting.utils.activeregion; -import com.google.java.contract.Requires; +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.List; /** - * Created by IntelliJ IDEA. + * Represents a single active region created by the Active Region Traversal for processing + * + * An active region is a single contiguous span of bases on the genome that should be operated + * on as a single unit for the active region traversal. The action may contains a list of + * reads that overlap the region (may because there may be no reads in the region). 
The region + * is tagged as being either active or inactive, depending on the probabilities provided by + * the isActiveProb results from the ART walker. Each region carries with it the + * exact span of the region (bases which are the core of the isActiveProbs from the walker) as + * well as an extended size, that includes the ART walker's extension size. Reads in the region + * provided by ART include all reads overlapping the extended span, not the raw span. + * * User: rpoplin * Date: 1/4/12 */ - +@Invariant({ + "extension >= 0", + "activeRegionLoc != null", + "genomeLocParser != null", + "spanIncludingReads != null", + "extendedLoc != null" +}) public class ActiveRegion implements HasGenomeLocation { - + /** + * The reads included in this active region. May be empty upon creation, and expand / contract + * as reads are added or removed from this region. + */ private final ArrayList reads = new ArrayList(); - private final List supportingStates; - private final GenomeLoc activeRegionLoc; - private final GenomeLoc extendedLoc; - private final int extension; - private GenomeLoc fullExtentReferenceLoc = null; - private final GenomeLocParser genomeLocParser; - public final boolean isActive; + /** + * An ordered list (by genomic coordinate) of the ActivityProfileStates that went + * into this active region. May be empty, which says that no supporting states were + * provided when this region was created. + */ + private final List supportingStates; + + /** + * The raw span of this active region, not including the active region extension + */ + private final GenomeLoc activeRegionLoc; + + /** + * The span of this active region on the genome, including the active region extension + */ + private final GenomeLoc extendedLoc; + + /** + * The extension, in bp, of this active region. 
+ */ + private final int extension; + + /** + * A genomeLocParser so we can create genomeLocs + */ + private final GenomeLocParser genomeLocParser; + + /** + * Does this region represent an active region (all isActiveProbs above threshold) or + * an inactive region (all isActiveProbs below threshold)? + */ + private final boolean isActive; + + /** + * The span of this active region, including the bp covered by all reads in this + * region. This union of extensionLoc and the loc of all reads in this region. + * + * Must be at least as large as extendedLoc, but may be larger when reads + * partially overlap this region. + */ + private GenomeLoc spanIncludingReads; + + /** + * Create a new ActiveRegion containing no reads + * + * @param activeRegionLoc the span of this active region + * @param supportingStates the states that went into creating this region, or null / empty if none are available. + * If not empty, must have exactly one state for each bp in activeRegionLoc + * @param isActive indicates whether this is an active region, or an inactve one + * @param genomeLocParser a non-null parser to let us create new genome locs + * @param extension the active region extension to use for this active region + */ public ActiveRegion( final GenomeLoc activeRegionLoc, final List supportingStates, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { if ( activeRegionLoc == null ) throw new IllegalArgumentException("activeRegionLoc cannot be null"); + if ( activeRegionLoc.size() == 0 ) throw new IllegalArgumentException("Active region cannot be of zero size, but got " + activeRegionLoc); if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); if ( extension < 0 ) throw new IllegalArgumentException("extension cannot be < 0 but got " + extension); @@ -64,75 +132,254 @@ public class ActiveRegion implements HasGenomeLocation { this.isActive = isActive; this.genomeLocParser = genomeLocParser; 
this.extension = extension; - extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); - fullExtentReferenceLoc = extendedLoc; + this.extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); + this.spanIncludingReads = extendedLoc; + + if ( ! this.supportingStates.isEmpty() ) { + if ( this.supportingStates.size() != activeRegionLoc.size() ) + throw new IllegalArgumentException("Supporting states wasn't empty but it doesn't have exactly one state per bp in the active region: states " + this.supportingStates.size() + " vs. bp in region = " + activeRegionLoc.size()); + GenomeLoc lastStateLoc = null; + for ( final ActivityProfileState state : this.supportingStates ) { + if ( lastStateLoc != null ) { + if ( state.getLoc().getStart() != lastStateLoc.getStart() + 1 || state.getLoc().getContigIndex() != lastStateLoc.getContigIndex()) + throw new IllegalArgumentException("Supporting state has an invalid sequence: last state was " + lastStateLoc + " but next state was " + state); + } + lastStateLoc = state.getLoc(); + } + } } @Override public String toString() { - return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive + " nReads=" + reads.size() + " "; + return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size() + " "; } - // add each read to the bin and extend the reference genome activeRegionLoc if needed - public void add( final GATKSAMRecord read ) { - fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); - reads.add( read ); - } - - public void hardClipToActiveRegion() { - final ArrayList clippedReads = ReadClipper.hardClipToRegion( reads, extendedLoc.getStart(), extendedLoc.getStop() ); - reads.clear(); - reads.addAll(clippedReads); - } - - public 
ArrayList getReads() { return reads; } - - @Requires("referenceReader.isUppercasingBases()") - public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader ) { + /** + * See #getActiveRegionReference but with padding == 0 + */ + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { return getActiveRegionReference(referenceReader, 0); } - @Requires("referenceReader.isUppercasingBases()") - public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) { - return getReference( referenceReader, padding, extendedLoc ); + /** + * Get the reference bases from referenceReader spanned by the extended location of this active region, + * including additional padding bp on either side. If this expanded region would exceed the boundaries + * of the active region's contig, the returned result will be truncated to only include on-genome reference + * bases + * @param referenceReader the source of the reference genome bases + * @param padding the padding, in BP, we want to add to either side of this active region extended region + * @return a non-null array of bytes holding the reference bases in referenceReader + */ + @Ensures("result != null") + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return getReference(referenceReader, padding, extendedLoc); } - @Requires("referenceReader.isUppercasingBases()") - public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader ) { + /** + * See #getActiveRegionReference but using the span including regions not the extended span + */ + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { return getFullReference(referenceReader, 0); } - @Requires("referenceReader.isUppercasingBases()") - public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) { - return 
getReference( referenceReader, padding, fullExtentReferenceLoc ); + /** + * See #getActiveRegionReference but using the span including regions not the extended span + */ + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return getReference(referenceReader, padding, spanIncludingReads); } - @Requires("referenceReader.isUppercasingBases()") - private byte[] getReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + /** + * Get the reference bases from referenceReader spanned by the extended location of this active region, + * including additional padding bp on either side. If this expanded region would exceed the boundaries + * of the active region's contig, the returned result will be truncated to only include on-genome reference + * bases + * @param referenceReader the source of the reference genome bases + * @param padding the padding, in BP, we want to add to either side of this active region extended region + * @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for + * @return a non-null array of bytes holding the reference bases in referenceReader + */ + @Ensures("result != null") + private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null"); + if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding); + if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null"); + if ( genomeLoc.size() == 0 ) throw new IllegalArgumentException("GenomeLoc must have size > 0 but got " + genomeLoc); + final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), Math.max(1, genomeLoc.getStart() - padding), 
Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); + return reference; } + /** + * Get the raw span of this active region (excluding the extension) + * @return a non-null genome loc + */ @Override + @Ensures("result != null") public GenomeLoc getLocation() { return activeRegionLoc; } + + /** + * Get the span of this active region including the extension value + * @return a non-null GenomeLoc + */ + @Ensures("result != null") public GenomeLoc getExtendedLoc() { return extendedLoc; } - public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; } - public List getSupportingStates() { return supportingStates; } + /** + * Get the span of this active region including the extension and the projects on the + * genome of all reads in this active region. That is, returns the bp covered by this + * region and all reads in the region. + * @return a non-null genome loc + */ + @Ensures("result != null") + public GenomeLoc getReadSpanLoc() { return spanIncludingReads; } + /** + * Get the active profile states that went into creating this region, if possible + * @return an unmodifiable list of states that led to the creation of this region, or an empty + * list if none were provided + */ + @Ensures("result != null") + public List getSupportingStates() { + return Collections.unmodifiableList(supportingStates); + } + + /** + * Get the active region extension applied to this region + * + * The extension is >= 0 bp in size, and indicates how much padding this art walker wanted for its regions + * + * @return the size in bp of the region extension + */ + @Ensures("result >= 0") public int getExtension() { return extension; } - public int size() { return reads.size(); } - public void clearReads() { reads.clear(); } - public void removeAll( final ArrayList readsToRemove ) { reads.removeAll( readsToRemove ); } - public boolean equalExceptReads(final ActiveRegion other) { + /** + * 
Get an unmodifiable list of reads currently in this active region. + * + * The reads are sorted by their coordinate position + * + * @return an unmodifiable list of reads in this active region + */ + @Ensures("result != null") + public List getReads() { + return Collections.unmodifiableList(reads); + } + + /** + * Get the number of reads currently in this active region + * @return an integer >= 0 + */ + @Ensures("result >= 0") + public int size() { return reads.size(); } + + /** + * Add read to this active region + * + * Read must have alignment start >= than the last read currently in this active region. + * + * @throws IllegalArgumentException if read doesn't overlap the extended region of this active region + * + * @param read a non-null GATKSAMRecord + */ + @Ensures("reads.size() == old(reads.size()) + 1") + public void add( final GATKSAMRecord read ) { + if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); + + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); + if ( ! readOverlapsRegion(read) ) + throw new IllegalArgumentException("Read location " + readLoc + " doesn't overlap with active region extended span " + extendedLoc); + + spanIncludingReads = spanIncludingReads.union( readLoc ); + + if ( ! reads.isEmpty() ) { + final GATKSAMRecord lastRead = reads.get(size() - 1); + if ( ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) + throw new IllegalArgumentException("Attempting to add a read to ActiveRegion not on the same contig as other reads: lastRead " + lastRead + " attempting to add " + read); + + if ( read.getAlignmentStart() < lastRead.getAlignmentStart() ) + throw new IllegalArgumentException("Attempting to add a read to ActiveRegion out of order w.r.t. 
other reads: lastRead " + lastRead + " at " + lastRead.getAlignmentStart() + " attempting to add " + read + " at " + read.getAlignmentStart()); + } + + reads.add( read ); + } + + /** + * Returns true if read would overlap the extended extent of this region + * @param read the read we want to test + * @return true if read can be added to this region, false otherwise + */ + public boolean readOverlapsRegion(final GATKSAMRecord read) { + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); + return readLoc.overlapsP(extendedLoc); + } + + /** + * Add all reads to this active region + * @param reads a collection of reads to add to this active region + */ + public void addAll(final Collection reads) { + if ( reads == null ) throw new IllegalArgumentException("reads cannot be null"); + for ( final GATKSAMRecord read : reads ) + add(read); + } + + /** + * Clear all of the reads currently in this active region + */ + @Ensures("size() == 0") + public void clearReads() { + spanIncludingReads = extendedLoc; + reads.clear(); + } + + /** + * Remove all of the reads in readsToRemove from this active region + * @param readsToRemove the collection of reads we want to remove + */ + public void removeAll( final Collection readsToRemove ) { + reads.removeAll(readsToRemove); + spanIncludingReads = extendedLoc; + for ( final GATKSAMRecord read : reads ) { + spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) ); + } + } + + /** + * Clips all of the reads in this active region so that none extend beyond the active region extended loc + * + * This function may change the getReadSpanLoc, as it updates the read span based on the new clipped + * read coordinates. 
+ */ + public void hardClipToActiveRegion() { + final ArrayList clippedReads = ReadClipper.hardClipToRegion( reads, extendedLoc.getStart(), extendedLoc.getStop() ); + ReadUtils.sortReadsByCoordinate(clippedReads); + clearReads(); + addAll(clippedReads); + } + + /** + * Is this region equal to other, excluding any reads in either region in the comparison + * @param other the other active region we want to test + * @return true if this region is equal, excluding any reads and derived values, to other + */ + protected boolean equalExceptReads(final ActiveRegion other) { if ( activeRegionLoc.compareTo(other.activeRegionLoc) != 0 ) return false; - if ( isActive != other.isActive ) return false; + if ( isActive() != other.isActive()) return false; if ( genomeLocParser != other.genomeLocParser ) return false; if ( extension != other.extension ) return false; if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false; return true; } + + /** + * Does this region represent an active region (all isActiveProbs above threshold) or + * an inactive region (all isActiveProbs below threshold)? 
+ */ + public boolean isActive() { + return isActive; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 0f5d6a2f7..1bf24814b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -115,6 +115,19 @@ public class ArtificialSAMUtils { return header; } + /** + * Creates an artificial sam header based on the sequence dictionary dict + * + * @return a new coordinate-sorted SAMFileHeader that uses dict as its sequence dictionary + */ + public static SAMFileHeader createArtificialSamHeader(final SAMSequenceDictionary dict) { + SAMFileHeader header = new SAMFileHeader(); + header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate); + header.setSequenceDictionary(dict); + return header; + } + + /** * setup a default read group for a SAMFileHeader * diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java new file mode 100644 index 000000000..d2ea5d11b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.activeregion; + + +// the imports for unit testing. + + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + + +public class ActiveRegionUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + private IndexedFastaSequenceFile seq; + private String contig; + private int contigLength; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + genomeLocParser = new GenomeLocParser(seq); + contig = "1"; + contigLength = genomeLocParser.getContigInfo(contig).getSequenceLength(); + } + + @DataProvider(name = "ActionRegionCreationTest") + public Object[][] makePollingData() { + List tests = new ArrayList(); + for ( final int start : Arrays.asList(1, 10, 100, contigLength - 10, contigLength - 1) ) { + for ( 
final int size : Arrays.asList(1, 10, 100, 1000) ) { + for ( final int ext : Arrays.asList(0, 1, 10, 100) ) { + for ( final boolean isActive : Arrays.asList(true, false) ) { + for ( final boolean addStates : Arrays.asList(true, false) ) { + List states = null; + if ( addStates ) { + states = new LinkedList(); + for ( int i = start; i < start + size; i++ ) { + states.add(new ActivityProfileState(genomeLocParser.createGenomeLoc(contig, i + start), isActive ? 1.0 : 0.0)); + } + } + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, start, start + size - 1); + tests.add(new Object[]{loc, states, isActive, ext}); + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "ActionRegionCreationTest") + public void testCreatingActiveRegions(final GenomeLoc loc, final List supportingStates, final boolean isActive, final int extension) { + final ActiveRegion region = new ActiveRegion(loc, supportingStates, isActive, genomeLocParser, extension); + Assert.assertEquals(region.getLocation(), loc); + Assert.assertEquals(region.getExtendedLoc().getStart(), Math.max(loc.getStart() - extension, 1)); + Assert.assertEquals(region.getExtendedLoc().getStop(), Math.min(loc.getStop() + extension, contigLength)); + Assert.assertEquals(region.getReadSpanLoc().getStart(), Math.max(loc.getStart() - extension, 1)); + Assert.assertEquals(region.getReadSpanLoc().getStop(), Math.min(loc.getStop() + extension, contigLength)); + Assert.assertEquals(region.isActive(), isActive); + Assert.assertEquals(region.getExtension(), extension); + Assert.assertEquals(region.getReads(), Collections.emptyList()); + Assert.assertEquals(region.size(), 0); + Assert.assertEquals(region.getSupportingStates(), supportingStates == null ? 
Collections.emptyList() : supportingStates); + Assert.assertNotNull(region.toString()); + + assertGoodReferenceGetter(region.getActiveRegionReference(seq), region.getExtendedLoc(), 0); + assertGoodReferenceGetter(region.getActiveRegionReference(seq, 10), region.getExtendedLoc(), 10); + assertGoodReferenceGetter(region.getFullReference(seq), region.getReadSpanLoc(), 0); + assertGoodReferenceGetter(region.getFullReference(seq, 10), region.getReadSpanLoc(), 10); + } + + private void assertGoodReferenceGetter(final byte[] actualBytes, final GenomeLoc span, final int padding) { + final int expectedStart = Math.max(span.getStart() - padding, 1); + final int expectedStop = Math.min(span.getStop() + padding, contigLength); + final byte[] expectedBytes = seq.getSubsequenceAt(span.getContig(), expectedStart, expectedStop).getBases(); + Assert.assertEquals(actualBytes, expectedBytes); + } + + @DataProvider(name = "ActiveRegionReads") + public Object[][] makeActiveRegionReads() { + List tests = new ArrayList(); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + for ( final int start : Arrays.asList(1, 10, 100, contigLength - 10, contigLength - 1) ) { + for ( final int readStartOffset : Arrays.asList(-100, -10, 0, 10, 100) ) { + for ( final int readSize : Arrays.asList(10, 100, 1000) ) { + final GenomeLoc loc = genomeLocParser.createGenomeLocOnContig(contig, start, start + 10); + + final int readStart = Math.max(start + readStartOffset, 1); + final int readStop = Math.min(readStart + readSize, contigLength); + final int readLength = readStop - readStart + 1; + if ( readLength > 0 ) { + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, readStart, readLength); + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); + if ( readLoc.overlapsP(loc) ) + tests.add(new Object[]{loc, read}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = 
"ActiveRegionReads") + public void testActiveRegionReads(final GenomeLoc loc, final GATKSAMRecord read) { + final GenomeLoc expectedSpan = loc.union(genomeLocParser.createGenomeLoc(read)); + + final ActiveRegion region = new ActiveRegion(loc, null, true, genomeLocParser, 0); + final ActiveRegion region2 = new ActiveRegion(loc, null, true, genomeLocParser, 0); + Assert.assertEquals(region.getReads(), Collections.emptyList()); + Assert.assertEquals(region.size(), 0); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), loc); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.add(read); + Assert.assertEquals(region.getReads(), Collections.singletonList(read)); + Assert.assertEquals(region.size(), 1); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), expectedSpan); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.clearReads(); + Assert.assertEquals(region.getReads(), Collections.emptyList()); + Assert.assertEquals(region.size(), 0); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), loc); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.addAll(Collections.singleton(read)); + Assert.assertEquals(region.getReads(), Collections.singletonList(read)); + Assert.assertEquals(region.size(), 1); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), expectedSpan); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.removeAll(Collections.emptyList()); + Assert.assertEquals(region.getReads(), Collections.singletonList(read)); + Assert.assertEquals(region.size(), 1); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), expectedSpan); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.removeAll(Collections.singletonList(read)); + Assert.assertEquals(region.getReads(), 
Collections.emptyList()); + Assert.assertEquals(region.size(), 0); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), loc); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.add(read); + region.hardClipToActiveRegion(); + Assert.assertEquals(region.size(), 1); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), loc); + Assert.assertTrue(region.getReads().get(0).getAlignmentStart() >= region.getExtendedLoc().getStart()); + Assert.assertTrue(region.getReads().get(0).getAlignmentEnd() <= region.getExtendedLoc().getStop()); + } + + @DataProvider(name = "BadReadsTest") + public Object[][] makeBadReadsTest() { + List tests = new ArrayList(); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + tests.add(new Object[]{ + ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10), + ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 9, 10)}); + tests.add(new Object[]{ + ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10), + ArtificialSAMUtils.createArtificialRead(header, "read2", 1, 9, 10)}); + tests.add(new Object[]{ + ArtificialSAMUtils.createArtificialRead(header, "read1", 1, 10, 10), + ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 9, 10)}); + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BadReadsTest", expectedExceptions = IllegalArgumentException.class) + public void testBadReads(final GATKSAMRecord read1, final GATKSAMRecord read2) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(read1); + final ActiveRegion region = new ActiveRegion(loc, null, true, genomeLocParser, 0); + region.add(read1); + region.add(read2); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java index 1df4a3348..b9fdb3afe 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -276,7 +276,7 @@ public class ActivityProfileUnitTest extends BaseTest { Assert.assertTrue(regionOffset >= 0 && regionOffset < probs.size(), "Region " + region + " has a bad offset w.r.t. start"); for ( int j = 0; j < region.getLocation().size(); j++ ) { final int siteOffset = j + regionOffset; - Assert.assertEquals(region.isActive, probs.get(siteOffset).booleanValue()); + Assert.assertEquals(region.isActive(), probs.get(siteOffset).booleanValue()); Assert.assertFalse(seenSites.get(siteOffset), "Site " + j + " in " + region + " was seen already"); seenSites.set(siteOffset, true); } diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java index eb2eebd36..cb2a6bfb2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -309,8 +309,8 @@ public class BandPassActivityProfileUnitTest extends BaseTest { lastPosSeen = region.getLocation().getStop(); for ( final ActivityProfileState state : region.getSupportingStates() ) { - Assert.assertEquals(state.isActiveProb > ActivityProfile.ACTIVE_PROB_THRESHOLD, region.isActive, - "Region is active=" + region.isActive + " but contains a state " + state + " with prob " + Assert.assertEquals(state.isActiveProb > ActivityProfile.ACTIVE_PROB_THRESHOLD, region.isActive(), + "Region is active=" + region.isActive() + " but contains a state " + state + " with prob " + state.isActiveProb + " not within expected 
values given threshold for activity of " + ActivityProfile.ACTIVE_PROB_THRESHOLD); } From 69dd5cc902122c432cb337a7219c7d28cdbbf456 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 29 Jan 2013 19:28:14 -0500 Subject: [PATCH 11/13] AutoFormattingTimeUnitTest should be in utils --- .../sting/{ => utils}/AutoFormattingTimeUnitTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) rename public/java/test/org/broadinstitute/sting/{ => utils}/AutoFormattingTimeUnitTest.java (98%) diff --git a/public/java/test/org/broadinstitute/sting/AutoFormattingTimeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/AutoFormattingTimeUnitTest.java similarity index 98% rename from public/java/test/org/broadinstitute/sting/AutoFormattingTimeUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/AutoFormattingTimeUnitTest.java index b1aacd8b9..8bedfb547 100644 --- a/public/java/test/org/broadinstitute/sting/AutoFormattingTimeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/AutoFormattingTimeUnitTest.java @@ -23,8 +23,9 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting; +package org.broadinstitute.sting.utils; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.AutoFormattingTime; import org.testng.Assert; import org.testng.annotations.DataProvider; From 8562bfaae1a028a2966d5cbd3aff78a88b7ac4dc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 29 Jan 2013 08:10:56 -0500 Subject: [PATCH 12/13] Optimize GenomeLocParser.createGenomeLoc -- The new version is roughly 2x faster than the previous version. 
The key here was to cleanup the workflow for validateGenomeLoc and remove the now unnecessary synchronization blocks from the CachingSequencingDictionary, since these are now thread local variables -- #resolves https://jira.broadinstitute.org/browse/GSA-724 --- .../sting/utils/GenomeLocParser.java | 169 ++++++++++++------ .../sting/utils/GenomeLocParserBenchmark.java | 81 +++++++++ 2 files changed, 191 insertions(+), 59 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index e70182acf..36b23ae14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -45,6 +45,14 @@ import org.broadinstitute.variant.variantcontext.VariantContext; public final class GenomeLocParser { private static Logger logger = Logger.getLogger(GenomeLocParser.class); + /** + * How much validation should we do at runtime with this parser? + */ + public enum ValidationLevel { + STANDARD, + NONE + } + // -------------------------------------------------------------------------------------------------------------- // // Ugly global variable defining the optional ordering of contig elements @@ -58,22 +66,25 @@ public final class GenomeLocParser { final private SAMSequenceDictionary SINGLE_MASTER_SEQUENCE_DICTIONARY; /** - * A thread-local caching contig info + * A thread-local CachingSequenceDictionary */ private final ThreadLocal contigInfoPerThread = - new ThreadLocal(); + new ThreadLocal() { + @Override + protected CachingSequenceDictionary initialValue() { + return new CachingSequenceDictionary(SINGLE_MASTER_SEQUENCE_DICTIONARY); + } + }; + + /** + * How much validation are we doing at runtime with this GenomeLocParser? 
+ */ + private final ValidationLevel validationLevel; /** * @return a caching sequence dictionary appropriate for this thread */ private CachingSequenceDictionary getContigInfo() { - if ( contigInfoPerThread.get() == null ) { - // initialize for this thread - contigInfoPerThread.set(new CachingSequenceDictionary(SINGLE_MASTER_SEQUENCE_DICTIONARY)); - } - - assert contigInfoPerThread.get() != null; - return contigInfoPerThread.get(); } @@ -94,24 +105,28 @@ public final class GenomeLocParser { this.dict = dict; } + public final String internContigName(final String contig) { + return getContigInfo(contig).getSequenceName(); + } + @Ensures("result > 0") public final int getNSequences() { return dict.size(); } @Requires("contig != null") - public final synchronized boolean hasContig(final String contig) { + public final boolean hasContig(final String contig) { return contig.equals(lastContig) || dict.getSequence(contig) != null; } @Requires("index >= 0") - public final synchronized boolean hasContig(final int index) { + public final boolean hasContig(final int index) { return lastIndex == index || dict.getSequence(index) != null; } @Requires("contig != null") @Ensures("result != null") - public synchronized final SAMSequenceRecord getSequence(final String contig) { + public final SAMSequenceRecord getSequence(final String contig) { if ( isCached(contig) ) return lastSSR; else @@ -120,7 +135,7 @@ public final class GenomeLocParser { @Requires("index >= 0") @Ensures("result != null") - public synchronized final SAMSequenceRecord getSequence(final int index) { + public final SAMSequenceRecord getSequence(final int index) { if ( isCached(index) ) return lastSSR; else @@ -129,7 +144,7 @@ public final class GenomeLocParser { @Requires("contig != null") @Ensures("result >= 0") - public synchronized final int getSequenceIndex(final String contig) { + public final int getSequenceIndex(final String contig) { if ( ! 
isCached(contig) ) { updateCache(contig, -1); } @@ -138,12 +153,12 @@ public final class GenomeLocParser { } @Requires({"contig != null", "lastContig != null"}) - private synchronized boolean isCached(final String contig) { + private boolean isCached(final String contig) { return lastContig.equals(contig); } @Requires({"lastIndex != -1", "index >= 0"}) - private synchronized boolean isCached(final int index) { + private boolean isCached(final int index) { return lastIndex == index; } @@ -157,7 +172,7 @@ public final class GenomeLocParser { */ @Requires("contig != null || index >= 0") @Ensures("result != null") - private synchronized SAMSequenceRecord updateCache(final String contig, int index ) { + private SAMSequenceRecord updateCache(final String contig, int index ) { SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig); if ( rec == null ) { throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index); @@ -168,8 +183,6 @@ public final class GenomeLocParser { return rec; } } - - } /** @@ -181,16 +194,32 @@ public final class GenomeLocParser { this(refFile.getSequenceDictionary()); } + /** + * Create a new GenomeLocParser based on seqDictionary with the standard validation level + * @param seqDict a non-null sequence dictionary + */ public GenomeLocParser(SAMSequenceDictionary seqDict) { + this(seqDict, ValidationLevel.STANDARD); + } + + /** + * Create a genome loc parser based on seqDict with the specified level of validation + * @param seqDict the sequence dictionary to use when creating genome locs + * @param validationLevel how much validation should we do of the genome locs at runtime? 
+ */ + public GenomeLocParser(SAMSequenceDictionary seqDict, final ValidationLevel validationLevel) { + this.validationLevel = validationLevel; if (seqDict == null) { // we couldn't load the reference dictionary //logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs"); throw new UserException.CommandLineException("Failed to load reference dictionary"); } SINGLE_MASTER_SEQUENCE_DICTIONARY = seqDict; - logger.debug(String.format("Prepared reference sequence contig dictionary")); - for (SAMSequenceRecord contig : seqDict.getSequences()) { - logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); + if ( logger.isDebugEnabled() ) { + logger.debug(String.format("Prepared reference sequence contig dictionary")); + for (SAMSequenceRecord contig : seqDict.getSequences()) { + logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); + } } } @@ -283,13 +312,23 @@ public final class GenomeLocParser { @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop,mustBeOnReference)"}) public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop, boolean mustBeOnReference) { - validateGenomeLoc(contig, index, start, stop, mustBeOnReference, true); - return new GenomeLoc(contig, index, start, stop); + // optimization: by interning the string we ensure that future comparisons use == not the full string comp + final String interned = validateGenomeLoc(contig, index, start, stop, mustBeOnReference); + return new GenomeLoc(interned, index, start, stop); } + /** + * Create a new genome loc, bounding start and stop by the start and end of contig + * @param contig our contig + * @param start our start as an arbitrary integer (may be negative, etc) + * @param stop our stop as an arbitrary integer (may be negative, etc) + * @throws ReviewedStingException if there's no way to create a meaningful genome loc 
given contig, start, and stop + * @return a valid genome loc over contig + */ public GenomeLoc createGenomeLocOnContig(final String contig, final int start, final int stop) { - GenomeLoc contigLoc = createOverEntireContig(contig); - return new GenomeLoc(contig, getContigIndex(contig), start, stop).intersect(contigLoc); + final GenomeLoc myLoc = createGenomeLoc(contig, start, stop); + final GenomeLoc contigLoc = createOverEntireContig(contig); + return myLoc.intersect(contigLoc); } /** @@ -306,50 +345,62 @@ public final class GenomeLocParser { * @param start the start position * @param stop the stop position * - * @return true if it's valid, false otherwise. If exceptOnError, then throws a UserException if invalid + * @return the interned contig name, an optimization that ensures that contig == the string in the sequence dictionary */ - private boolean validateGenomeLoc(String contig, int contigIndex, int start, int stop, boolean mustBeOnReference, boolean exceptOnError) { - if ( ! getContigInfo().hasContig(contig) ) - return vglHelper(exceptOnError, String.format("Unknown contig %s", contig)); + protected String validateGenomeLoc(final String contig, final int contigIndex, final int start, final int stop, final boolean mustBeOnReference) { + if ( validationLevel == ValidationLevel.NONE ) + return contig; + else { + if (stop < start) + vglHelper(String.format("The stop position %d is less than start %d in contig %s", stop, start, contig)); - if (stop < start) - return vglHelper(exceptOnError, String.format("The stop position %d is less than start %d in contig %s", stop, start, contig)); + final SAMSequenceRecord contigInfo = getContigInfo().getSequence(contig); + if ( contigInfo.getSequenceIndex() != contigIndex ) + vglHelper(String.format("The contig index %d is bad, doesn't equal the contig index %d of the contig from a string %s", + contigIndex, contigInfo.getSequenceIndex(), contig)); - if (contigIndex < 0) - return vglHelper(exceptOnError, String.format("The 
contig index %d is less than 0", contigIndex)); + if ( mustBeOnReference ) { + if (start < 1) + vglHelper(String.format("The start position %d is less than 1", start)); - if (contigIndex >= getContigInfo().getNSequences()) - return vglHelper(exceptOnError, String.format("The contig index %d is greater than the stored sequence count (%d)", contigIndex, getContigInfo().getNSequences())); + if (stop < 1) + vglHelper(String.format("The stop position %d is less than 1", stop)); - if ( mustBeOnReference ) { - if (start < 1) - return vglHelper(exceptOnError, String.format("The start position %d is less than 1", start)); + final int contigSize = contigInfo.getSequenceLength(); + if (start > contigSize || stop > contigSize) + vglHelper(String.format("The genome loc coordinates %d-%d exceed the contig size (%d)", start, stop, contigSize)); + } - if (stop < 1) - return vglHelper(exceptOnError, String.format("The stop position %d is less than 1", stop)); - - int contigSize = getContigInfo().getSequence(contigIndex).getSequenceLength(); - if (start > contigSize || stop > contigSize) - return vglHelper(exceptOnError, String.format("The genome loc coordinates %d-%d exceed the contig size (%d)", start, stop, contigSize)); + return contigInfo.getSequenceName(); } - - // we passed - return true; } + /** + * Would a genome loc created with the given parameters be valid w.r.t. the master sequence dictionary? + * @param contig the contig we'd use + * @param start the start position + * @param stop the stop + * @param mustBeOnReference should we require the resulting genome loc to be completely on the reference genome? 
+ * @return true if this would produce a valid genome loc, false otherwise + */ public boolean isValidGenomeLoc(String contig, int start, int stop, boolean mustBeOnReference ) { - return validateGenomeLoc(contig, getContigIndexWithoutException(contig), start, stop, mustBeOnReference, false); - } - - public boolean isValidGenomeLoc(String contig, int start, int stop ) { - return validateGenomeLoc(contig, getContigIndexWithoutException(contig), start, stop, true, false); - } - - private boolean vglHelper(boolean exceptOnError, String msg) { - if ( exceptOnError ) - throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect:" + msg); - else + try { + validateGenomeLoc(contig, getContigIndexWithoutException(contig), start, stop, mustBeOnReference); + return true; + } catch ( ReviewedStingException e) { return false; + } + } + + /** + * @see #isValidGenomeLoc(String, int, int, boolean) with mustBeOnReference == true + */ + public boolean isValidGenomeLoc(String contig, int start, int stop ) { + return isValidGenomeLoc(contig, start, stop, true); + } + + private void vglHelper(final String msg) { + throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect:" + msg); } // -------------------------------------------------------------------------------------------------------------- diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java new file mode 100644 index 000000000..478f02530 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, 
modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; + +import java.io.File; + +/** + * Caliper microbenchmark of genome loc parser + */ +public class GenomeLocParserBenchmark extends SimpleBenchmark { + private IndexedFastaSequenceFile seq; + private final int ITERATIONS = 1000000; + + @Param({"NEW", "NONE"}) + GenomeLocParser.ValidationLevel validationLevel; // set automatically by framework + + @Param({"true", "false"}) + boolean useContigIndex; // set automatically by framework + + @Override protected void setUp() throws Exception { + seq = new CachingIndexedFastaSequenceFile(new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")); + } +// +// public void timeSequentialCreationFromGenomeLoc(int rep) { +// final GenomeLocParser genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary(), validationLevel); +// GenomeLoc last = genomeLocParser.createGenomeLoc("1", 1, 1); +// for ( int i = 0; i < rep; i++ ) { 
+// for ( int j = 1; j < ITERATIONS; j++ ) { +// if ( useContigIndex ) +// last = genomeLocParser.createGenomeLoc(last.getContig(), last.getContigIndex(), last.getStart() + 1); +// else +// last = genomeLocParser.createGenomeLoc(last.getContig(), last.getStart() + 1); +// } +// } +// } +// +// public void timeSequentialCreationFromGenomeLocOriginal(int rep) { +// final GenomeLocParserOriginal genomeLocParser = new GenomeLocParserOriginal(seq.getSequenceDictionary()); +// GenomeLoc last = genomeLocParser.createGenomeLoc("1", 1, 1); +// for ( int i = 0; i < rep; i++ ) { +// for ( int j = 1; j < ITERATIONS; j++ ) { +// if ( useContigIndex ) +// last = genomeLocParser.createGenomeLoc(last.getContig(), last.getContigIndex(), last.getStart() + 1); +// else +// last = genomeLocParser.createGenomeLoc(last.getContig(), last.getStart() + 1); +// } +// } +// } + + public static void main(String[] args) { + com.google.caliper.Runner.main(GenomeLocParserBenchmark.class, args); + } +} From 45603f58cd285db612ca3af9bacab43310c2722a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 29 Jan 2013 16:51:39 -0500 Subject: [PATCH 13/13] Refactoring and unit testing GenomeLocParser -- Moved previously inner class to MRUCachingSAMSequenceDictionary, and unit test to 100% coverage -- Fully document all functions in GenomeLocParser -- Unit tests for things like parsePosition (shocking it wasn't tested!) -- Removed function to specifically create GenomeLocs for VariantContexts. 
The fact that you must incorporate END attributes in the context means that createGenomeLoc(Feature) works correctly -- Deprecated setStart, setStop, and incPos in GenomeLocParser (functionality moved to GenomeLoc) -- Unit test coverage at like 80%, moving to 100% with next commit --- .../evaluators/VariantSummary.java | 2 +- .../IntervalStratification.java | 2 +- .../broadinstitute/sting/utils/GenomeLoc.java | 51 +++ .../sting/utils/GenomeLocParser.java | 332 ++++++------------ .../MRUCachingSAMSequenceDictionary.java | 186 ++++++++++ .../sting/utils/GenomeLocParserUnitTest.java | 181 +++++++++- ...achingSAMSequencingDictionaryUnitTest.java | 97 +++++ 7 files changed, 627 insertions(+), 224 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java index a3b703ad3..2a1dbd277 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java @@ -193,7 +193,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { private boolean overlapsKnownCNV(VariantContext cnv) { if ( knownCNVs != null ) { - final GenomeLoc loc = getWalker().getToolkit().getGenomeLocParser().createGenomeLoc(cnv, true); + final GenomeLoc loc = getWalker().getToolkit().getGenomeLocParser().createGenomeLoc(cnv); IntervalTree intervalTree = knownCNVs.get(loc.getContig()); final Iterator> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop()); diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java index be689fe55..312e506a2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java @@ -77,7 +77,7 @@ public class IntervalStratification extends VariantStratifier { public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if (eval != null) { - final GenomeLoc loc = getVariantEvalWalker().getToolkit().getGenomeLocParser().createGenomeLoc(eval, true); + final GenomeLoc loc = getVariantEvalWalker().getToolkit().getGenomeLocParser().createGenomeLoc(eval); IntervalTree intervalTree = intervalTreeByContig.get(loc.getContig()); IntervalTree.Node node = intervalTree.minOverlapper(loc.getStart(), loc.getStop()); //logger.info(String.format("Overlap %s found %s", loc, node)); diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index c81e8e853..4f1b35f62 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -530,4 +530,55 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome final int cmp = this.compareTo(other); return cmp == -1 ? other : this; } + + /** + * create a new genome loc from an existing loc, with a new start position + * Note that this function will NOT explicitly check the ending offset, in case someone wants to + * set the start of a new GenomeLoc pertaining to a read that goes off the end of the contig. 
+ * + * @param loc the old location + * @param start a new start position + * + * @return a newly allocated GenomeLoc as loc but with start == start + */ + public GenomeLoc setStart(GenomeLoc loc, int start) { + return new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop()); + } + + /** + * create a new genome loc from an existing loc, with a new stop position + * Note that this function will NOT explicitly check the ending offset, in case someone wants to + * set the stop of a new GenomeLoc pertaining to a read that goes off the end of the contig. + * + * @param loc the old location + * @param stop a new stop position + * + * @return a newly allocated GenomeLoc as loc but with stop == stop + */ + public GenomeLoc setStop(GenomeLoc loc, int stop) { + return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop); + } + + /** + * return a new genome loc, with an incremented position + * + * @param loc the old location + * + * @return a newly allocated GenomeLoc as loc but with start == loc.getStart() + 1 + */ + public GenomeLoc incPos(GenomeLoc loc) { + return incPos(loc, 1); + } + + /** + * return a new genome loc, with an incremented position + * + * @param loc the old location + * @param by how much to move the start and stop by + * + * @return a newly allocated GenomeLoc as loc but with start == loc.getStart() + by + */ + public GenomeLoc incPos(GenomeLoc loc, int by) { + return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 36b23ae14..61478744d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -34,10 +34,8 @@ import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import 
org.apache.log4j.Logger; import org.broad.tribble.Feature; -import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.variant.variantcontext.VariantContext; /** * Factory class for creating GenomeLocs @@ -49,7 +47,9 @@ public final class GenomeLocParser { * How much validation should we do at runtime with this parser? */ public enum ValidationLevel { + /** Do the standard amount of validation */ STANDARD, + /** Don't do any real checking at all */ NONE } @@ -68,11 +68,11 @@ public final class GenomeLocParser { /** * A thread-local CachingSequenceDictionary */ - private final ThreadLocal contigInfoPerThread = - new ThreadLocal() { + private final ThreadLocal contigInfoPerThread = + new ThreadLocal() { @Override - protected CachingSequenceDictionary initialValue() { - return new CachingSequenceDictionary(SINGLE_MASTER_SEQUENCE_DICTIONARY); + protected MRUCachingSAMSequenceDictionary initialValue() { + return new MRUCachingSAMSequenceDictionary(SINGLE_MASTER_SEQUENCE_DICTIONARY); } }; @@ -84,107 +84,10 @@ public final class GenomeLocParser { /** * @return a caching sequence dictionary appropriate for this thread */ - private CachingSequenceDictionary getContigInfo() { + private MRUCachingSAMSequenceDictionary getContigInfo() { return contigInfoPerThread.get(); } - /** - * A wrapper class that provides efficient last used caching for the global - * SAMSequenceDictionary underlying all of the GATK engine capabilities. 
- */ - private final class CachingSequenceDictionary { - final private SAMSequenceDictionary dict; - - // cache - SAMSequenceRecord lastSSR = null; - String lastContig = ""; - int lastIndex = -1; - - @Requires({"dict != null", "dict.size() > 0"}) - public CachingSequenceDictionary(SAMSequenceDictionary dict) { - this.dict = dict; - } - - public final String internContigName(final String contig) { - return getContigInfo(contig).getSequenceName(); - } - - @Ensures("result > 0") - public final int getNSequences() { - return dict.size(); - } - - @Requires("contig != null") - public final boolean hasContig(final String contig) { - return contig.equals(lastContig) || dict.getSequence(contig) != null; - } - - @Requires("index >= 0") - public final boolean hasContig(final int index) { - return lastIndex == index || dict.getSequence(index) != null; - } - - @Requires("contig != null") - @Ensures("result != null") - public final SAMSequenceRecord getSequence(final String contig) { - if ( isCached(contig) ) - return lastSSR; - else - return updateCache(contig, -1); - } - - @Requires("index >= 0") - @Ensures("result != null") - public final SAMSequenceRecord getSequence(final int index) { - if ( isCached(index) ) - return lastSSR; - else - return updateCache(null, index); - } - - @Requires("contig != null") - @Ensures("result >= 0") - public final int getSequenceIndex(final String contig) { - if ( ! isCached(contig) ) { - updateCache(contig, -1); - } - - return lastIndex; - } - - @Requires({"contig != null", "lastContig != null"}) - private boolean isCached(final String contig) { - return lastContig.equals(contig); - } - - @Requires({"lastIndex != -1", "index >= 0"}) - private boolean isCached(final int index) { - return lastIndex == index; - } - - /** - * The key algorithm. Given a new record, update the last used record, contig - * name, and index. 
- * - * @param contig - * @param index - * @return - */ - @Requires("contig != null || index >= 0") - @Ensures("result != null") - private SAMSequenceRecord updateCache(final String contig, int index ) { - SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig); - if ( rec == null ) { - throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index); - } else { - lastSSR = rec; - lastContig = rec.getSequenceName(); - lastIndex = rec.getSequenceIndex(); - return rec; - } - } - } - /** * set our internal reference contig order * @param refFile the reference file @@ -205,16 +108,18 @@ public final class GenomeLocParser { /** * Create a genome loc parser based on seqDict with the specified level of validation * @param seqDict the sequence dictionary to use when creating genome locs - * @param validationLevel how much validation should we do of the genome locs at runtime? + * @param validationLevel how much validation should we do of the genome locs at runtime? 
Purely for testing purposes */ - public GenomeLocParser(SAMSequenceDictionary seqDict, final ValidationLevel validationLevel) { - this.validationLevel = validationLevel; + protected GenomeLocParser(SAMSequenceDictionary seqDict, final ValidationLevel validationLevel) { + if (validationLevel == null) + throw new IllegalArgumentException("validation level cannot be null"); if (seqDict == null) { // we couldn't load the reference dictionary //logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs"); throw new UserException.CommandLineException("Failed to load reference dictionary"); } - SINGLE_MASTER_SEQUENCE_DICTIONARY = seqDict; + this.validationLevel = validationLevel; + this.SINGLE_MASTER_SEQUENCE_DICTIONARY = seqDict; if ( logger.isDebugEnabled() ) { logger.debug(String.format("Prepared reference sequence contig dictionary")); for (SAMSequenceRecord contig : seqDict.getSequences()) { @@ -227,17 +132,13 @@ public final class GenomeLocParser { * Determines whether the given contig is valid with respect to the sequence dictionary * already installed in the GenomeLoc. * + * @param contig a potentially null string name for the contig * @return True if the contig is valid. False otherwise. 
*/ - public final boolean contigIsInDictionary(String contig) { + public final boolean contigIsInDictionary(final String contig) { return contig != null && getContigInfo().hasContig(contig); } - public final boolean indexIsInDictionary(final int index) { - return index >= 0 && getContigInfo().hasContig(index); - } - - /** * get the contig's SAMSequenceRecord * @@ -278,7 +179,7 @@ public final class GenomeLocParser { * @return */ public final SAMSequenceDictionary getContigs() { - return getContigInfo().dict; + return getContigInfo().getDictionary(); } // -------------------------------------------------------------------------------------------------------------- @@ -286,14 +187,13 @@ public final class GenomeLocParser { // Low-level creation functions // // -------------------------------------------------------------------------------------------------------------- + /** - * create a genome loc, given the contig name, start, and stop + * @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation. * - * @param contig the contig name - * @param start the starting position - * @param stop the stop position - * - * @return a new genome loc + * Note that because this function doesn't take the contig index as an argument for contig, it + * has a slight performance penalty over the version that does take the contig index. Does not + * require the created genome loc to be on the reference genome */ @Ensures("result != null") @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop)"}) @@ -301,34 +201,61 @@ public final class GenomeLocParser { return createGenomeLoc(contig, getContigIndex(contig), start, stop); } - public GenomeLoc createGenomeLoc(String contig, final int start, final int stop, boolean mustBeOnReference) { + /** + * @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation. 
+ * + * Note that because this function doesn't take the contig index as an argument for contig, it + * has a slight performance penalty over the version that does take the contig index. + */ + public GenomeLoc createGenomeLoc(final String contig, final int start, final int stop, boolean mustBeOnReference) { return createGenomeLoc(contig, getContigIndex(contig), start, stop, mustBeOnReference); } + /** + * @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation. + * + * Doesn't require the start and stop to be on the genome + */ @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop, false)"}) public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop) { return createGenomeLoc(contig, index, start, stop, false); } + /** + * Create a GenomeLoc on contig, starting at start and ending (inclusive) at stop. + * + * @param contig the contig name + * @param index the index into the GATK's SAMSequenceDictionary of contig (passed for efficiency to avoid the lookup) + * @param start the starting position + * @param stop the stop position of this loc, inclusive + * @param mustBeOnReference if true, this factory will throw a UserException.MalformedGenomeLoc if start or stop isn't on the contig + * + * @return a non-null GenomeLoc + */ @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop,mustBeOnReference)"}) - public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop, boolean mustBeOnReference) { + @Ensures("result != null") + public GenomeLoc createGenomeLoc(final String contig, int index, final int start, final int stop, boolean mustBeOnReference) { // optimization: by interning the string we ensure that future comparisons use == not the full string comp final String interned = validateGenomeLoc(contig, index, start, stop, mustBeOnReference); return new GenomeLoc(interned, index, start, stop); } /** - * 
Create a new genome loc, bounding start and stop by the start and end of contig - * @param contig our contig - * @param start our start as an arbitrary integer (may be negative, etc) - * @param stop our stop as an arbitrary integer (may be negative, etc) - * @throws ReviewedStingException if there's no way to create a meaningful genome loc given contig, start, and stop - * @return a valid genome loc over contig + * Create a new GenomeLoc, on contig, including the single position pos. + * + * Pos is not required to be on the reference + * + * @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation. + * + * @param contig the contig name + * @param pos the start and stop of the created genome loc + * + * @return a genome loc representing a single base at the specified position on the contig */ - public GenomeLoc createGenomeLocOnContig(final String contig, final int start, final int stop) { - final GenomeLoc myLoc = createGenomeLoc(contig, start, stop); - final GenomeLoc contigLoc = createOverEntireContig(contig); - return myLoc.intersect(contigLoc); + @Ensures("result != null") + @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, pos, pos, true)"}) + public GenomeLoc createGenomeLoc(final String contig, final int pos) { + return createGenomeLoc(contig, getContigIndex(contig), pos, pos); } /** @@ -472,7 +399,7 @@ public final class GenomeLocParser { */ @Requires("pos != null") @Ensures("result >= 0") - private int parsePosition(final String pos) { + protected int parsePosition(final String pos) { if(pos.indexOf('-') != -1) { throw new NumberFormatException("Position: '" + pos + "' can't contain '-'." ); } @@ -533,89 +460,34 @@ public final class GenomeLocParser { } /** - * Creates a GenomeLoc corresponding to the variant context vc. 
If includeSymbolicEndIfPossible - * is true, and VC is a symbolic allele the end of the created genome loc will be the value - * of the END info field key, if it exists, or vc.getEnd() if not. - * - * @param vc - * @param includeSymbolicEndIfPossible - * @return + * @see GenomeLoc#setStart(GenomeLoc, int) */ - public GenomeLoc createGenomeLoc(final VariantContext vc, boolean includeSymbolicEndIfPossible) { - if ( includeSymbolicEndIfPossible && vc.isSymbolic() ) { - int end = vc.getAttributeAsInt(VCFConstants.END_KEY, vc.getEnd()); - return createGenomeLoc(vc.getChr(), vc.getStart(), end); - } - else - return createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd()); - } - - public GenomeLoc createGenomeLoc(final VariantContext vc) { - return createGenomeLoc(vc, false); - } - - /** - * create a new genome loc, given the contig name, and a single position. Must be on the reference - * - * @param contig the contig name - * @param pos the postion - * - * @return a genome loc representing a single base at the specified postion on the contig - */ - @Ensures("result != null") - @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, pos, pos, true)"}) - public GenomeLoc createGenomeLoc(final String contig, final int pos) { - return createGenomeLoc(contig, getContigIndex(contig), pos, pos); - } - - /** - * create a new genome loc from an existing loc, with a new start position - * Note that this function will NOT explicitly check the ending offset, in case someone wants to - * set the start of a new GenomeLoc pertaining to a read that goes off the end of the contig. 
- * - * @param loc the old location - * @param start a new start position - * - * @return the newly created genome loc - */ - public GenomeLoc setStart(GenomeLoc loc, int start) { + @Deprecated + public GenomeLoc setStart(final GenomeLoc loc, final int start) { return createGenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop()); } /** - * create a new genome loc from an existing loc, with a new stop position - * Note that this function will NOT explicitly check the ending offset, in case someone wants to - * set the stop of a new GenomeLoc pertaining to a read that goes off the end of the contig. - * - * @param loc the old location - * @param stop a new stop position - * - * @return + * @see GenomeLoc#setStop(GenomeLoc, int) */ - public GenomeLoc setStop(GenomeLoc loc, int stop) { + @Deprecated + public GenomeLoc setStop(final GenomeLoc loc, final int stop) { return createGenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop); } /** - * return a new genome loc, with an incremented position - * - * @param loc the old location - * - * @return a new genome loc + * @see GenomeLoc#incPos(GenomeLoc) */ - public GenomeLoc incPos(GenomeLoc loc) { + @Deprecated + public GenomeLoc incPos(final GenomeLoc loc) { return incPos(loc, 1); } /** - * return a new genome loc, with an incremented position - * - * @param loc the old location - * @param by how much to move the start and stop by - * - * @return a new genome loc + * @see GenomeLoc#incPos(GenomeLoc, int) */ - public GenomeLoc incPos(GenomeLoc loc, int by) { + @Deprecated + public GenomeLoc incPos(final GenomeLoc loc, final int by) { return createGenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by); } @@ -626,7 +498,7 @@ public final class GenomeLocParser { */ @Requires("contigName != null") @Ensures("result != null") - public GenomeLoc createOverEntireContig(String contigName) { + public GenomeLoc createOverEntireContig(final String contigName) { SAMSequenceRecord contig = 
getContigInfo().getSequence(contigName); return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true); } @@ -638,12 +510,12 @@ public final class GenomeLocParser { * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig. */ @Requires({"loc != null", "maxBasePairs > 0"}) - public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) { + public GenomeLoc createGenomeLocAtStart(final GenomeLoc loc, final int maxBasePairs) { if (GenomeLoc.isUnmapped(loc)) return null; - String contigName = loc.getContig(); - SAMSequenceRecord contig = getContigInfo().getSequence(contigName); - int contigIndex = contig.getSequenceIndex(); + final String contigName = loc.getContig(); + final SAMSequenceRecord contig = getContigInfo().getSequence(contigName); + final int contigIndex = contig.getSequenceIndex(); int start = loc.getStart() - maxBasePairs; int stop = loc.getStart() - 1; @@ -662,19 +534,12 @@ public final class GenomeLocParser { * @param padding The number of base pairs to pad on either end * @return The contiguous loc of length up to the original length + 2*padding (depending on the start/end of the contig). 
*/ - @Requires({"loc != null", "padding > 0"}) + @Requires({"loc != null", "padding >= 0"}) public GenomeLoc createPaddedGenomeLoc(final GenomeLoc loc, final int padding) { - if (GenomeLoc.isUnmapped(loc)) + if (GenomeLoc.isUnmapped(loc) || padding == 0) return loc; - final String contigName = loc.getContig(); - final SAMSequenceRecord contig = getContigInfo().getSequence(contigName); - final int contigIndex = contig.getSequenceIndex(); - final int contigLength = contig.getSequenceLength(); - - final int start = Math.max(1, loc.getStart() - padding); - final int stop = Math.min(contigLength, loc.getStop() + padding); - - return createGenomeLoc(contigName, contigIndex, start, stop, true); + else + return createGenomeLocOnContig(loc.getContig(), loc.getContigIndex(), loc.getStart() - padding, loc.getStop() + padding); } /** @@ -684,7 +549,7 @@ public final class GenomeLocParser { * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig. */ @Requires({"loc != null", "maxBasePairs > 0"}) - public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) { + public GenomeLoc createGenomeLocAtStop(final GenomeLoc loc, final int maxBasePairs) { if (GenomeLoc.isUnmapped(loc)) return null; String contigName = loc.getContig(); @@ -702,4 +567,35 @@ public final class GenomeLocParser { return createGenomeLoc(contigName, contigIndex, start, stop, true); } + + /** + * @see #createGenomeLocOnContig(String, int, int, int) with the contig index looked up from contig + */ + public GenomeLoc createGenomeLocOnContig(final String contig, final int start, final int stop) { + return createGenomeLocOnContig(contig, getContigIndex(contig), start, stop); + } + + /** + * Create a new genome loc, bounding start and stop by the start and end of contig + * + * This function will return null if start and stop cannot be adjusted in any reasonable way + * to be on the contig. 
For example, if start and stop are both past the end of the contig, + * there's no way to fix this, and null will be returned. + * + * @param contig our contig + * @param start our start as an arbitrary integer (may be negative, etc) + * @param stop our stop as an arbitrary integer (may be negative, etc) + * @return a valid genome loc over contig, or null if a meaningful genome loc cannot be created + */ + public GenomeLoc createGenomeLocOnContig(final String contig, final int contigIndex, final int start, final int stop) { + final int contigLength = getContigInfo().getSequence(contigIndex).getSequenceLength(); + final int boundedStart = Math.max(1, start); + final int boundedStop = Math.min(contigLength, stop); + + if ( boundedStart > contigLength || boundedStop < 1 ) + // there's no meaningful way to create this genome loc, as the start and stop are off the contig + return null; + else + return createGenomeLoc(contig, contigIndex, boundedStart, boundedStop); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java b/public/java/src/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java new file mode 100644 index 000000000..c11aeb730 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * A wrapper class that provides efficient most recently used caching for the global + * SAMSequenceDictionary underlying all of the GATK engine capabilities. It is essential + * that this class be as efficient as possible. It doesn't need to be thread-safe, as + * GenomeLocParser uses a thread-local variable to ensure that each thread gets its own MRU + * cache. + * + * The MRU elements are the SAMSequenceRecord, the lastContig, and the lastIndex. The + * cached value is the actual SAMSequenceRecord of the most recently accessed value from + * getSequence, along with local variables for the contig index and contig string. 
+ */ +final class MRUCachingSAMSequenceDictionary { + /** + * Our sequence dictionary + */ + private final SAMSequenceDictionary dict; + + SAMSequenceRecord lastSSR = null; + String lastContig = ""; + int lastIndex = -1; + + /** + * Create a new MRUCachingSAMSequenceDictionary that provides information about sequences in dict + * @param dict a non-null, non-empty sequencing dictionary + */ + @Ensures("lastSSR == null") + public MRUCachingSAMSequenceDictionary(final SAMSequenceDictionary dict) { + if ( dict == null ) throw new IllegalArgumentException("Dictionary cannot be null"); + if ( dict.size() == 0 ) throw new IllegalArgumentException("Dictionary cannot have size zero"); + + this.dict = dict; + } + + /** + * Get our sequence dictionary + * @return a non-null SAMSequenceDictionary + */ + @Ensures("result != null") + public SAMSequenceDictionary getDictionary() { + return dict; + } + + /** + * Is contig present in the dictionary? Efficiently caching. + * @param contig a non-null contig we want to test + * @return true if contig is in dictionary, false otherwise + */ + @Requires("contig != null") + public final boolean hasContig(final String contig) { + return contig.equals(lastContig) || dict.getSequence(contig) != null; + } + + /** + * Is contig index present in the dictionary? Efficiently caching. 
+ * @param contigIndex an integer offset that might map to a contig in this dictionary + * @return true if contigIndex is in dictionary, false otherwise + */ + @Requires("contigIndex >= 0") + public final boolean hasContigIndex(final int contigIndex) { + return lastIndex == contigIndex || dict.getSequence(contigIndex) != null; + } + + /** + * Same as SAMSequenceDictionary.getSequence but uses a MRU cache for efficiency + * + * @param contig the contig name we want to get the sequence record of + * @throws ReviewedStingException if contig isn't present in the dictionary + * @return the sequence record for contig + */ + @Requires("contig != null") + @Ensures("result != null") + public final SAMSequenceRecord getSequence(final String contig) { + if ( isCached(contig) ) + return lastSSR; + else + return updateCache(contig, -1); + } + + /** + * Same as SAMSequenceDictionary.getSequence but uses a MRU cache for efficiency + * + * @param index the contig index we want to get the sequence record of + * @throws ReviewedStingException if index isn't present in the dictionary + * @return the sequence record for index + */ + @Requires("index >= 0") + @Ensures("result != null") + public final SAMSequenceRecord getSequence(final int index) { + if ( isCached(index) ) + return lastSSR; + else + return updateCache(null, index); + } + + /** + * Same as SAMSequenceDictionary.getSequenceIndex but uses a MRU cache for efficiency + * + * @param contig the contig we want to get the sequence index of + * @throws ReviewedStingException if contig isn't present in the dictionary + * @return the sequence record index for contig + */ + @Requires("contig != null") + @Ensures("result >= 0") + public final int getSequenceIndex(final String contig) { + if ( ! isCached(contig) ) { + updateCache(contig, -1); + } + + return lastIndex; + } + + /** + * Is contig the MRU cached contig? 
+ * @param contig the contig to test + * @return true if contig is the currently cached contig, false otherwise + */ + @Requires({"contig != null"}) + protected boolean isCached(final String contig) { + return contig.equals(lastContig); + } + + /** + * Is the contig index index the MRU cached index? + * @param index the contig index to test + * @return true if contig index is the currently cached contig index, false otherwise + */ + protected boolean isCached(final int index) { + return lastIndex == index; + } + + /** + * The key algorithm. Given a new record, update the last used record, contig + * name, and index. + * + * @param contig the contig we want to look up. If null, index is used instead + * @param index the contig index we want to look up. Only used if contig is null + * @throws ReviewedStingException if index isn't present in the dictionary + * @return the SAMSequenceRecord for contig / index + */ + @Requires("contig != null || index >= 0") + @Ensures("result != null") + private SAMSequenceRecord updateCache(final String contig, int index ) { + SAMSequenceRecord rec = contig == null ? 
dict.getSequence(index) : dict.getSequence(contig); + if ( rec == null ) { + throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index); + } else { + lastSSR = rec; + lastContig = rec.getSequenceName(); + lastIndex = rec.getSequenceIndex(); + return rec; + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java index 4a989b984..9621aecda 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java @@ -29,17 +29,31 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; +import org.broad.tribble.BasicFeature; +import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; - -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertTrue; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + /** * 
@author aaron *

@@ -49,10 +63,11 @@ import org.testng.annotations.Test; */ public class GenomeLocParserUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; + private SAMFileHeader header; @BeforeClass public void init() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } @@ -231,7 +246,16 @@ public class GenomeLocParserUnitTest extends BaseTest { assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,11)); // past the end of the contig assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",-1,10)); // bad start assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop + assertTrue( genomeLocParser.isValidGenomeLoc("chr1",-1,2, false)); // bad stop assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end + assertTrue( genomeLocParser.isValidGenomeLoc("chr1",10,11, false)); // bad start, past end + assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",2,1)); // stop < start + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testValidateGenomeLoc() { + // bad contig index + genomeLocParser.validateGenomeLoc("chr1", 1, 1, 2, false); } private static class FlankingGenomeLocTestData extends TestDataProvider { @@ -333,4 +357,153 @@ public class GenomeLocParserUnitTest extends BaseTest { data.toString(), data.original, actual, data.flankStop); assertEquals(actual, data.flankStop, description); } + + @DataProvider(name = "parseGenomeLoc") + public Object[][] makeParsingTest() { + final List tests = new LinkedList(); + + tests.add(new Object[]{ "chr1:10", "chr1", 10 }); + tests.add(new Object[]{ "chr1:100", "chr1", 100 }); + tests.add(new Object[]{ "chr1:1000", "chr1", 1000 }); + tests.add(new Object[]{ "chr1:1,000", "chr1", 1000 }); + tests.add(new Object[]{ "chr1:10000", "chr1", 10000 }); + tests.add(new Object[]{ "chr1:10,000", "chr1", 10000 
}); + tests.add(new Object[]{ "chr1:100000", "chr1", 100000 }); + tests.add(new Object[]{ "chr1:100,000", "chr1", 100000 }); + tests.add(new Object[]{ "chr1:1000000", "chr1", 1000000 }); + tests.add(new Object[]{ "chr1:1,000,000", "chr1", 1000000 }); + tests.add(new Object[]{ "chr1:1000,000", "chr1", 1000000 }); + tests.add(new Object[]{ "chr1:1,000000", "chr1", 1000000 }); + + return tests.toArray(new Object[][]{}); + } + + @Test( dataProvider = "parseGenomeLoc") + public void testParsingPositions(final String string, final String contig, final int start) { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10000000); + GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + final GenomeLoc loc = genomeLocParser.parseGenomeLoc(string); + Assert.assertEquals(loc.getContig(), contig); + Assert.assertEquals(loc.getStart(), start); + Assert.assertEquals(loc.getStop(), start); + } + + @Test( ) + public void testCreationFromSAMRecord() { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5); + final GenomeLoc loc = genomeLocParser.createGenomeLoc(read); + Assert.assertEquals(loc.getContig(), read.getReferenceName()); + Assert.assertEquals(loc.getContigIndex(), (int)read.getReferenceIndex()); + Assert.assertEquals(loc.getStart(), read.getAlignmentStart()); + Assert.assertEquals(loc.getStop(), read.getAlignmentEnd()); + } + + @Test( ) + public void testCreationFromSAMRecordUnmapped() { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5); + read.setReadUnmappedFlag(true); + read.setReferenceIndex(-1); + final GenomeLoc loc = genomeLocParser.createGenomeLoc(read); + Assert.assertTrue(loc.isUnmapped()); + } + + @Test( ) + public void testCreationFromSAMRecordUnmappedButOnGenome() { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5); + read.setReadUnmappedFlag(true); + read.setCigarString("*"); + 
final GenomeLoc loc = genomeLocParser.createGenomeLoc(read); + Assert.assertEquals(loc.getContig(), read.getReferenceName()); + Assert.assertEquals(loc.getContigIndex(), (int)read.getReferenceIndex()); + Assert.assertEquals(loc.getStart(), read.getAlignmentStart()); + Assert.assertEquals(loc.getStop(), read.getAlignmentStart()); + } + + @Test + public void testCreationFromFeature() { + final Feature feature = new BasicFeature("chr1", 1, 5); + final GenomeLoc loc = genomeLocParser.createGenomeLoc(feature); + Assert.assertEquals(loc.getContig(), feature.getChr()); + Assert.assertEquals(loc.getStart(), feature.getStart()); + Assert.assertEquals(loc.getStop(), feature.getEnd()); + } + + @Test + public void testCreationFromVariantContext() { + final VariantContext feature = new VariantContextBuilder("x", "chr1", 1, 5, Arrays.asList(Allele.create("AAAAA", true))).make(); + final GenomeLoc loc = genomeLocParser.createGenomeLoc(feature); + Assert.assertEquals(loc.getContig(), feature.getChr()); + Assert.assertEquals(loc.getStart(), feature.getStart()); + Assert.assertEquals(loc.getStop(), feature.getEnd()); + } + + @Test + public void testcreateGenomeLocOnContig() throws FileNotFoundException { + final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final SAMSequenceDictionary dict = seq.getSequenceDictionary(); + final GenomeLocParser genomeLocParser = new GenomeLocParser(dict); + + for ( final SAMSequenceRecord rec : dict.getSequences() ) { + final GenomeLoc loc = genomeLocParser.createOverEntireContig(rec.getSequenceName()); + Assert.assertEquals(loc.getContig(), rec.getSequenceName()); + Assert.assertEquals(loc.getStart(), 1); + Assert.assertEquals(loc.getStop(), rec.getSequenceLength()); + } + } + + @DataProvider(name = "GenomeLocOnContig") + public Object[][] makeGenomeLocOnContig() { + final List tests = new LinkedList(); + + final int contigLength = header.getSequence(0).getSequenceLength(); + for ( int start = 
-10; start < contigLength + 10; start++ ) { + for ( final int len : Arrays.asList(1, 10, 20) ) { + tests.add(new Object[]{ "chr1", start, start + len }); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test( dataProvider = "GenomeLocOnContig") + public void testGenomeLocOnContig(final String contig, final int start, final int stop) { + final int contigLength = header.getSequence(0).getSequenceLength(); + final GenomeLoc loc = genomeLocParser.createGenomeLocOnContig(contig, start, stop); + + if ( stop < 1 || start > contigLength ) + Assert.assertNull(loc, "GenomeLoc should be null if the start/stops are not meaningful"); + else { + Assert.assertNotNull(loc); + Assert.assertEquals(loc.getContig(), contig); + Assert.assertEquals(loc.getStart(), Math.max(start, 1)); + Assert.assertEquals(loc.getStop(), Math.min(stop, contigLength)); + } + } + + @DataProvider(name = "GenomeLocPadding") + public Object[][] makeGenomeLocPadding() { + final List tests = new LinkedList(); + + final int contigLength = header.getSequence(0).getSequenceLength(); + for ( int pad = 0; pad < contigLength + 1; pad++) { + for ( int start = 1; start < contigLength; start++ ) { + for ( int stop = start; stop < contigLength; stop++ ) { + tests.add(new Object[]{ genomeLocParser.createGenomeLoc("chr1", start, stop), pad}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test( dataProvider = "GenomeLocPadding") + public void testGenomeLocPadding(final GenomeLoc input, final int pad) { + final int contigLength = header.getSequence(0).getSequenceLength(); + final GenomeLoc padded = genomeLocParser.createPaddedGenomeLoc(input, pad); + + Assert.assertNotNull(padded); + Assert.assertEquals(padded.getContig(), input.getContig()); + Assert.assertEquals(padded.getStart(), Math.max(input.getStart() - pad, 1)); + Assert.assertEquals(padded.getStop(), Math.min(input.getStop() + pad, contigLength)); + } } diff --git 
a/public/java/test/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java new file mode 100644 index 000000000..7a5fcf0c2 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils; + + +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.LinkedList; +import java.util.List; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +public class MRUCachingSAMSequencingDictionaryUnitTest extends BaseTest { + private static ReferenceSequenceFile seq; + private static SAMSequenceDictionary dict; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + dict = seq.getSequenceDictionary(); + } + + @Test + public void testBasic() { + final MRUCachingSAMSequenceDictionary caching = new MRUCachingSAMSequenceDictionary(dict); + + Assert.assertEquals(caching.getDictionary(), dict, "Dictionary not the one I expected"); + + for ( final SAMSequenceRecord rec : dict.getSequences() ) { + Assert.assertFalse(caching.isCached(rec.getSequenceIndex()), "Expected index to not be cached"); + Assert.assertFalse(caching.isCached(rec.getSequenceName()), "Expected contig to not be cached"); + + Assert.assertEquals(caching.getSequence(rec.getSequenceName()), rec, "Couldn't query for sequence"); + Assert.assertEquals(caching.getSequence(rec.getSequenceIndex()), rec, "Couldn't query for sequence index"); + 
Assert.assertEquals(caching.hasContig(rec.getSequenceName()), true, "hasContig query for sequence"); + Assert.assertEquals(caching.hasContigIndex(rec.getSequenceIndex()), true, "hasContigIndex query for sequence"); + Assert.assertEquals(caching.getSequenceIndex(rec.getSequenceName()), rec.getSequenceIndex(), "Couldn't query for sequence"); + + Assert.assertEquals(caching.hasContig(rec.getSequenceName() + "asdfadsfa"), false, "hasContig query for unknown sequence"); + Assert.assertEquals(caching.hasContigIndex(dict.getSequences().size()), false, "hasContigIndex query for unknown index"); + + Assert.assertTrue(caching.isCached(rec.getSequenceIndex()), "Expected index to be cached"); + Assert.assertTrue(caching.isCached(rec.getSequenceName()), "Expected contig to be cached"); + } + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testBadGetSequence() { + final MRUCachingSAMSequenceDictionary caching = new MRUCachingSAMSequenceDictionary(dict); + caching.getSequence("notInDictionary"); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testBadGetSequenceIndex() { + final MRUCachingSAMSequenceDictionary caching = new MRUCachingSAMSequenceDictionary(dict); + caching.getSequence(dict.getSequences().size()); + } +} \ No newline at end of file